Source code for pyacs.gts.lib.outliers_old


import numpy as np

from pyacs.gts.Gts import get_index_from_dates

###################################################################
## Remove outliers
###################################################################

###############################################################################
[docs] def remove_outliers(self,periods=None,in_place=False): ############################################################################### """ Remove outliers listed in self.outliers from the time series. Parameters ---------- periods : list, optional If given, only remove outliers within these periods. in_place : bool, optional If True, modify self; otherwise return a new Gts. Returns ------- Gts Time series without outliers (new or self if in_place). """ if self.outliers: if periods==None: data = np.delete(self.data,self.outliers,axis=0) else: lindex=self.lindex_in_periods(periods) ldelete=np.intersect1d(lindex,self.outliers) data = np.delete(self.data,ldelete,axis=0) else: data = np.copy(self.data) new_Gts=self.copy() new_Gts.outliers=[] new_Gts.data = data if in_place: self.data=new_Gts.data.copy() del new_Gts self.outliers=[] return(self) else: return(new_Gts)
###################################################################
[docs] def find_outlier_around_date(self,date,conf_level=95,n=3, lcomponent='NE',verbose=True): ################################################################### """ Find an outlier around a given date (F-ratio test). Returns the index of the outlier, or [] if none found. Parameters ---------- date : float Date in decimal year. conf_level : float, optional Confidence level for F-ratio test (default 95). n : int, optional Number of dates on each side (default 3). lcomponent : str, optional Components 'N', 'E', 'U', 'NE', 'NEU' (default 'NE'). verbose : bool, optional Verbose mode. Returns ------- Gts or list self with outlier flagged, or [] if no significant outlier. """ if verbose: print(("-- Searching outlier around date %10.5lf on components %s with confidence level %6.1lf and %02d samples" % (date,lcomponent,conf_level,2*n))) #self.info() tmp_gts=self.detrend().remove_outliers().extract_ndates_around_date(date,n) nn=tmp_gts.data.shape[0] score={} llindex={} # F_ratio test ############################################### def f_ratio(chi_square_1,p1,chi_square_2,p2,n): ############################################### """ Return the F-ratio test CDF value. Parameters ---------- chi_square_1, chi_square_2 : float Chi-square values. p1, p2 : int Parameter counts. n : int Sample size. Returns ------- float F CDF value. """ F=( (chi_square_1-chi_square_2)/(p2-p1) ) / (chi_square_2 / (n-p2) ) from scipy.stats import f return(f.cdf(F,p2-p1,n-p2)) # H_component={1:'North',2:'East',3:'Up'} # find outlier li=[] if 'N' in lcomponent:li.append(1) if 'E' in lcomponent:li.append(2) if 'U' in lcomponent:li.append(3) for i in sorted(li): # if verbose: # if i==1:print " => Testing component: North" # if i==2:print " => Testing component: East" # if i==3:print " => Testing component: Up" index=np.where(np.abs(tmp_gts.data[:,i]-np.median(tmp_gts.data[:,i])) == np.max(np.abs(tmp_gts.data[:,i]-np.median(tmp_gts.data[:,i])))) if verbose: print(("-- suspected outlier at date %10.4lf on component %s " % (tmp_gts.data[index,0][0],H_component[i]))) tmp_gts_no_outlier=tmp_gts.copy() tmp_gts_no_outlier.outliers=[index] tmp_gts_no_outlier.remove_outliers(in_place=True) chi_square_1=nn*np.std(tmp_gts.data[:,i])**2 chi_square_2=(nn-1)*np.std(tmp_gts_no_outlier.data[:,i])**2 score[i]=f_ratio(chi_square_1,1,chi_square_2,2,2*n)*100.0 print(("-- probability of outlier (F_ratio) %5.2lf%% " % (score[i]))) llindex[i]=index # make decision if np.max(list(score.values())) < conf_level: if verbose:print("-- No significant outlier found") return(self) else: # choose the outlier as the maximum probability component=li[0] current_score=score[component] del li[0] for i in li: if score[i]>current_score: current_score=score[i] component=i date=tmp_gts.data[llindex[component],0] # return the index in the original time series if verbose: print("=> Getting index for date ",date) returned_index=get_index_from_dates([date],self.data,tol=0.25) self.outliers+=returned_index return(self)
###############################################################################
[docs] def find_outliers_percentage(self,percentage=0.03,in_place=False,verbose=False, component='NEU', periods=None,excluded_periods=None): ############################################################################### """ detrend a time series and ranks the residuals by increasing absolute value populate the outliers with the x % largest ones on each component """ n_to_remove=int(percentage*self.data.shape[0]) if n_to_remove < 1: n_to_remove=1 if verbose:print("-- Removing the ",n_to_remove," largest outliers (each time series)") new_ts=self.detrend() lindex_north=list(np.argsort(new_ts.data[:,1]**2))[-n_to_remove:] lindex_east=list(np.argsort(new_ts.data[:,2]**2))[-n_to_remove:] lindex_up=list(np.argsort(new_ts.data[:,3]**2))[-n_to_remove:] lindex=[] if 'N' in component:lindex=lindex+lindex_north if 'E' in component:lindex=lindex+lindex_east if 'U' in component:lindex=lindex+lindex_up if periods and excluded_periods: print("!!!! periods and excluded_periods provided. Possible overlap not checked.") print("!!!! The program will run first on periods and then will exclude outliers in excluded_periods.") if periods: lkept_index=[] for index in lindex: for period in periods: start_date_period=period[0] end_date_period =period[1] if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period: lkept_index.append(index) break lindex=lkept_index if excluded_periods: lexcluded_index=[] for index in lindex: for period in excluded_periods: start_date_period=period[0] end_date_period =period[1] if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period: lexcluded_index.append(index) break lkept_index=[] for index in lindex: if index not in lexcluded_index:lkept_index.append() lindex=lkept_index new_Gts=self.copy() if in_place: self.outliers=list(set(lindex)) return(self) del new_Gts else: new_Gts=self.copy() new_Gts.outliers=list(set(lindex)) return(new_Gts)
################################################################### ## A very simple isolated outliers method ###################################################################
[docs] def find_outliers_simple(self,threshold=100,window_length=10,in_place=False,verbose=False, component='NEU', periods=None,excluded_periods=None): lindex_outlier=[] for i in range(0,self.data.shape[0]-window_length): window=self.data[i:i+window_length,1:4]*1000. #print 'window',window #print 'median ',np.median(window,axis=0) residuals=np.abs(window-np.median(window,axis=0)) #print 'residuals',residuals median_of_residuals=np.median(residuals,axis=0) #print 'median of res ',median_of_residuals for index in range(0,residuals.shape[0]): #print residuals[index,0],median_of_residuals[0] if residuals[index,0]>threshold*median_of_residuals[0]: if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' N');lindex_outlier.append(i+index) if residuals[index,1]>threshold*median_of_residuals[1]: if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' E');lindex_outlier.append(i+index) if residuals[index,2]>threshold*median_of_residuals[2]: if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' U');lindex_outlier.append(i+index) if periods and excluded_periods: print("!!!! periods and excluded_periods provided. Possible overlap not checked.") print("!!!! The program will run first on periods and then will exclude outliers in excluded_periods.") if periods: lkept_index=[] for index in lindex_outlier: for period in periods: start_date_period=period[0] end_date_period =period[1] if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period: lkept_index.append(index) break lindex_outlier=lkept_index if excluded_periods: lexcluded_index=[] for index in lindex_outlier: for period in excluded_periods: start_date_period=period[0] end_date_period =period[1] if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period: lexcluded_index.append(index) break lkept_index=[] for index in lindex_outlier: if index not in lexcluded_index:lkept_index.append() lindex_outlier=lkept_index new_Gts=self.copy() if in_place: self.outliers=list(set(lindex_outlier)) return(self) del new_Gts else: new_Gts=self.copy() new_Gts.outliers=list(set(lindex_outlier)) return(new_Gts)
################################################################### ## find_outliers and offsets using differenciation ###################################################################
[docs] def find_outliers_and_offsets_through_differentiation(self,th=100): """ find outliers and offsets using differenciation """ loutlier={} loffset={} new_Gts=self.differentiate() median=np.median(np.fabs(new_Gts.data),axis=0) lindex_max=np.argmax(new_Gts.data,axis=0) # test each component for component in (1,2,3): loutlier[component]=[] loffset[component]=[] index=lindex_max[component] #print 'th*median[component] ', th*median[component] if np.fabs(new_Gts.data[index,component])> th*median[component]: print("Suspect either an outlier or an offset at ",index, 'date ',new_Gts.data[index,0], ' on component ',component) # case beginning of the ts => This can only be an outlier if index==0: # then the outlier could be either the first or the second record of the time series if np.fabs(new_Gts.data[1,component]) > th*median[component]: # the outlier is the second record loutlier[component].append(1) else: # the outlier is the first loutlier[component].append(1) continue # case end of the ts => This can only be an outlier if index==len(new_Gts.data[:,0])-1: # then the outlier could be either the last or the ante-last record of the time series if np.fabs(new_Gts.data[len(new_Gts.data[:,0])-2,component]) > th*median[component]: # the outlier is the second last record loutlier[component].append(len(new_Gts.data[:,0])-2) else: # the outlier is the first loutlier[component].append(len(new_Gts.data[:,0])-1) continue # regular case # an isolated outlier should have two opposite AND large successive values in the differenciated time series # offset only has only one large value if np.fabs(new_Gts.data[index-1,component])< 2*median[component] and np.fabs(new_Gts.data[index+1,component])< 2*median[component]: # this is a jump loffset[component].append(new_Gts.data[index,0]) print("-- Found offset on component ",component, " at ",new_Gts.data[index,0]) else: # this is an outlier if np.fabs(new_Gts.data[index-1,0])>np.fabs(new_Gts.data[index+1,0]): # sequence of index-1,index large differenciation index=index-1 # else the sequence is index,index+1 # we now test 2 criterions: similar differenciated value and # opposite signs test_large_difference=False test_opposite_sign=False if np.fabs(np.fabs(new_Gts.data[index,0])-np.fabs(new_Gts.data[index+1,0]))< 4*median[component]:test_large_difference=True if new_Gts.data[index,0]*new_Gts.data[index,0]:test_opposite_sign=True if test_large_difference and test_opposite_sign: print("Found isolated outlier at : ",index) loutlier[component].append(index) else: print("Not sure what this is at date ",new_Gts.data[index,0]) for outlier in loutlier[1]+loutlier[2]+loutlier[3]: if self.outliers: if outlier not in self.outliers:self.outliers.append(outlier) else: self.outliers=[outlier] for offset in loffset[1]+loffset[2]+loffset[3]: if self.offsets: if offset not in self.offsets:self.offsets.append(offset) else: self.offsets=[offset]
################################################################## ## search the outliers by the time series of rms ##################################################################
[docs] def find_outliers_by_RMS_ts(self,ndays=7,th_detection=5,th_rejection=2): """ Find index of outliers in a time series, populate self.outliers. - rms time series are first calculated over ndays - time windows are kept for further inspection if rms(t) > th_detection * median(rms(ts)) - for each anomalous time windows, differentiate positions, find the max - test whether it is a true outlier (differentiated(t) > th_rejection * median(differentiated)) output: None """ data_rms = self.rms(ndays) loutlier = [] for i in (1,2,3): loutlier_i = [] out_rms = np.argwhere(data_rms[:,i] > th_detection*np.median(data_rms[:,i])) if len(out_rms) > 0: out_rms = np.hstack(out_rms) for j in out_rms: differentiate_ndays = np.hstack(np.fabs(np.diff(self.data[j:(j+ndays),i]))) out_threshold = th_rejection*np.median(differentiate_ndays) index = np.argwhere(differentiate_ndays > out_threshold) if len(index) == 2: ### 1 outlier if (index[0] == index[1]-1): loutlier_i.append(index[1] + j) #### 2 continuous outliers elif (index[0] == index[1]-2): loutlier_i.append(index[0] + 1 + j) loutlier_i.append(index[1] + j) if len(loutlier_i)!=0: loutlier_i = np.hstack(loutlier_i) for i in np.unique(loutlier_i): if len(np.argwhere(loutlier_i == i)) == ndays-2: loutlier.append(i) elif len(np.argwhere(loutlier_i == i)) == ndays-3: loutlier.append(i) if self.outliers: for outlier in loutlier: if outlier not in self.outliers:self.outliers.append(outlier) else: self.outliers=loutlier
################################################################## ## search the outliers by the trendline ################################################################## ###############################################################################
[docs] def find_outliers_by_residuals(self,threshold=5,model='detrend_seasonal',component='NE',in_place=False): ############################################################################### """ Find index of outliers by trendline/trendline_annual/trendline_seasonal (the complete model) Then the outliers are detected if their residuals are greater than th_rejection*standard_deviation output: Add the list of outlier index into self.outliers """ try: detrended=self.make_model(option=model).data except: try: detrended=self.make_model(option='detrend').data except: detrended=self.data.copy() # print detrended.data # print detrended.data[:,1:4] # print np.diff(detrended.data[:,1:4],axis=0) # print np.abs(np.diff(detrended.data[:,1:4],axis=0),axis=0) # print np.median(np.abs(np.diff(detrended.data[:,1:4],axis=0),axis=0),axis=0) [median_north,median_east,median_up]=np.median(np.abs(np.diff(detrended[:,1:4],axis=0)),axis=0) lindex_north=[] lindex_east=[] lindex_up=[] if 'N' in component: lindex_north=np.where(np.abs(detrended[:,1]) > threshold*median_north)[0].tolist() if 'E' in component: lindex_east=np.where(np.abs(detrended[:,2]) > threshold*median_east)[0].tolist() if 'U' in component: lindex_up=np.where(np.abs(detrended[:,3]) > threshold*median_up)[0].tolist() loutliers=self.outliers+list(set(lindex_north+lindex_east+lindex_up)) new_gts=self.copy() new_gts.outliers=loutliers if in_place: self.data=new_gts.data else: return(new_gts)
###############################################################################
[docs] def find_outliers_vondrak(self, threshold=10, fc=2., in_place=False,verbose=True, \ periods=[[]],excluded_periods=[[]], component='NE'): ############################################################################### """ Find outliers using a Vondrak filter """ # init loutliers_dates = [] lindex_north=[] lindex_east=[] lindex_up=[] # keep selected period tmp_ts=self.extract_periods(periods).exclude_periods(excluded_periods).detrend() # calculates vondrak filter vondrak_ts = tmp_ts.vondrak(fc,component=component,verbose=verbose) # calculates the residual time series residual_ts = tmp_ts.copy() residual_ts.data[:,1] = tmp_ts.data[:,1] - vondrak_ts.data[:,1] residual_ts.data[:,2] = tmp_ts.data[:,2] - vondrak_ts.data[:,2] residual_ts.data[:,3] = tmp_ts.data[:,3] - vondrak_ts.data[:,3] # get the median [median_north,median_east,median_up]=np.median(np.abs( residual_ts.data[:,1:4]) , axis=0) # get the outliers if 'N' in component: lindex_north=np.where(np.abs(residual_ts.data[:,1]) > threshold*median_north)[0].tolist() if 'E' in component: lindex_east=np.where(np.abs(residual_ts.data[:,2]) > threshold*median_east)[0].tolist() if 'U' in component: lindex_up=np.where(np.abs(residual_ts.data[:,3]) > threshold*median_up)[0].tolist() loutliers=list(set(lindex_north+lindex_east+lindex_up)) if verbose:print(("-- Outliers detection using Vondrak filter with fc=%.2lf : %03d new outliers detected" % (fc,len(loutliers)))) # get the outliers dates loutliers_dates+=tmp_ts.data[loutliers,0].tolist() # get outliers index in original time series if loutliers != []: loutliers_index=get_index_from_dates(loutliers_dates,self.data,tol=0.25) else: loutliers_index=self.outliers # return new_Gts=self.copy() if in_place: self.outliers=loutliers_index return(self) del new_Gts else: new_Gts=self.copy() new_Gts.outliers=loutliers_index return(new_Gts)
###############################################################################
[docs] def find_outliers_sliding_window(self,\ threshold=3,in_place=False,verbose=True, \ periods=[[]],excluded_periods=[[]], component='NE',window_len=15,automatic=True): ############################################################################### """ Find outliers using sliding windows """ lindex_north=[] lindex_east=[] lindex_up=[] if self.data.shape[0] > window_len: itermax=5 lindex_north=[] lindex_east=[] lindex_up=[] OK=True loutliers=[] loutliers_dates=[] i=0 smooth=self.extract_periods(periods).exclude_periods(excluded_periods).smooth(window_len=window_len) new_ts=self.extract_periods(periods).exclude_periods(excluded_periods) residual_ts=self.extract_periods(periods).exclude_periods(excluded_periods) residual_ts.data[:,1:4]=new_ts.data[:,1:4]-smooth.data[:,1:4] diff_data=np.diff(self.data[:,1:4],n=1,axis=0) [median_north,median_east,median_up]=np.median(np.abs(diff_data),axis=0) while OK: if 'N' in component: lindex_north=np.where(np.abs(residual_ts.data[:,1]) > threshold*median_north)[0].tolist() if 'E' in component: lindex_east=np.where(np.abs(residual_ts.data[:,2]) > threshold*median_east)[0].tolist() if 'U' in component: lindex_up=np.where(np.abs(residual_ts.data[:,3]) > threshold*median_up)[0].tolist() loutliers=list(set(lindex_north+lindex_east+lindex_up)) if verbose:print(("-- Outliers detection pass #%02d : %03d new outliers detected" % (i,len(loutliers)))) #print loutliers_dates,new_ts.data[loutliers,0].tolist() loutliers_dates+=new_ts.data[loutliers,0].tolist() if loutliers==[]:OK=False i+=1 if i>itermax:OK=False smooth=self.extract_periods(periods).exclude_periods([[]]).smooth(window_len=window_len) new_ts.outliers=loutliers new_ts=new_ts.remove_outliers() smooth=new_ts.smooth(window_len=window_len) residual_ts=new_ts.copy() residual_ts.data[:,1:4]=new_ts.data[:,1:4]-smooth.data[:,1:4] diff_data=np.diff(self.data[:,1:4],n=1,axis=0) [median_north,median_east,median_up]=np.median(np.abs(diff_data),axis=0) if verbose:print("-- ",len(loutliers_dates)," outliers found") loutliers_index=get_index_from_dates(loutliers_dates,self.data,tol=0.25) else: loutliers_index=self.outliers new_Gts=self.copy() if in_place: self.outliers=loutliers_index return(self) del new_Gts else: new_Gts=self.copy() new_Gts.outliers=loutliers_index return(new_Gts)