import numpy as np
from pyacs.gts.Gts import get_index_from_dates
###################################################################
## Remove outliers
###################################################################
###############################################################################
[docs]
def remove_outliers(self,periods=None,in_place=False):
###############################################################################
"""
Remove outliers listed in self.outliers from the time series.
Parameters
----------
periods : list, optional
If given, only remove outliers within these periods.
in_place : bool, optional
If True, modify self; otherwise return a new Gts.
Returns
-------
Gts
Time series without outliers (new or self if in_place).
"""
if self.outliers:
if periods==None:
data = np.delete(self.data,self.outliers,axis=0)
else:
lindex=self.lindex_in_periods(periods)
ldelete=np.intersect1d(lindex,self.outliers)
data = np.delete(self.data,ldelete,axis=0)
else:
data = np.copy(self.data)
new_Gts=self.copy()
new_Gts.outliers=[]
new_Gts.data = data
if in_place:
self.data=new_Gts.data.copy()
del new_Gts
self.outliers=[]
return(self)
else:
return(new_Gts)
###################################################################
[docs]
def find_outlier_around_date(self,date,conf_level=95,n=3, lcomponent='NE',verbose=True):
###################################################################
"""
Find an outlier around a given date (F-ratio test).
Returns the index of the outlier, or [] if none found.
Parameters
----------
date : float
Date in decimal year.
conf_level : float, optional
Confidence level for F-ratio test (default 95).
n : int, optional
Number of dates on each side (default 3).
lcomponent : str, optional
Components 'N', 'E', 'U', 'NE', 'NEU' (default 'NE').
verbose : bool, optional
Verbose mode.
Returns
-------
Gts or list
self with outlier flagged, or [] if no significant outlier.
"""
if verbose:
print(("-- Searching outlier around date %10.5lf on components %s with confidence level %6.1lf and %02d samples" % (date,lcomponent,conf_level,2*n)))
#self.info()
tmp_gts=self.detrend().remove_outliers().extract_ndates_around_date(date,n)
nn=tmp_gts.data.shape[0]
score={}
llindex={}
# F_ratio test
###############################################
def f_ratio(chi_square_1,p1,chi_square_2,p2,n):
###############################################
"""
Return the F-ratio test CDF value.
Parameters
----------
chi_square_1, chi_square_2 : float
Chi-square values.
p1, p2 : int
Parameter counts.
n : int
Sample size.
Returns
-------
float
F CDF value.
"""
F=( (chi_square_1-chi_square_2)/(p2-p1) ) / (chi_square_2 / (n-p2) )
from scipy.stats import f
return(f.cdf(F,p2-p1,n-p2))
#
H_component={1:'North',2:'East',3:'Up'}
# find outlier
li=[]
if 'N' in lcomponent:li.append(1)
if 'E' in lcomponent:li.append(2)
if 'U' in lcomponent:li.append(3)
for i in sorted(li):
# if verbose:
# if i==1:print " => Testing component: North"
# if i==2:print " => Testing component: East"
# if i==3:print " => Testing component: Up"
index=np.where(np.abs(tmp_gts.data[:,i]-np.median(tmp_gts.data[:,i])) == np.max(np.abs(tmp_gts.data[:,i]-np.median(tmp_gts.data[:,i]))))
if verbose:
print(("-- suspected outlier at date %10.4lf on component %s " % (tmp_gts.data[index,0][0],H_component[i])))
tmp_gts_no_outlier=tmp_gts.copy()
tmp_gts_no_outlier.outliers=[index]
tmp_gts_no_outlier.remove_outliers(in_place=True)
chi_square_1=nn*np.std(tmp_gts.data[:,i])**2
chi_square_2=(nn-1)*np.std(tmp_gts_no_outlier.data[:,i])**2
score[i]=f_ratio(chi_square_1,1,chi_square_2,2,2*n)*100.0
print(("-- probability of outlier (F_ratio) %5.2lf%% " % (score[i])))
llindex[i]=index
# make decision
if np.max(list(score.values())) < conf_level:
if verbose:print("-- No significant outlier found")
return(self)
else:
# choose the outlier as the maximum probability
component=li[0]
current_score=score[component]
del li[0]
for i in li:
if score[i]>current_score:
current_score=score[i]
component=i
date=tmp_gts.data[llindex[component],0]
# return the index in the original time series
if verbose:
print("=> Getting index for date ",date)
returned_index=get_index_from_dates([date],self.data,tol=0.25)
self.outliers+=returned_index
return(self)
###############################################################################
[docs]
def find_outliers_percentage(self,percentage=0.03,in_place=False,verbose=False, component='NEU', periods=None,excluded_periods=None):
###############################################################################
"""
detrend a time series and
ranks the residuals by increasing absolute value
populate the outliers with the x % largest ones on each component
"""
n_to_remove=int(percentage*self.data.shape[0])
if n_to_remove < 1: n_to_remove=1
if verbose:print("-- Removing the ",n_to_remove," largest outliers (each time series)")
new_ts=self.detrend()
lindex_north=list(np.argsort(new_ts.data[:,1]**2))[-n_to_remove:]
lindex_east=list(np.argsort(new_ts.data[:,2]**2))[-n_to_remove:]
lindex_up=list(np.argsort(new_ts.data[:,3]**2))[-n_to_remove:]
lindex=[]
if 'N' in component:lindex=lindex+lindex_north
if 'E' in component:lindex=lindex+lindex_east
if 'U' in component:lindex=lindex+lindex_up
if periods and excluded_periods:
print("!!!! periods and excluded_periods provided. Possible overlap not checked.")
print("!!!! The program will run first on periods and then will exclude outliers in excluded_periods.")
if periods:
lkept_index=[]
for index in lindex:
for period in periods:
start_date_period=period[0]
end_date_period =period[1]
if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period:
lkept_index.append(index)
break
lindex=lkept_index
if excluded_periods:
lexcluded_index=[]
for index in lindex:
for period in excluded_periods:
start_date_period=period[0]
end_date_period =period[1]
if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period:
lexcluded_index.append(index)
break
lkept_index=[]
for index in lindex:
if index not in lexcluded_index:lkept_index.append()
lindex=lkept_index
new_Gts=self.copy()
if in_place:
self.outliers=list(set(lindex))
return(self)
del new_Gts
else:
new_Gts=self.copy()
new_Gts.outliers=list(set(lindex))
return(new_Gts)
###################################################################
## A very simple isolated outliers method
###################################################################
[docs]
def find_outliers_simple(self,threshold=100,window_length=10,in_place=False,verbose=False, component='NEU', periods=None,excluded_periods=None):
lindex_outlier=[]
for i in range(0,self.data.shape[0]-window_length):
window=self.data[i:i+window_length,1:4]*1000.
#print 'window',window
#print 'median ',np.median(window,axis=0)
residuals=np.abs(window-np.median(window,axis=0))
#print 'residuals',residuals
median_of_residuals=np.median(residuals,axis=0)
#print 'median of res ',median_of_residuals
for index in range(0,residuals.shape[0]):
#print residuals[index,0],median_of_residuals[0]
if residuals[index,0]>threshold*median_of_residuals[0]:
if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' N');lindex_outlier.append(i+index)
if residuals[index,1]>threshold*median_of_residuals[1]:
if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' E');lindex_outlier.append(i+index)
if residuals[index,2]>threshold*median_of_residuals[2]:
if (i+index) not in lindex_outlier:print('outlier at ',self.data[i+index,0],' U');lindex_outlier.append(i+index)
if periods and excluded_periods:
print("!!!! periods and excluded_periods provided. Possible overlap not checked.")
print("!!!! The program will run first on periods and then will exclude outliers in excluded_periods.")
if periods:
lkept_index=[]
for index in lindex_outlier:
for period in periods:
start_date_period=period[0]
end_date_period =period[1]
if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period:
lkept_index.append(index)
break
lindex_outlier=lkept_index
if excluded_periods:
lexcluded_index=[]
for index in lindex_outlier:
for period in excluded_periods:
start_date_period=period[0]
end_date_period =period[1]
if self.data[index,0]>=start_date_period and self.data[index,0]<=end_date_period:
lexcluded_index.append(index)
break
lkept_index=[]
for index in lindex_outlier:
if index not in lexcluded_index:lkept_index.append()
lindex_outlier=lkept_index
new_Gts=self.copy()
if in_place:
self.outliers=list(set(lindex_outlier))
return(self)
del new_Gts
else:
new_Gts=self.copy()
new_Gts.outliers=list(set(lindex_outlier))
return(new_Gts)
###################################################################
## find_outliers and offsets using differenciation
###################################################################
[docs]
def find_outliers_and_offsets_through_differentiation(self,th=100):
"""
find outliers and offsets using differenciation
"""
loutlier={}
loffset={}
new_Gts=self.differentiate()
median=np.median(np.fabs(new_Gts.data),axis=0)
lindex_max=np.argmax(new_Gts.data,axis=0)
# test each component
for component in (1,2,3):
loutlier[component]=[]
loffset[component]=[]
index=lindex_max[component]
#print 'th*median[component] ', th*median[component]
if np.fabs(new_Gts.data[index,component])> th*median[component]:
print("Suspect either an outlier or an offset at ",index, 'date ',new_Gts.data[index,0], ' on component ',component)
# case beginning of the ts => This can only be an outlier
if index==0:
# then the outlier could be either the first or the second record of the time series
if np.fabs(new_Gts.data[1,component]) > th*median[component]:
# the outlier is the second record
loutlier[component].append(1)
else:
# the outlier is the first
loutlier[component].append(1)
continue
# case end of the ts => This can only be an outlier
if index==len(new_Gts.data[:,0])-1:
# then the outlier could be either the last or the ante-last record of the time series
if np.fabs(new_Gts.data[len(new_Gts.data[:,0])-2,component]) > th*median[component]:
# the outlier is the second last record
loutlier[component].append(len(new_Gts.data[:,0])-2)
else:
# the outlier is the first
loutlier[component].append(len(new_Gts.data[:,0])-1)
continue
# regular case
# an isolated outlier should have two opposite AND large successive values in the differenciated time series
# offset only has only one large value
if np.fabs(new_Gts.data[index-1,component])< 2*median[component] and np.fabs(new_Gts.data[index+1,component])< 2*median[component]:
# this is a jump
loffset[component].append(new_Gts.data[index,0])
print("-- Found offset on component ",component, " at ",new_Gts.data[index,0])
else:
# this is an outlier
if np.fabs(new_Gts.data[index-1,0])>np.fabs(new_Gts.data[index+1,0]):
# sequence of index-1,index large differenciation
index=index-1
# else the sequence is index,index+1
# we now test 2 criterions: similar differenciated value and
# opposite signs
test_large_difference=False
test_opposite_sign=False
if np.fabs(np.fabs(new_Gts.data[index,0])-np.fabs(new_Gts.data[index+1,0]))< 4*median[component]:test_large_difference=True
if new_Gts.data[index,0]*new_Gts.data[index,0]:test_opposite_sign=True
if test_large_difference and test_opposite_sign:
print("Found isolated outlier at : ",index)
loutlier[component].append(index)
else:
print("Not sure what this is at date ",new_Gts.data[index,0])
for outlier in loutlier[1]+loutlier[2]+loutlier[3]:
if self.outliers:
if outlier not in self.outliers:self.outliers.append(outlier)
else:
self.outliers=[outlier]
for offset in loffset[1]+loffset[2]+loffset[3]:
if self.offsets:
if offset not in self.offsets:self.offsets.append(offset)
else:
self.offsets=[offset]
##################################################################
## search the outliers by the time series of rms
##################################################################
[docs]
def find_outliers_by_RMS_ts(self,ndays=7,th_detection=5,th_rejection=2):
"""
Find index of outliers in a time series, populate self.outliers.
- rms time series are first calculated over ndays
- time windows are kept for further inspection if rms(t) > th_detection * median(rms(ts))
- for each anomalous time windows, differentiate positions, find the max
- test whether it is a true outlier (differentiated(t) > th_rejection * median(differentiated))
output:
None
"""
data_rms = self.rms(ndays)
loutlier = []
for i in (1,2,3):
loutlier_i = []
out_rms = np.argwhere(data_rms[:,i] > th_detection*np.median(data_rms[:,i]))
if len(out_rms) > 0: out_rms = np.hstack(out_rms)
for j in out_rms:
differentiate_ndays = np.hstack(np.fabs(np.diff(self.data[j:(j+ndays),i])))
out_threshold = th_rejection*np.median(differentiate_ndays)
index = np.argwhere(differentiate_ndays > out_threshold)
if len(index) == 2:
### 1 outlier
if (index[0] == index[1]-1): loutlier_i.append(index[1] + j)
#### 2 continuous outliers
elif (index[0] == index[1]-2):
loutlier_i.append(index[0] + 1 + j)
loutlier_i.append(index[1] + j)
if len(loutlier_i)!=0:
loutlier_i = np.hstack(loutlier_i)
for i in np.unique(loutlier_i):
if len(np.argwhere(loutlier_i == i)) == ndays-2: loutlier.append(i)
elif len(np.argwhere(loutlier_i == i)) == ndays-3: loutlier.append(i)
if self.outliers:
for outlier in loutlier:
if outlier not in self.outliers:self.outliers.append(outlier)
else:
self.outliers=loutlier
##################################################################
## search the outliers by the trendline
##################################################################
###############################################################################
[docs]
def find_outliers_by_residuals(self,threshold=5,model='detrend_seasonal',component='NE',in_place=False):
###############################################################################
"""
Find index of outliers by trendline/trendline_annual/trendline_seasonal (the complete model)
Then the outliers are detected if their residuals are greater than th_rejection*standard_deviation
output:
Add the list of outlier index into self.outliers
"""
try:
detrended=self.make_model(option=model).data
except:
try:
detrended=self.make_model(option='detrend').data
except:
detrended=self.data.copy()
# print detrended.data
# print detrended.data[:,1:4]
# print np.diff(detrended.data[:,1:4],axis=0)
# print np.abs(np.diff(detrended.data[:,1:4],axis=0),axis=0)
# print np.median(np.abs(np.diff(detrended.data[:,1:4],axis=0),axis=0),axis=0)
[median_north,median_east,median_up]=np.median(np.abs(np.diff(detrended[:,1:4],axis=0)),axis=0)
lindex_north=[]
lindex_east=[]
lindex_up=[]
if 'N' in component:
lindex_north=np.where(np.abs(detrended[:,1]) > threshold*median_north)[0].tolist()
if 'E' in component:
lindex_east=np.where(np.abs(detrended[:,2]) > threshold*median_east)[0].tolist()
if 'U' in component:
lindex_up=np.where(np.abs(detrended[:,3]) > threshold*median_up)[0].tolist()
loutliers=self.outliers+list(set(lindex_north+lindex_east+lindex_up))
new_gts=self.copy()
new_gts.outliers=loutliers
if in_place:
self.data=new_gts.data
else:
return(new_gts)
###############################################################################
[docs]
def find_outliers_vondrak(self, threshold=10, fc=2., in_place=False,verbose=True, \
periods=[[]],excluded_periods=[[]], component='NE'):
###############################################################################
"""
Find outliers using a Vondrak filter
"""
# init
loutliers_dates = []
lindex_north=[]
lindex_east=[]
lindex_up=[]
# keep selected period
tmp_ts=self.extract_periods(periods).exclude_periods(excluded_periods).detrend()
# calculates vondrak filter
vondrak_ts = tmp_ts.vondrak(fc,component=component,verbose=verbose)
# calculates the residual time series
residual_ts = tmp_ts.copy()
residual_ts.data[:,1] = tmp_ts.data[:,1] - vondrak_ts.data[:,1]
residual_ts.data[:,2] = tmp_ts.data[:,2] - vondrak_ts.data[:,2]
residual_ts.data[:,3] = tmp_ts.data[:,3] - vondrak_ts.data[:,3]
# get the median
[median_north,median_east,median_up]=np.median(np.abs( residual_ts.data[:,1:4]) , axis=0)
# get the outliers
if 'N' in component:
lindex_north=np.where(np.abs(residual_ts.data[:,1]) > threshold*median_north)[0].tolist()
if 'E' in component:
lindex_east=np.where(np.abs(residual_ts.data[:,2]) > threshold*median_east)[0].tolist()
if 'U' in component:
lindex_up=np.where(np.abs(residual_ts.data[:,3]) > threshold*median_up)[0].tolist()
loutliers=list(set(lindex_north+lindex_east+lindex_up))
if verbose:print(("-- Outliers detection using Vondrak filter with fc=%.2lf : %03d new outliers detected" % (fc,len(loutliers))))
# get the outliers dates
loutliers_dates+=tmp_ts.data[loutliers,0].tolist()
# get outliers index in original time series
if loutliers != []:
loutliers_index=get_index_from_dates(loutliers_dates,self.data,tol=0.25)
else:
loutliers_index=self.outliers
# return
new_Gts=self.copy()
if in_place:
self.outliers=loutliers_index
return(self)
del new_Gts
else:
new_Gts=self.copy()
new_Gts.outliers=loutliers_index
return(new_Gts)
###############################################################################
[docs]
def find_outliers_sliding_window(self,\
threshold=3,in_place=False,verbose=True, \
periods=[[]],excluded_periods=[[]], component='NE',window_len=15,automatic=True):
###############################################################################
"""
Find outliers using sliding windows
"""
lindex_north=[]
lindex_east=[]
lindex_up=[]
if self.data.shape[0] > window_len:
itermax=5
lindex_north=[]
lindex_east=[]
lindex_up=[]
OK=True
loutliers=[]
loutliers_dates=[]
i=0
smooth=self.extract_periods(periods).exclude_periods(excluded_periods).smooth(window_len=window_len)
new_ts=self.extract_periods(periods).exclude_periods(excluded_periods)
residual_ts=self.extract_periods(periods).exclude_periods(excluded_periods)
residual_ts.data[:,1:4]=new_ts.data[:,1:4]-smooth.data[:,1:4]
diff_data=np.diff(self.data[:,1:4],n=1,axis=0)
[median_north,median_east,median_up]=np.median(np.abs(diff_data),axis=0)
while OK:
if 'N' in component:
lindex_north=np.where(np.abs(residual_ts.data[:,1]) > threshold*median_north)[0].tolist()
if 'E' in component:
lindex_east=np.where(np.abs(residual_ts.data[:,2]) > threshold*median_east)[0].tolist()
if 'U' in component:
lindex_up=np.where(np.abs(residual_ts.data[:,3]) > threshold*median_up)[0].tolist()
loutliers=list(set(lindex_north+lindex_east+lindex_up))
if verbose:print(("-- Outliers detection pass #%02d : %03d new outliers detected" % (i,len(loutliers))))
#print loutliers_dates,new_ts.data[loutliers,0].tolist()
loutliers_dates+=new_ts.data[loutliers,0].tolist()
if loutliers==[]:OK=False
i+=1
if i>itermax:OK=False
smooth=self.extract_periods(periods).exclude_periods([[]]).smooth(window_len=window_len)
new_ts.outliers=loutliers
new_ts=new_ts.remove_outliers()
smooth=new_ts.smooth(window_len=window_len)
residual_ts=new_ts.copy()
residual_ts.data[:,1:4]=new_ts.data[:,1:4]-smooth.data[:,1:4]
diff_data=np.diff(self.data[:,1:4],n=1,axis=0)
[median_north,median_east,median_up]=np.median(np.abs(diff_data),axis=0)
if verbose:print("-- ",len(loutliers_dates)," outliers found")
loutliers_index=get_index_from_dates(loutliers_dates,self.data,tol=0.25)
else:
loutliers_index=self.outliers
new_Gts=self.copy()
if in_place:
self.outliers=loutliers_index
return(self)
del new_Gts
else:
new_Gts=self.copy()
new_Gts.outliers=loutliers_index
return(new_Gts)