Source code for pyacs.gts.lib.outliers.find_outliers_around_date

###################################################################
[docs] def find_outliers_around_date(self, date, conf_level=95, n=3, lcomponent='NE', verbose=True): ################################################################### """ Find an outlier around a given date using F-ratio test. Returns the index of the outlier, or [] if none found. Parameters ---------- date : float Date in decimal year. conf_level : float, optional Confidence level for F-ratio test (default 95). n : int, optional Number of dates on each side of date (default 3). lcomponent : str, optional Components to test: 'N', 'E', 'U', 'NE', 'NEU' (default 'NE'). verbose : bool, optional Verbose mode. Returns ------- Gts or list self with outlier flagged, or [] if no significant outlier. """ # import import numpy as np from pyacs.gts.Gts import get_index_from_dates if verbose: print(( "-- Searching outlier around date %10.5lf on components %s with confidence level %6.1lf and %02d samples" % ( date, lcomponent, conf_level, 2 * n))) # self.info() tmp_gts = self.detrend().remove_outliers().extract_ndates_around_date(date, n) nn = tmp_gts.data.shape[0] score = {} llindex = {} # F_ratio test ############################################### def f_ratio(chi_square_1, p1, chi_square_2, p2, n): ############################################### """ Return the F-ratio test CDF value (probability). Parameters ---------- chi_square_1, chi_square_2 : float Chi-square values. p1, p2 : int Number of parameters. n : int Sample size. Returns ------- float F CDF value. """ F = ((chi_square_1 - chi_square_2) / (p2 - p1)) / (chi_square_2 / (n - p2)) from scipy.stats import f return (f.cdf(F, p2 - p1, n - p2)) # H_component = {1: 'North', 2: 'East', 3: 'Up'} # find outlier li = [] if 'N' in lcomponent: li.append(1) if 'E' in lcomponent: li.append(2) if 'U' in lcomponent: li.append(3) for i in sorted(li): # if verbose: # if i==1:print " => Testing component: North" # if i==2:print " => Testing component: East" # if i==3:print " => Testing component: Up" index = np.where(np.abs(tmp_gts.data[:, i] - np.median(tmp_gts.data[:, i])) == np.max( np.abs(tmp_gts.data[:, i] - np.median(tmp_gts.data[:, i])))) if verbose: print( ("-- suspected outlier at date %10.4lf on component %s " % (tmp_gts.data[index, 0][0], H_component[i]))) tmp_gts_no_outlier = tmp_gts.copy() tmp_gts_no_outlier.outliers = [index] tmp_gts_no_outlier.remove_outliers(in_place=True) chi_square_1 = nn * np.std(tmp_gts.data[:, i]) ** 2 chi_square_2 = (nn - 1) * np.std(tmp_gts_no_outlier.data[:, i]) ** 2 score[i] = f_ratio(chi_square_1, 1, chi_square_2, 2, 2 * n) * 100.0 print(("-- probability of outlier (F_ratio) %5.2lf%% " % (score[i]))) llindex[i] = index # make decision if np.max(list(score.values())) < conf_level: if verbose: print("-- No significant outlier found") return (self) else: # choose the outlier as the maximum probability component = li[0] current_score = score[component] del li[0] for i in li: if score[i] > current_score: current_score = score[i] component = i date = tmp_gts.data[llindex[component], 0] # return the index in the original time series if verbose: print("=> Getting index for date ", date) returned_index = get_index_from_dates([date], self.data, tol=0.25) self.outliers += returned_index return (self)