# There are several ways of finding outliers. All of the techniques shown
# here work when the sample is normally distributed. In every example the
# function returns the source frame with the outliers replaced by NaN, the
# descriptive statistics of the sample before and after cleaning, and the
# positional numbers of the outlier rows; ``alpha`` is the detection
# threshold.
#
# The simplest method is based on the mean and the standard deviation: once
# the sample is scaled so that its mean is zero and its deviation is one,
# the elements that stand out from zero farther than the threshold allows
# can be treated as outliers.
def Maxstd(df, alpha=0.97):
    """Iteratively strip outliers from a normally distributed sample.

    On every pass the frame is z-scaled, the row farthest from the origin
    of the scaled space is located, and it is flagged as an outlier unless
    removing it would leave every column's standard deviation at or above
    ``alpha`` (i.e. removing it no longer noticeably tightens the sample).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric sample, one observation per row.
    alpha : float, optional
        Stopping threshold on the post-removal standard deviation.

    Returns
    -------
    tuple
        ``(ResArray, Result, numbers)`` — the frame with outlier rows set
        to NaN, a before/after summary-statistics frame, and the positional
        indices of the outlier rows.
    """
    X = df
    numbers = np.empty(0, dtype=int)
    ResArray = X.copy()
    OperatingArray = X.copy()
    while len(numbers) < len(X):  # guard: cannot flag more rows than exist
        m = OperatingArray.mean()
        s = OperatingArray.std()
        # Guard: a zero (or undefined) spread would make the scaling divide
        # by zero once enough rows have been replaced by the mean.
        if (s == 0).any() or s.isnull().any():
            break
        XX = (OperatingArray - m) / s
        # Euclidean distance of every scaled row to the origin — the same
        # quantity paired_distances(XX, zeros) produced in the original.
        L = np.sqrt((XX ** 2).values.sum(axis=1))
        maxindex = int(np.argmax(L))
        probe = XX.copy()
        probe.iloc[maxindex] = np.nan
        # If dropping the farthest row barely shrinks any column's std,
        # there is no outlier left worth removing.
        if probe.std().min() >= alpha:
            break
        numbers = np.append(numbers, maxindex)
        # Neutralise the outlier: replace it with the current column means
        # so that later iterations are not skewed by it.
        OperatingArray.iloc[maxindex] = m
    ResArray.iloc[numbers] = np.nan
    # Build the before/after summary in one shot (DataFrame.append was
    # removed in pandas 2.0).
    Result = pd.DataFrame([X.mean(), ResArray.mean(),
                           X.std(), ResArray.std(),
                           X.min(), ResArray.min(),
                           X.max(), ResArray.max()])
    Result.index = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
                    'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    return ResArray, Result, numbers
# Another method is based on computing a metric between the sample mean and
# every vector of the sample. The Mahalanobis distance is used as the
# example, but any metric that suits your sample can be substituted.
def Mahalanobis(df, alpha=0.9):
    """Flag rows whose Mahalanobis distance to the sample mean is extreme.

    A row is an outlier when its distance exceeds ``alpha`` times the
    largest distance observed in the sample (so the farthest row is always
    flagged).

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric sample, one observation per row.
    alpha : float, optional
        Fraction of the maximum distance used as the cut-off.

    Returns
    -------
    tuple
        ``(XX, Result, numbers)`` — the frame with outlier rows set to NaN,
        a before/after summary-statistics frame, and the positional indices
        of the outlier rows.
    """
    X = df
    mean = X.mean()
    diff = X.values.astype(float) - mean.values.astype(float)
    # Inverse sample covariance; pinv tolerates a singular matrix. The
    # covariance is estimated from X itself (the original let the metric
    # backend estimate it implicitly).
    VI = np.linalg.pinv(np.cov(X.values.astype(float), rowvar=False))
    # Row-wise quadratic form diag(diff @ VI @ diff.T); clip tiny negative
    # round-off before taking the square root.
    q = np.einsum('ij,jk,ik->i', diff, VI, diff)
    L = pd.DataFrame(np.sqrt(np.maximum(q, 0.0)))
    upper_bound = L.values.max() * alpha
    numbers = L.loc[L[0] > upper_bound]
    XX = X.copy()
    # L carries a fresh RangeIndex, so these labels are row positions.
    XX.iloc[numbers.index.values] = np.nan
    # Build the before/after summary in one shot (DataFrame.append was
    # removed in pandas 2.0).
    Result = pd.DataFrame([X.mean(), XX.mean(),
                           X.std(), XX.std(),
                           X.min(), XX.min(),
                           X.max(), XX.max()])
    Result.index = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
                    'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    return XX, Result, numbers.index.values
# The Resampling Half-Means (RHM) method repeatedly draws half of the
# vectors of the reference sample at random. Each draw yields a mean and a
# standard deviation, the whole sample is scaled with them, and for every
# vector the distance to zero is recorded. All of the recorded distances
# are then examined together: a row that exceeds the threshold in any draw
# is declared an outlier.
def RHM(df, alpha=0.9):
    """Resampling Half-Means outlier detection.

    Runs ``2 * len(df)`` random half-sample draws; rows whose scaled norm
    ever exceeds ``alpha`` times the overall maximum are flagged. Uses the
    global ``random`` module, so results depend on its seed state.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric sample, one observation per row (more than one row).
    alpha : float, optional
        Fraction of the maximum recorded distance used as the cut-off.

    Returns
    -------
    tuple
        ``(XX, Result, numbers)`` — the frame with outlier rows set to NaN,
        a before/after summary-statistics frame, and the positional indices
        of the outlier rows.
    """
    X = df
    L = pd.DataFrame()
    half = X.shape[0] // 2  # integer half: '/' would be a float on Python 3
    for i in range(X.shape[0] * 2):
        # Random half-sample; random.sample needs a concrete sequence.
        rows = random.sample(list(X.index), half)
        Xsami = X.loc[rows]
        # Scale the WHOLE sample with the half-sample's statistics.
        mi = Xsami.mean()
        si = Xsami.std()
        Xi = (X - mi) / si
        # Euclidean length of every scaled row (distance to the origin).
        L['X' + str(i)] = np.sqrt((Xi ** 2).values.sum(axis=1))
    upperBound = L.values.max() * alpha
    counts = L.gt(upperBound).sum(axis=1)
    # L has a fresh RangeIndex, so the flagged labels are row positions.
    flagged = np.flatnonzero(counts.values > 0)
    XX = X.copy()
    XX.iloc[flagged] = np.nan
    # Build the before/after summary in one shot (DataFrame.append was
    # removed in pandas 2.0).
    Result = pd.DataFrame([X.mean(), XX.mean(),
                           X.std(), XX.std(),
                           X.min(), XX.min(),
                           X.max(), XX.max()])
    Result.index = ['Mean (before)', 'Mean (after)', 'Standard deviation (before)', 'Standard deviation (after)', 'Minimum (before)',
                    'Minimum(after)', 'Maximum (before)', 'Maximum(after)']
    return XX, Result, flagged
# With real-time data, rather than a normally distributed sample, the task
# is more complicated. In that case a reservoir-style approach can be used:
# look at the absolute differences between the i-th element and the
# (i+1)-th, (i+2)-th, and so on. The code below was written to decide
# whether a column contains noise; refine it for your own task.
def find_minmax(column, max_lag=5):
    """Return the maximum absolute difference at each lag ``1..max_lag``.

    Parameters
    ----------
    column : array_like
        1-D series of readings (an ``(n, 1)`` column matrix is accepted
        too). Must contain more than ``max_lag`` elements.
    max_lag : int, optional
        Largest lag to examine; the default of 5 reproduces the original
        hard-coded behaviour.

    Returns
    -------
    list
        ``max(|x[i] - x[i + lag]|)`` for ``lag = 1 .. max_lag``.
    """
    values = np.asarray(column).ravel()
    # One comprehension replaces five copy-pasted loops: the lag-k
    # differences are the element-wise gap between the series and itself
    # shifted by k positions.
    return [np.amax(np.absolute(values[:-lag] - values[lag:]))
            for lag in range(1, max_lag + 1)]
# Train an SVM to tell noisy sensor columns from clean ones, using the
# lagged max-differences from find_minmax as the feature vector.
xls = pd.ExcelFile('noise_sensor.xlsx')
# 'sheet' is not a read_excel keyword; the parameter is 'sheet_name'.
df = pd.read_excel(io=xls, sheet_name=0, header=0)
df = df.drop(['A', 'B'], axis=1)
# One feature row per sensor column. `.iloc[:, i].values` replaces the
# removed DataFrame.as_matrix, and the comprehension replaces the
# append/delete scaffolding around a dummy zero row.
X_train = np.array([find_minmax(df.iloc[:, i].values) for i in range(16)])
print(X_train)
# Hand-made labels: 1 marks the columns known to contain noise.
Y_train = np.array([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
clf = SVC()
clf.fit(X_train, Y_train)
# The deprecated 'sheetname' keyword is also 'sheet_name' now.
test_df = pd.read_excel(io=xls, sheet_name=2, header=0)
test_df = test_df.drop(['A', 'B'], axis=1)
X_test = np.array([find_minmax(test_df.iloc[:, i].values) for i in range(16)])
print(clf.predict(X_test))