4 from sklearn.preprocessing import StandardScaler
class preprocess(object):
    """
    Preprocessing pipeline for measurement data: sets aside the UEID and
    MeasTimestampRF columns, then cleans the remaining feature columns
    (numeric-only selection, NaN-row removal, constant and highly-correlated
    column removal, standard scaling) before re-attaching the identifiers.
    """
9 def __init__(self, data):
11 Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
14 self.time = data.MeasTimestampRF
15 self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)
        """ drop the constant parameters """
        # A column with nunique == 1 is constant and carries no information
        # for the model, so only columns with more than one distinct value
        # are kept. (NOTE(review): the enclosing `def` line is outside this
        # view — confirm the method name against the full file.)
        self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
21 def numerical_data(self):
22 """ Filters only numeric data types """
23 numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
24 self.data = self.data.select_dtypes(include=numerics)
        """ drop observations having nan values """
        # axis=0 removes whole rows (observations) containing at least one NaN,
        # never columns. (NOTE(review): the enclosing `def` line is outside
        # this view — confirm the method name against the full file.)
        self.data = self.data.dropna(axis=0)
30 def correlation(self):
31 """ check and drop high correlation parameters """
32 corr = self.data.corr().abs()
33 corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
34 drop = [column for column in corr.columns if any(corr[column] > 0.98)]
35 self.data = self.data.drop(drop, axis=1)
    # NOTE(review): the comment below describes a planned skew-based choice
    # between log transform and standardization, but the visible code always
    # standardizes — confirm against the full file before relying on it.
    # check skewness of all parameters and use log transform if half of parameters are enough skewd
    # otherwise use standardization
        """ use log transform for skewed data """
        scale = StandardScaler()
        # column-wise z-score: (x - mean) / std; fit on the current features
        data = scale.fit_transform(self.data)
        # fit_transform returns a bare ndarray — rebuild the frame to restore
        # the column labels (the original row index is replaced by a default one)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        # persist the fitted scaler so the same scaling can be re-applied later;
        # presumably 'ad/' is a package-relative artifact directory — verify,
        # and confirm `joblib` is imported at the top of the full file.
        joblib.dump(scale, 'ad/scale')
        Calls the modules for the data preprocessing like dropping columns, normalization etc.,
        # Re-attach the identifier columns that were set aside before cleaning.
        # NOTE(review): `self.time` is assigned in __init__; `self.id` is not
        # visible in this chunk and presumably comes from __init__ as well —
        # confirm against the full file.
        self.data.loc[:, 'UEID'] = self.id
        self.data.loc[:, 'MeasTimestampRF'] = self.time