1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
import joblib
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
class preprocess(object):
    """Prepare a RAN measurement DataFrame for anomaly-detection training.

    The input frame is expected to carry a ``UEID`` and a
    ``MeasTimestampRF`` column (both set aside in ``__init__`` and
    re-attached by ``process``); every other column is treated as a
    candidate numeric feature.

    NOTE(review): this file reached review with the docstring delimiters
    and several ``def`` headers missing (it did not parse).  The method
    boundaries and names below are reconstructed from the surviving
    bodies -- confirm against the original repository before relying on
    the public method names.
    """

    def __init__(self, data):
        """Stash the identifier columns and drop them from the features.

        Columns that are not useful for the prediction (UEID,
        MeasTimestampRF) are kept on the instance so ``process`` can
        re-attach them to the cleaned frame.
        """
        self.id = data.UEID                 # re-attached in process(); was lost in the corrupted source
        self.time = data.MeasTimestampRF    # re-attached in process()
        self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)

    def variation(self):
        """Drop constant columns (a single unique value carries no signal)."""
        self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]

    def numerical_data(self):
        """Keep only numeric columns; every non-numeric dtype is discarded."""
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.data = self.data.select_dtypes(include=numerics)

    def drop_na(self):
        """Drop observations (rows) that contain any NaN value."""
        self.data = self.data.dropna(axis=0)

    def correlation(self):
        """Drop one column out of every highly correlated (>0.98) pair."""
        corr = self.data.corr().abs()
        # Strictly-lower triangle (k=-1): each pair is considered once and a
        # column is never compared against itself, so exactly one member of
        # a correlated pair is dropped.
        corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
        drop = [column for column in corr.columns if any(corr[column] > 0.98)]
        self.data = self.data.drop(drop, axis=1)

    def transform(self):
        """Standardize the features and persist the fitted scaler.

        TODO(review): the surviving comments mention checking skewness and
        log-transforming "if half of parameters are enough skewed", but only
        plain standardization is implemented (``scipy.stats.skew`` is
        imported and unused).  Confirm whether the log-transform branch was
        intended.
        """
        scale = StandardScaler()
        data = scale.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        # Saved so the inference path can apply the identical scaling.
        # NOTE(review): assumes /tmp/ad already exists -- joblib.dump does
        # not create missing directories.
        joblib.dump(scale, '/tmp/ad/scale')

    def normalize(self):
        """Min-max scale every column into [0, 1]."""
        upper = self.data.max()
        lower = self.data.min()
        self.data = (self.data - lower) / (upper - lower)

    def process(self):
        """Run the full preprocessing pipeline, then re-attach identifiers.

        NOTE(review): the call sequence was lost in the corrupted source and
        is reconstructed here; ``normalize`` is left available but not
        invoked -- confirm the intended pipeline order.
        """
        self.numerical_data()
        self.drop_na()
        self.variation()
        self.correlation()
        self.transform()
        self.data.loc[:, 'UEID'] = self.id
        self.data.loc[:, 'MeasTimestampRF'] = self.time