# ==================================================================================
#  Copyright (c) 2020 HCL Technologies Limited.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==================================================================================
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
class PREPROCESS(object):
    r""" This PREPROCESS class takes raw data and applies preprocessing on to that.

    Parameters
    ----------
    data: pandas dataframe
        input dataset to process in pandas dataframe

    Attributes
    ----------
    data: DataFrame
        DataFrame that has processed data
    temp: list
        list of attributes to drop
    """

    def __init__(self, data):
        """
        Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
        """
        # NOTE(review): the docstring delimiters and the `self.data` assignment
        # were lost in extraction; restored here — confirm against version control.
        self.data = data
        # Unit conversion must run before any other preprocessing step, since
        # later steps operate on the converted throughput values.
        self.convert_gb_to_mb()
47 """ drop the constant parameters """
48 if len(self.data) > 1:
49 self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
51 def convert_gb_to_mb(self):
52 self.data.iloc[:]['DRB.UEThpDl'] = self.data['DRB.UEThpDl'].apply(lambda x: x*1024)
54 def numerical_data(self):
55 """ Filters only numeric data types """
56 numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
57 self.data = self.data.select_dtypes(include=numerics)
60 """ drop observations having nan values """
61 self.data = self.data.dropna(axis=0)
63 def correlation(self):
64 """ check and drop high correlation parameters """
65 corr = self.data.corr().abs()
66 corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
67 drop = [column for column in corr.columns if any(corr[column] > 0.98)]
68 self.data = self.data.drop(drop, axis=1)
70 # check skewness of all parameters and use log transform if half of parameters are enough skewd
71 # otherwise use standardization
72 def fit_transform(self):
73 """ use normalizer transformation to bring all parameters in same scale """
74 scale = Normalizer().fit(self.data)
75 joblib.dump(scale, 'src/scale')
78 scale = joblib.load('src/scale')
79 self.data = pd.DataFrame(scale.transform(self.data), columns=self.data.columns)
82 joblib.dump(self.data.columns, 'src/num_params')
86 Calls the modules for the data preprocessing like dropping columns, normalization etc.,
89 for col in self.data.columns:
90 if 'nb' in col or 'Geo' in col or 'anomal' in col or 'target' in col:
92 self.data = self.data.drop(temp, axis=1)