1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
20 from sklearn.preprocessing import Normalizer
class PREPROCESS(object):
    r"""This PREPROCESS class takes raw data and applies preprocessing to it.

    Parameters
    ----------
    data : pandas dataframe
        input dataset to process in pandas dataframe

    Attributes
    ----------
    data : pandas dataframe
        DataFrame that has processed data
    temp : list
        list of attributes to drop
    """
39 def __init__(self, data):
41 Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
47 """ drop the constant parameters """
48 if len(self.data) > 1:
49 self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
51 def numerical_data(self):
52 """ Filters only numeric data types """
53 numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
54 self.data = self.data.select_dtypes(include=numerics)
        # NOTE(review): the enclosing `def` line is missing from this chunk of
        # the file — the two statements below are the body of a NaN-dropping
        # method; confirm its name against the full source.
        """ drop observations having nan values """
        # axis=0 drops whole rows (observations) that contain any NaN value
        self.data = self.data.dropna(axis=0)
60 def correlation(self):
61 """ check and drop high correlation parameters """
62 corr = self.data.corr().abs()
63 corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
64 drop = [column for column in corr.columns if any(corr[column] > 0.98)]
65 self.data = self.data.drop(drop, axis=1)
    # TODO: check skewness of all parameters and use log transform if half of
    # the parameters are skewed enough; otherwise use standardization
    # NOTE(review): the `def` line for this method is missing from this chunk —
    # the statements below are a scaling-method body; confirm its name against
    # the full source.
        """ use normalizer transformation to bring all parameters in same scale """
        scale = Normalizer()  # alternative considered: StandardScaler()
        # fit_transform returns a bare ndarray, so rebuild a DataFrame carrying
        # the original column labels
        data = scale.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        # persist the fitted scaler to the file 'scale' — presumably so the same
        # scaling can be re-applied at prediction time; verify against callers
        joblib.dump(scale, 'scale')
78 Calls the modules for the data preprocessing like dropping columns, normalization etc.,
80 temp = ['du-id', 'measTimeStampRf', 'ue-id', 'nrCellIdentity', 'targetTput', 'x', 'y']
81 for col in self.data.columns:
85 if set(temp).issubset(self.data.columns):
86 self.temp = self.data[temp]
87 self.data = self.data.drop(temp, axis=1)