# ==================================================================================
#   Copyright (c) 2020 HCL Technologies Limited.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#          http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
# ==================================================================================
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import Normalizer


class PREPROCESS(object):
    r"""Takes raw data and applies preprocessing to it.

    Parameters
    ----------
    data : pandas.DataFrame
        Input dataset to process.

    Attributes
    ----------
    data : pandas.DataFrame
        The processed data.
    """

    def __init__(self, data):
        """Store the input DataFrame and convert the downlink throughput
        from GB to MB. Columns that are not useful for prediction
        (e.g. UEID, Category, and Timestamp) are dropped later, in process().
        """
        self.data = data
        self.convert_gb_to_mb()

    def variation(self):
        """Drop constant parameters (columns with a single unique value)."""
        if len(self.data) > 1:
            self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]

    def convert_gb_to_mb(self):
        """Convert DRB.UEThpDl from GB to MB (1 GB = 1024 MB)."""
        # Assign through a single indexer; the original chained
        # self.data.iloc[:]['DRB.UEThpDl'] assignment can silently write to a
        # copy and trigger SettingWithCopyWarning.
        self.data['DRB.UEThpDl'] = self.data['DRB.UEThpDl'] * 1024

    def numerical_data(self):
        """Keep only numeric columns."""
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.data = self.data.select_dtypes(include=numerics)

    def drop_na(self):
        """Drop observations (rows) containing NaN values."""
        self.data = self.data.dropna(axis=0)

    def correlation(self):
        """Drop one column from each pair of highly correlated parameters
        (absolute correlation above 0.98).
        """
        corr = self.data.corr().abs()
        # Keep only the strict lower triangle so each pair is inspected once.
        corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
        drop = [column for column in corr.columns if any(corr[column] > 0.98)]
        self.data = self.data.drop(drop, axis=1)

    # TODO: check the skewness of all parameters and use a log transform if at
    # least half of them are sufficiently skewed; otherwise use standardization.
    def fit_transform(self):
        """Fit a Normalizer (scales each sample to unit norm) to bring all
        parameters onto the same scale, and persist it for later use.
        """
        scale = Normalizer().fit(self.data)
        joblib.dump(scale, 'src/scale')

    def transform(self):
        """Load the persisted Normalizer and apply it to the data."""
        scale = joblib.load('src/scale')
        self.data = pd.DataFrame(scale.transform(self.data), columns=self.data.columns)

    def save_cols(self):
        """Persist the names of the retained columns for use at prediction time."""
        joblib.dump(self.data.columns, 'src/num_params')

    def process(self):
        """Run the preprocessing steps in order: drop identifier/label columns,
        keep numeric columns, drop NaNs, remove constant and highly correlated
        parameters, then fit and apply the Normalizer.
        """
        temp = []
        for col in self.data.columns:
            if 'nb' in col or 'Geo' in col or 'anomal' in col or 'target' in col:
                temp.append(col)
        self.data = self.data.drop(temp, axis=1)
        self.numerical_data()
        self.drop_na()
        self.variation()
        self.correlation()
        self.fit_transform()
        self.transform()
        self.save_cols()
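

# ----------------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal, hedged example of
# driving PREPROCESS end to end. The toy column names below are assumptions chosen
# only so that convert_gb_to_mb() has a 'DRB.UEThpDl' column to rescale and the
# drop loop in process() has 'nb'/'target' columns to remove; the 'src/' directory
# is created first because fit_transform(), transform() and save_cols() persist
# their artifacts there.
# ----------------------------------------------------------------------------------
if __name__ == '__main__':
    import os

    os.makedirs('src', exist_ok=True)  # target directory for 'scale'/'num_params'

    # Hypothetical toy sample: downlink throughput in GB plus columns that
    # process() is expected to drop.
    raw = pd.DataFrame({
        'DRB.UEThpDl': [1.2, 0.8, 2.4, 1.9],        # converted to MB in __init__
        'RF.serving.RSRP': [-80.0, -95.0, -70.0, -88.0],
        'nbCellIdentity_0': [101, 102, 103, 104],   # dropped: name contains 'nb'
        'target': [0, 1, 0, 0],                     # dropped: name contains 'target'
    })

    prep = PREPROCESS(raw)
    prep.process()
    print(prep.data.head())  # normalized numeric parameters only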