# ==================================================================================
#  Copyright (c) 2020 HCL Technologies Limited.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==================================================================================
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
class PREPROCESS(object):
    r""" This PREPROCESS class takes raw data and applies preprocessing on to that.

    Parameters
    ----------
    data: pandas dataframe
        input dataset to process in pandas dataframe

    Attributes
    ----------
    data: DataFrame
        DataFrame that has processed data
    temp: list
        list of attributes to drop
    """

    def __init__(self, data):
        """
        Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
        """
        # NOTE(review): the docstring delimiters and the `self.data` assignment
        # were lost in extraction; restored here — confirm against version control.
        self.data = data
        # Unit conversion must run before any other preprocessing step, since
        # later steps operate on the converted throughput values.
        self.convert_gb_to_mb()
47 """ drop the constant parameters """
48 if len(self.data) > 1:
49 self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
51 def convert_gb_to_mb(self):
52 self.data.iloc[:]['DRB.UEThpDl'] = self.data['DRB.UEThpDl'].apply(lambda x: x*1024)
54 def numerical_data(self):
55 """ Filters only numeric data types """
56 numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
57 self.data = self.data.select_dtypes(include=numerics)
60 """ drop observations having nan values """
61 self.data = self.data.dropna(axis=0)
63 def correlation(self):
64 """ check and drop high correlation parameters """
65 corr = self.data.corr().abs()
66 corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
67 drop = [column for column in corr.columns if any(corr[column] > 0.98)]
68 self.data = self.data.drop(drop, axis=1)
70 # check skewness of all parameters and use log transform if half of parameters are enough skewd
71 # otherwise use standardization
72 def fit_transform(self):
73 """ use normalizer transformation to bring all parameters in same scale """
74 scale = Normalizer().fit(self.data)
75 joblib.dump(scale, 'src/scale')
78 scale = joblib.load('src/scale')
79 self.data = pd.DataFrame(scale.transform(self.data), columns=self.data.columns)
82 joblib.dump(self.data.columns, 'src/num_params')
86 Calls the modules for the data preprocessing like dropping columns, normalization etc.,
89 for col in self.data.columns:
90 if 'nb' in col or 'Geo' in col or 'anomal' in col or 'target' in col:
92 self.data = self.data.drop(temp, axis=1)