ad/ad_model/processing.py

   1 # ==================================================================================
   2 #  Copyright (c) 2020 HCL Technologies Limited.
   3 #
   4 #  Licensed under the Apache License, Version 2.0 (the "License");
   5 #  you may not use this file except in compliance with the License.
   6 #  You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 #  Unless required by applicable law or agreed to in writing, software
  11 #  distributed under the License is distributed on an "AS IS" BASIS,
  12 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 #  See the License for the specific language governing permissions and
  14 #  limitations under the License.
  15 # ==================================================================================
  16
  17 import pandas as pd
  18 import numpy as np
  19 import joblib
  20 from sklearn.preprocessing import Normalizer
  21
  22
  23 class PREPROCESS(object):
  24     r""" This PREPROCESS class takes raw data and apply prepocessing on to that.
  25
  26     Parameters
  27     ----------
  28     data: pandas dataframe
  29         input dataset to process in pandas dataframe
  30
  31     Attributes
  32     ----------
  33     data: DataFrame
  34         DataFrame that has processed data
  35     temp: list
  36         list of attributes to drop
  37     """
  38
  39     def __init__(self, data):
  40         """
  41            Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
  42         """
  43         self.temp = None
  44         self.data = data
  45
  46     def variation(self):
  47         """ drop the constant parameters """
  48         if len(self.data) > 1:
  49             self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
  50
  51     def numerical_data(self):
  52         """  Filters only numeric data types """
  53         numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  54         self.data = self.data.select_dtypes(include=numerics)
  55
  56     def drop_na(self):
  57         """ drop observations having nan values """
  58         self.data = self.data.dropna(axis=0)
  59
  60     def correlation(self):
  61         """  check and drop high correlation parameters  """
  62         corr = self.data.corr().abs()
  63         corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
  64         drop = [column for column in corr.columns if any(corr[column] > 0.98)]
  65         self.data = self.data.drop(drop, axis=1)
  66
  67     # check skewness of all parameters and use log transform if half of parameters are enough skewd
  68     # otherwise use standardization
  69     def transform(self):
  70         """ use normalizer transformation to bring all parameters in same scale """
  71         scale = Normalizer()  # StandardScaler()
  72         data = scale.fit_transform(self.data)
  73         self.data = pd.DataFrame(data, columns=self.data.columns)
  74         joblib.dump(scale, 'scale')
  75
  76     def process(self):
  77         """
  78           Calls the modules for the data preprocessing like dropping columns, normalization etc.,
  79         """
  80         temp = ['du-id', 'measTimeStampRf', 'ue-id', 'nrCellIdentity', 'targetTput', 'x', 'y']
  81         for col in self.data.columns:
  82             if 'nb' in col:
  83                 temp.append(col)
  84
  85         if set(temp).issubset(self.data.columns):
  86             self.temp = self.data[temp]
  87             self.data = self.data.drop(temp, axis=1)
  88         self.numerical_data()
  89         self.drop_na()
  90         self.variation()
  91         self.correlation()
  92         self.transform()
  93         return self.data