1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
import joblib
import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
class preprocess(object):
    """Prepare a RAN measurement DataFrame for anomaly-detection training.

    The input frame is expected to carry a ``UEID`` and a
    ``MeasTimestampRF`` column (both set aside in ``__init__`` and
    re-attached by ``process``); every other column is treated as a
    candidate numeric feature.

    NOTE(review): this file reached review with the docstring delimiters
    and several ``def`` headers missing (it did not parse).  The method
    boundaries and names below are reconstructed from the surviving
    bodies -- confirm against the original repository before relying on
    the public method names.
    """

    def __init__(self, data):
        """Stash the identifier columns and drop them from the features.

        Columns that are not useful for the prediction (UEID,
        MeasTimestampRF) are kept on the instance so ``process`` can
        re-attach them to the cleaned frame.
        """
        self.id = data.UEID                 # re-attached in process(); was lost in the corrupted source
        self.time = data.MeasTimestampRF    # re-attached in process()
        self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)

    def variation(self):
        """Drop constant columns (a single unique value carries no signal)."""
        self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]

    def numerical_data(self):
        """Keep only numeric columns; every non-numeric dtype is discarded."""
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.data = self.data.select_dtypes(include=numerics)

    def drop_na(self):
        """Drop observations (rows) that contain any NaN value."""
        self.data = self.data.dropna(axis=0)

    def correlation(self):
        """Drop one column out of every highly correlated (>0.98) pair."""
        corr = self.data.corr().abs()
        # Strictly-lower triangle (k=-1): each pair is considered once and a
        # column is never compared against itself, so exactly one member of
        # a correlated pair is dropped.
        corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
        drop = [column for column in corr.columns if any(corr[column] > 0.98)]
        self.data = self.data.drop(drop, axis=1)

    def transform(self):
        """Standardize the features and persist the fitted scaler.

        TODO(review): the surviving comments mention checking skewness and
        log-transforming "if half of parameters are enough skewed", but only
        plain standardization is implemented (``scipy.stats.skew`` is
        imported and unused).  Confirm whether the log-transform branch was
        intended.
        """
        scale = StandardScaler()
        data = scale.fit_transform(self.data)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        # Saved so the inference path can apply the identical scaling.
        # NOTE(review): assumes /tmp/ad already exists -- joblib.dump does
        # not create missing directories.
        joblib.dump(scale, '/tmp/ad/scale')

    def normalize(self):
        """Min-max scale every column into [0, 1]."""
        upper = self.data.max()
        lower = self.data.min()
        self.data = (self.data - lower) / (upper - lower)

    def process(self):
        """Run the full preprocessing pipeline, then re-attach identifiers.

        NOTE(review): the call sequence was lost in the corrupted source and
        is reconstructed here; ``normalize`` is left available but not
        invoked -- confirm the intended pipeline order.
        """
        self.numerical_data()
        self.drop_na()
        self.variation()
        self.correlation()
        self.transform()
        self.data.loc[:, 'UEID'] = self.id
        self.data.loc[:, 'MeasTimestampRF'] = self.time