4 from sklearn.preprocessing import StandardScaler
class preprocess(object):
    """
    Preprocessing pipeline for measurement data: sets aside the UEID and
    MeasTimestampRF columns, then cleans the remaining feature columns
    (numeric-only selection, NaN-row removal, constant and highly-correlated
    column removal, standard scaling) before re-attaching the identifiers.
    """
9 def __init__(self, data):
11 Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
14 self.time = data.MeasTimestampRF
15 self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)
        """ drop the constant parameters """
        # A column with nunique == 1 is constant and carries no information
        # for the model, so only columns with more than one distinct value
        # are kept. (NOTE(review): the enclosing `def` line is outside this
        # view — confirm the method name against the full file.)
        self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
21 def numerical_data(self):
22 """ Filters only numeric data types """
23 numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
24 self.data = self.data.select_dtypes(include=numerics)
        """ drop observations having nan values """
        # axis=0 removes whole rows (observations) containing at least one NaN,
        # never columns. (NOTE(review): the enclosing `def` line is outside
        # this view — confirm the method name against the full file.)
        self.data = self.data.dropna(axis=0)
30 def correlation(self):
31 """ check and drop high correlation parameters """
32 corr = self.data.corr().abs()
33 corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
34 drop = [column for column in corr.columns if any(corr[column] > 0.98)]
35 self.data = self.data.drop(drop, axis=1)
    # NOTE(review): the comment below describes a planned skew-based choice
    # between log transform and standardization, but the visible code always
    # standardizes — confirm against the full file before relying on it.
    # check skewness of all parameters and use log transform if half of parameters are enough skewd
    # otherwise use standardization
        """ use log transform for skewed data """
        scale = StandardScaler()
        # column-wise z-score: (x - mean) / std; fit on the current features
        data = scale.fit_transform(self.data)
        # fit_transform returns a bare ndarray — rebuild the frame to restore
        # the column labels (the original row index is replaced by a default one)
        self.data = pd.DataFrame(data, columns=self.data.columns)
        # persist the fitted scaler so the same scaling can be re-applied later;
        # presumably 'ad/' is a package-relative artifact directory — verify,
        # and confirm `joblib` is imported at the top of the full file.
        joblib.dump(scale, 'ad/scale')
        Calls the modules for the data preprocessing like dropping columns, normalization etc.,
        # Re-attach the identifier columns that were set aside before cleaning.
        # NOTE(review): `self.time` is assigned in __init__; `self.id` is not
        # visible in this chunk and presumably comes from __init__ as well —
        # confirm against the full file.
        self.data.loc[:, 'UEID'] = self.id
        self.data.loc[:, 'MeasTimestampRF'] = self.time