import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler


class preprocess(object):
    def __init__(self, data):
        """
        Set aside the columns that are not useful for prediction (UEID and
        MeasTimestampRF) so they can be re-attached after preprocessing,
        and drop them from the working data.
        """
        self.id = data.UEID
        self.time = data.MeasTimestampRF
        self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)

    def variation(self):
        """ Drop the constant parameters. """
        self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]

    def numerical_data(self):
        """ Keep only the numeric columns. """
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        self.data = self.data.select_dtypes(include=numerics)

    def drop_na(self):
        """ Drop observations containing NaN values. """
        self.data = self.data.dropna(axis=0)

    def correlation(self):
        """ Find and drop highly correlated parameters (|r| > 0.98). """
        corr = self.data.corr().abs()
        # Keep only the lower triangle so each correlated pair is seen once.
        corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
        drop = [column for column in corr.columns if any(corr[column] > 0.98)]
        self.data = self.data.drop(drop, axis=1)

    # TODO: check the skewness of all parameters and use a log transform if
    # enough of them are heavily skewed; otherwise use standardization.
    def transform(self):
        """ Standardize the data and persist the fitted scaler. """
        scale = StandardScaler()
        data = scale.fit_transform(self.data)
        # Preserve the original index so UEID and MeasTimestampRF re-align
        # correctly in process() after drop_na() has removed rows.
        self.data = pd.DataFrame(data, columns=self.data.columns,
                                 index=self.data.index)
        joblib.dump(scale, 'ad/scale')

    def process(self):
        """
        Run the preprocessing steps in order: numeric filtering, NaN removal,
        constant-column removal, and scaling; then re-attach the identifier
        and timestamp columns.
        """
        self.numerical_data()
        self.drop_na()
        self.variation()
        # self.correlation()
        self.transform()
        self.data.loc[:, 'UEID'] = self.id
        self.data.loc[:, 'MeasTimestampRF'] = self.time
        return self.data
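

# A minimal usage sketch. Everything below is illustrative: the measurement
# column names (rsrp, cell, const) and the sample values are hypothetical
# and not part of the original module; only UEID and MeasTimestampRF are
# assumed by preprocess itself.
if __name__ == '__main__':
    import os

    # transform() persists the scaler to 'ad/scale', so make sure the
    # directory exists before running the pipeline.
    os.makedirs('ad', exist_ok=True)

    raw = pd.DataFrame({
        'UEID': [1, 2, 3, 4],
        'MeasTimestampRF': pd.date_range('2021-01-01', periods=4, freq='min'),
        'rsrp': [-95.0, -101.5, np.nan, -88.2],  # numeric; NaN row is dropped
        'cell': ['A', 'A', 'B', 'B'],            # non-numeric; filtered out
        'const': [1, 1, 1, 1],                   # constant; removed by variation()
    })

    # Expect three rows (the NaN observation removed), a standardized 'rsrp'
    # column, and UEID/MeasTimestampRF re-attached by index alignment.
    cleaned = preprocess(raw).process()
    print(cleaned)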