X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=ad%2Fad_model%2Fprocessing.py;h=3a19dd13890f2030e8d1974a72c46064b0d9c779;hb=refs%2Fheads%2Fe-release;hp=300dc71b75bdeb16b72ea4fda9b010eb71ba6346;hpb=297dbd6245ec69571c8ad7091a18cbe9c7ba2488;p=ric-app%2Fad.git diff --git a/ad/ad_model/processing.py b/ad/ad_model/processing.py index 300dc71..3a19dd1 100644 --- a/ad/ad_model/processing.py +++ b/ad/ad_model/processing.py @@ -13,34 +13,46 @@ # See the License for the specific language governing permissions and # limitations under the License. # ================================================================================== + import pandas as pd import numpy as np -from scipy.stats import skew -import json import joblib -from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import Normalizer + + +class PREPROCESS(object): + r""" This PREPROCESS class takes raw data and apply prepocessing on to that. -class preprocess(object): - - def __init__(self,data): + Parameters + ---------- + data: pandas dataframe + input dataset to process in pandas dataframe + + Attributes + ---------- + data: DataFrame + DataFrame that has processed data + temp: list + list of attributes to drop + """ + + def __init__(self, data): """ Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp) - """ - self.id = data.UEID - self.time = data.MeasTimestampRF - self.data = data.drop(['UEID','MeasTimestampRF'], axis = 1) + """ + self.temp = None + self.data = data def variation(self): """ drop the constant parameters """ - self.data = self.data.loc[:,self.data.apply(pd.Series.nunique) != 1] - - + if len(self.data) > 1: + self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1] + def numerical_data(self): """ Filters only numeric data types """ numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] - self.data = self.data.select_dtypes(include=numerics) - + self.data = self.data.select_dtypes(include=numerics) + def drop_na(self): """ drop observations having nan values """ self.data = self.data.dropna(axis=0) @@ -48,32 +60,34 @@ class preprocess(object): def correlation(self): """ check and drop high correlation parameters """ corr = self.data.corr().abs() - corr = pd.DataFrame(np.tril(corr, k =-1), columns = self.data.columns) + corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns) drop = [column for column in corr.columns if any(corr[column] > 0.98)] - self.data = self.data.drop(drop,axis=1) - - #check skewness of all parameters and use log transform if half of parameters are enough skewd - #otherwise use standardization + self.data = self.data.drop(drop, axis=1) + + # check skewness of all parameters and use log transform if half of parameters are enough skewd + # otherwise use standardization def transform(self): - """ Use standard scalar and save the scale """ - scale = StandardScaler() + """ use normalizer transformation to bring all parameters in same scale """ + scale = Normalizer() # StandardScaler() data = scale.fit_transform(self.data) - self.data = pd.DataFrame(data, columns = self.data.columns) - joblib.dump(scale, '/tmp/ad/scale') + self.data = pd.DataFrame(data, columns=self.data.columns) + joblib.dump(scale, 'scale') - def normalize(self): - """ normalize the data """ - upper = self.data.max() - lower = self.data.min() - self.data = (self.data - lower)/(upper-lower) - def process(self): - """ Calls the modules for the data preprocessing like dropping columns, normalization etc., """ + """ + Calls the modules for the data preprocessing like dropping columns, normalization etc., + """ + temp = ['du-id', 'measTimeStampRf', 'ue-id', 'nrCellIdentity', 'targetTput', 'x', 'y'] + for col in self.data.columns: + if 'nb' in col: + temp.append(col) + + if set(temp).issubset(self.data.columns): + self.temp = self.data[temp] + self.data = self.data.drop(temp, axis=1) self.numerical_data() self.drop_na() self.variation() -# self.correlation() + self.correlation() self.transform() - self.data.loc[:,'UEID'] = self.id - self.data.loc[:,'MeasTimestampRF'] = self.time return self.data