"""
Columns that are not useful for prediction will be dropped (UEID, Category, and Timestamp)
"""
- self.temp = None
self.data = data
+ self.convert_gb_to_mb()
def variation(self):
""" drop the constant parameters """
if len(self.data) > 1:
self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
+ def convert_gb_to_mb(self):
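+ """ convert the DRB.UEThpDl values from Gb to Mb (multiply by 1024) """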
+ self.data['DRB.UEThpDl'] = self.data['DRB.UEThpDl'] * 1024  # vectorized; avoids chained-assignment warning
+
def numerical_data(self):
""" Filters only numeric data types """
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
self.data = self.data.select_dtypes(include=numerics)
# check the skewness of all parameters and use a log transform if at least half of them are sufficiently skewed;
# otherwise use standardization
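+ # A possible sketch of that check (hypothetical, not implemented here),
+ # assuming |skew| > 1 marks a parameter as skewed:
+ #   if (self.data.skew().abs() > 1).mean() >= 0.5:
+ #       self.data = np.log1p(self.data)  # log transform; needs non-negative data
+ #   else:
+ #       scaled = StandardScaler().fit_transform(self.data)
+ #       self.data = pd.DataFrame(scaled, columns=self.data.columns)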
- def transform(self):
+ def fit_transform(self):
""" use normalizer transformation to bring all parameters in same scale """
- scale = Normalizer() # StandardScaler()
- data = scale.fit_transform(self.data)
- self.data = pd.DataFrame(data, columns=self.data.columns)
- joblib.dump(scale, 'scale')
+ scale = Normalizer().fit(self.data)
+ joblib.dump(scale, 'src/scale')
+
+ def transform(self):
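+ """ load the persisted Normalizer and transform the data """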
+ scale = joblib.load('src/scale')
+ self.data = pd.DataFrame(scale.transform(self.data), columns=self.data.columns)
+
+ def save_cols(self):
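+ """ save the processed column names (presumably reused when preparing data for prediction) """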
+ joblib.dump(self.data.columns, 'src/num_params')
def process(self):
"""
Calls the modules for data preprocessing, such as dropping columns, normalization, etc.
"""
- temp = ['du-id', 'measTimeStampRf', 'ue-id', 'nrCellIdentity', 'targetTput', 'x', 'y']
+ temp = []
for col in self.data.columns:
- if 'nb' in col:
+ if 'nb' in col or 'Geo' in col or 'anomal' in col or 'target' in col:
temp.append(col)
-
- if set(temp).issubset(self.data.columns):
- self.temp = self.data[temp]
- self.data = self.data.drop(temp, axis=1)
+ self.data = self.data.drop(temp, axis=1)
self.numerical_data()
self.drop_na()
self.variation()
self.correlation()
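+ # fit and persist the scaler, then apply it and record the final column set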
+ self.fit_transform()
self.transform()
- return self.data
+ self.save_cols()