X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=ad%2Fad_model%2Fprocessing.py;h=3a19dd13890f2030e8d1974a72c46064b0d9c779;hb=refs%2Fheads%2Fe-release;hp=300dc71b75bdeb16b72ea4fda9b010eb71ba6346;hpb=297dbd6245ec69571c8ad7091a18cbe9c7ba2488;p=ric-app%2Fad.git

diff --git a/ad/ad_model/processing.py b/ad/ad_model/processing.py
index 300dc71..3a19dd1 100644
--- a/ad/ad_model/processing.py
+++ b/ad/ad_model/processing.py
@@ -13,34 +13,46 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 # ==================================================================================
+
 import pandas as pd
 import numpy as np
-from scipy.stats import skew
-import json
 import joblib
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import Normalizer
+
+
+class PREPROCESS(object):
+    r""" This PREPROCESS class takes raw data and applies preprocessing to it.
 
-class preprocess(object):
-    
-    def __init__(self,data):
+    Parameters
+    ----------
+    data: pandas dataframe
+        input dataset to process in pandas dataframe
+
+    Attributes
+    ----------
+    data: DataFrame
+        DataFrame that has processed data
+    temp: list
+        list of attributes to drop
+    """
+
+    def __init__(self, data):
         """
            Columns that are not useful for the prediction will be dropped(UEID, Category, & Timestamp)
-        """        
-        self.id = data.UEID
-        self.time = data.MeasTimestampRF
-        self.data = data.drop(['UEID','MeasTimestampRF'], axis = 1)
+        """
+        self.temp = None
+        self.data = data
 
     def variation(self):
         """ drop the constant parameters """
-        self.data =  self.data.loc[:,self.data.apply(pd.Series.nunique) != 1]
-    
-    
+        if len(self.data) > 1:
+            self.data = self.data.loc[:, self.data.apply(pd.Series.nunique) != 1]
+
     def numerical_data(self):
         """  Filters only numeric data types """
         numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
-        self.data =  self.data.select_dtypes(include=numerics)
-    
+        self.data = self.data.select_dtypes(include=numerics)
+
     def drop_na(self):
         """ drop observations having nan values """
         self.data = self.data.dropna(axis=0)
@@ -48,32 +60,34 @@ class preprocess(object):
     def correlation(self):
         """  check and drop high correlation parameters  """
         corr = self.data.corr().abs()
-        corr = pd.DataFrame(np.tril(corr, k =-1), columns = self.data.columns)
+        corr = pd.DataFrame(np.tril(corr, k=-1), columns=self.data.columns)
         drop = [column for column in corr.columns if any(corr[column] > 0.98)]
-        self.data = self.data.drop(drop,axis=1)
-    
-    #check skewness of all parameters and use log transform if half of parameters are enough skewd
-    #otherwise use standardization
+        self.data = self.data.drop(drop, axis=1)
+
+    # check skewness of all parameters and use log transform if half of the parameters are sufficiently skewed
+    # otherwise use standardization
     def transform(self):
-        """ Use standard scalar and save the scale """
-        scale = StandardScaler()
+        """ Use the Normalizer transformation to bring all parameters onto the same scale """
+        scale = Normalizer()  # StandardScaler()
         data = scale.fit_transform(self.data)
-        self.data = pd.DataFrame(data, columns = self.data.columns)
-        joblib.dump(scale, '/tmp/ad/scale')
+        self.data = pd.DataFrame(data, columns=self.data.columns)
+        joblib.dump(scale, 'scale')
 
-    def normalize(self):
-        """ normalize the data  """
-        upper = self.data.max()
-        lower = self.data.min()
-        self.data = (self.data - lower)/(upper-lower)  
-    
     def process(self):
-        """  Calls the modules for the data preprocessing like dropping columns, normalization etc.,  """
+        """
+          Calls the data preprocessing methods (dropping columns, normalization, etc.).
+        """
+        temp = ['du-id', 'measTimeStampRf', 'ue-id', 'nrCellIdentity', 'targetTput', 'x', 'y']
+        for col in self.data.columns:
+            if 'nb' in col:
+                temp.append(col)
+
+        if set(temp).issubset(self.data.columns):
+            self.temp = self.data[temp]
+            self.data = self.data.drop(temp, axis=1)
         self.numerical_data()
         self.drop_na()
         self.variation()
-#        self.correlation()
+        self.correlation()
         self.transform()
-        self.data.loc[:,'UEID'] = self.id      
-        self.data.loc[:,'MeasTimestampRF'] = self.time
         return self.data