[RICAPP-173] AD xApp Release version 0.0.2 (Dawn)

[ric-app/ad.git] / ad / ad_train.py
diff --git a/ad/ad_train.py b/ad/ad_train.py

index b6aa843..e6eb73d 100644 (file)
--- a/ad/ad_train.py
+++ b/ad/ad_train.py
@@ -13,125 +13,92 @@
  #  See the License for the specific language governing permissions and
  #  limitations under the License.
  # ==================================================================================
-import warnings
-import json
-import hdbscan
-import pandas as pd
-import numpy as np
-import joblib, os
-from ad_model.processing import preprocess
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-
-# Ranges for input features based on excellent, good, average, & poor category
-UEKeyList = ['MeasTimestampRF','UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', 'S_RSRP', 'S_RSRQ', 'S_SINR','UEID']
-#UEKeyList = ['S_RSRP', 'S_RSRQ', 'S_SINR','UEID','MeasTimestampRF']
  
-sigstr = {'S_RSRP': {'Excellent Signal' : [-80, 10000000000000000], 'Good Signal': [-90,-80], 'Average Signal':[-100,-90], 'Poor Signal':[-100000000000000000,-100]}, 'S_RSRQ' : {'Excellent Signal' : [-10, 10000000000000000], 'Good Signal': [-15,-10], 'Average Signal':[-20,-15], 'Poor Signal':[-100000000000000000,-20]}, 'S_SINR' : {'Excellent Signal' : [20, 10000000000000000], 'Good Signal': [13,20], 'Average Signal':[0,13], 'Poor Signal':[-100000000000000000,0]}}
+import joblib
+from ad_model.processing import PREPROCESS
+from sklearn.metrics import f1_score
+from sklearn.ensemble import IsolationForest
+from database import DATABASE, DUMMY
+import numpy as np
  
-PRB = {'UEPRBUsageDL': {'Excellent Signal' : [25, 10000000000000000], 'Good Signal': [20,25], 'Average Signal':[10,20], 'Poor Signal':[-100000000000000000,10]}, 'UEPRBUsageUL' : {'Excellent Signal' : [15, 10000000000000000], 'Good Signal': [10,15], 'Average Signal':[5,10], 'Poor Signal':[-100000000000000000,5]}}
  
-tput = {'UEPDCPBytesDL': {'Excellent Signal' : [300000, 10000000000000000], 'Good Signal': [200000,300000], 'Average Signal':[100000,200000], 'Poor Signal':[-100000000000000000,100000]}, 'UEPDCPBytesUL' : {'Excellent Signal' : [125000, 10000000000000000], 'Good Signal': [100000,125000], 'Average Signal':[10000,100000], 'Poor Signal':[-100000000000000000,10000]}}
+class modelling(object):
+    r""" The modelling class takes input as dataframe or array and train Isolation Forest model
  
+    Paramteres
+    .........
+    data: DataFrame or array
+        input dataset
+    cols: list
+        list of parameters in input dataset
  
-def category(df,ranges):
+    Attributes
+    ----------
+    actual:array
+        actual label for test data
+    X: DataFrame or array
+        transformed values of input data
      """
-     Based on ranges, each sample is return with category(excellent, good, average, & poor category).
-    """
-    data = df.copy()
-    for block in ranges:
-        df = data[list(block.keys())].copy()
-        for key, value in block.items():
-            temp = data[list(block.keys())].copy()
-            for cat, bounds in value.items():
-                ind = temp[(temp[key] <= bounds[1]) & (temp[key] > bounds[0])].index
-                df.loc[ind, key] = cat
-        data[df.columns] = df
-    category =  data[['UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL',
-       'S_RSRP', 'S_RSRQ', 'S_SINR']].mode(axis = 1)[0]
-    return category
+    def __init__(self, data):
+        self.data = data
+        self.cols = data.columns
  
+    def read_test(self, db):
+        """ Read test dataset for model validation"""
  
-class modelling(object):
-    def __init__(self,data):
-        self.time = data.MeasTimestampRF
-        self.id = data.UEID
-        self.data = data.drop(['UEID', 'MeasTimestampRF'], axis = 1)
-        
-    def dbscan(self):
-        """
-         Train hdbscan for the input dataframe
-         save the hdbscan model
-        """        
-        df = self.data.copy()
-        hdb = hdbscan.HDBSCAN(min_cluster_size=16000, min_samples = 5, prediction_data = True).fit(df)
-        joblib.dump(hdb, '/tmp/ad/hdbscan')
-        self.data['Category'] = hdb.labels_
+        db.read_data('valid')
+        test = db.data
+        self.actual = test['Anomaly']
+        X = test[self.cols]
+        sc = joblib.load('scale')
+        self.X = sc.transform(X)
  
-    def RandomForest(self, y):
-        """
-         Transform categorical label into numeric(Save the LabelEncoder).
-         Create Train and Test split for Random Forest Classifier and Save the model
+    def isoforest(self, outliers_fraction=0.05, random_state=42, push_model=False):
+        """ Train isolation forest
+
+        Parameters
+        ----------
+        outliers_fraction: float between 0.01 to 0.5 (default=0.05)
+            percentage of anomalous available in input data
+        push_model: boolean (default=False)
+            return f_1 score if True else push model into repo
+        random_state: int (default=42)
          """
-        df = self.data.copy()
-        le = LabelEncoder()
-        y = le.fit_transform(y)
-        joblib.dump(le, '/tmp/ad/LabelEncoder')
-        X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, stratify=y, random_state=42)
-        rf = RandomForestClassifier(max_depth=9, random_state=0)
-        rf.fit(X_train, y_train)
-        
-        joblib.dump(rf, '/tmp/ad/RF')
-        print('--------------------------- Training Score------------------------------------')
-        score(X_test, y_test, rf)
-        print('--------------------------- Test Score------------------------------------')
-        test = pd.read_csv('/tmp/ad/ue_test.csv')
-        test = test[UEKeyList]
-        y = category(test, [sigstr, PRB, tput])
-        y =le.transform(y)
-        ps = preprocess(test)
-        ps.process()
-        test = ps.data.drop(['UEID', 'MeasTimestampRF'], axis = 1)
-        score(test, y, rf)
+        iso = IsolationForest(contamination=outliers_fraction, random_state=random_state)
+        md = iso.fit(self.data, None)
+        if push_model:
+            joblib.dump(self.cols, 'params')
+            joblib.dump(md, 'model')
+        return test(self, md)
  
-def score(X, y, model):
-    y_pred = model.predict(X)
-    print('Accuracy : {}'.format(accuracy_score(y, y_pred)))
  
-    print('confusion matrix : {}'.format(confusion_matrix(y, y_pred)))
-    print('f1-score : {}'.format(f1_score(y, y_pred, average = 'macro')))
+def train(thread=False):
+    """
+     Main function to perform training on input data
+    """
+    if thread:
+        db = DUMMY()
+    else:
+        db = DATABASE('UEData')
+    db.read_data('train')
+    ps = PREPROCESS(db.data)
+    ps.process()
+    df = ps.data
  
+    mod = modelling(df)
+    mod.read_test(db)
  
-def train():
-    """
-     Main function to perform training on input files
-     Read all the csv file in the current path and create trained model
-    """    
-    print('Training Starts : ')
-    path = '/tmp/ad/ue_data/'
-    df = pd.DataFrame()
-    # Read all the csv files and store the combined data into df
-    for file in os.listdir(path):
-        df = df.append(pd.read_csv(path + file))
-    
-    df = df[UEKeyList]
-    df.index = range(len(df))
-    y =  category(df, [sigstr, PRB, tput])
-    seg = {}
+    scores = []
+    for of in np.arange(0.01, 0.4, 0.01):
+        scores.append(mod.isoforest(outliers_fraction=of))
+    opt_f1 = scores.index(max(scores)) + 1
+    mod.isoforest(outliers_fraction=opt_f1*0.01, push_model=True)
+    print("Optimum value of contamination : {}".format(opt_f1*0.01))
+    print('Training Ends : ')
  
-    #Save the category of each UEID and save it as json file
-    for ue in df.UEID.unique():
-        seg[str(ue)] = list(set(y[df[df['UEID'] == ue].index]))
-    
-    with open('ue_seg.json', 'w') as outfile:
-        json.dump(seg, outfile)
  
-    # Do a preprocessing, processing and save the model
-    ps = preprocess(df)
-    ps.process()
-    df = ps.data
-    db = modelling(df)
-#    db.dbscan()
-    db.RandomForest(y)
+def test(self, model):
+    pred = model.predict(self.X)
+    if -1 in pred:
+        pred = [1 if p == -1 else 0 for p in pred]
+    return f1_score(self.actual, pred)