ad/ad_train.py

   1 # ==================================================================================
   2 #  Copyright (c) 2020 HCL Technologies Limited.
   3 #
   4 #  Licensed under the Apache License, Version 2.0 (the "License");
   5 #  you may not use this file except in compliance with the License.
   6 #  You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 #  Unless required by applicable law or agreed to in writing, software
  11 #  distributed under the License is distributed on an "AS IS" BASIS,
  12 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 #  See the License for the specific language governing permissions and
  14 #  limitations under the License.
  15 # ==================================================================================
  16 import warnings
  17 import json
  18 import hdbscan
  19 import pandas as pd
  20 import numpy as np
  21 import joblib, os
  22 from ad_model.processing import preprocess
  23 from sklearn.ensemble import RandomForestClassifier
  24 from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
  25 from sklearn.preprocessing import LabelEncoder
  26 from sklearn.model_selection import train_test_split
  27
  28 # Ranges for input features based on excellent, good, average, & poor category
  29 UEKeyList = ['MeasTimestampRF','UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', 'S_RSRP', 'S_RSRQ', 'S_SINR','UEID']
  30 #UEKeyList = ['S_RSRP', 'S_RSRQ', 'S_SINR','UEID','MeasTimestampRF']
  31
  32 sigstr = {'S_RSRP': {'Excellent Signal' : [-80, 10000000000000000], 'Good Signal': [-90,-80], 'Average Signal':[-100,-90], 'Poor Signal':[-100000000000000000,-100]}, 'S_RSRQ' : {'Excellent Signal' : [-10, 10000000000000000], 'Good Signal': [-15,-10], 'Average Signal':[-20,-15], 'Poor Signal':[-100000000000000000,-20]}, 'S_SINR' : {'Excellent Signal' : [20, 10000000000000000], 'Good Signal': [13,20], 'Average Signal':[0,13], 'Poor Signal':[-100000000000000000,0]}}
  33
  34 PRB = {'UEPRBUsageDL': {'Excellent Signal' : [25, 10000000000000000], 'Good Signal': [20,25], 'Average Signal':[10,20], 'Poor Signal':[-100000000000000000,10]}, 'UEPRBUsageUL' : {'Excellent Signal' : [15, 10000000000000000], 'Good Signal': [10,15], 'Average Signal':[5,10], 'Poor Signal':[-100000000000000000,5]}}
  35
  36 tput = {'UEPDCPBytesDL': {'Excellent Signal' : [300000, 10000000000000000], 'Good Signal': [200000,300000], 'Average Signal':[100000,200000], 'Poor Signal':[-100000000000000000,100000]}, 'UEPDCPBytesUL' : {'Excellent Signal' : [125000, 10000000000000000], 'Good Signal': [100000,125000], 'Average Signal':[10000,100000], 'Poor Signal':[-100000000000000000,10000]}}
  37
  38
  39 def category(df,ranges):
  40     """
  41      Based on ranges, each sample is return with category(excellent, good, average, & poor category).
  42     """
  43     data = df.copy()
  44     for block in ranges:
  45         df = data[list(block.keys())].copy()
  46         for key, value in block.items():
  47             temp = data[list(block.keys())].copy()
  48             for cat, bounds in value.items():
  49                 ind = temp[(temp[key] <= bounds[1]) & (temp[key] > bounds[0])].index
  50                 df.loc[ind, key] = cat
  51         data[df.columns] = df
  52     category =  data[['UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL',
  53        'S_RSRP', 'S_RSRQ', 'S_SINR']].mode(axis = 1)[0]
  54     return category
  55
  56
  57 class modelling(object):
  58     def __init__(self,data):
  59         self.time = data.MeasTimestampRF
  60         self.id = data.UEID
  61         self.data = data.drop(['UEID', 'MeasTimestampRF'], axis = 1)
  62
  63     def dbscan(self):
  64         """
  65          Train hdbscan for the input dataframe
  66          save the hdbscan model
  67         """
  68         df = self.data.copy()
  69         hdb = hdbscan.HDBSCAN(min_cluster_size=16000, min_samples = 5, prediction_data = True).fit(df)
  70         joblib.dump(hdb, '/tmp/ad/hdbscan')
  71         self.data['Category'] = hdb.labels_
  72
  73     def RandomForest(self, y):
  74         """
  75          Transform categorical label into numeric(Save the LabelEncoder).
  76          Create Train and Test split for Random Forest Classifier and Save the model
  77         """
  78         df = self.data.copy()
  79         le = LabelEncoder()
  80         y = le.fit_transform(y)
  81         joblib.dump(le, '/tmp/ad/LabelEncoder')
  82         X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, stratify=y, random_state=42)
  83         rf = RandomForestClassifier(max_depth=9, random_state=0)
  84         rf.fit(X_train, y_train)
  85
  86         joblib.dump(rf, '/tmp/ad/RF')
  87         print('--------------------------- Training Score------------------------------------')
  88         score(X_test, y_test, rf)
  89         print('--------------------------- Test Score------------------------------------')
  90         test = pd.read_csv('/tmp/ad/ue_test.csv')
  91         test = test[UEKeyList]
  92         y = category(test, [sigstr, PRB, tput])
  93         y =le.transform(y)
  94         ps = preprocess(test)
  95         ps.process()
  96         test = ps.data.drop(['UEID', 'MeasTimestampRF'], axis = 1)
  97         score(test, y, rf)
  98
  99 def score(X, y, model):
 100     y_pred = model.predict(X)
 101     print('Accuracy : {}'.format(accuracy_score(y, y_pred)))
 102
 103     print('confusion matrix : {}'.format(confusion_matrix(y, y_pred)))
 104     print('f1-score : {}'.format(f1_score(y, y_pred, average = 'macro')))
 105
 106
 107 def train():
 108     """
 109      Main function to perform training on input files
 110      Read all the csv file in the current path and create trained model
 111     """
 112     print('Training Starts : ')
 113     path = '/tmp/ad/ue_data/'
 114     df = pd.DataFrame()
 115     # Read all the csv files and store the combined data into df
 116     for file in os.listdir(path):
 117         df = df.append(pd.read_csv(path + file))
 118
 119     df = df[UEKeyList]
 120     df.index = range(len(df))
 121     y =  category(df, [sigstr, PRB, tput])
 122     seg = {}
 123
 124     #Save the category of each UEID and save it as json file
 125     for ue in df.UEID.unique():
 126         seg[str(ue)] = list(set(y[df[df['UEID'] == ue].index]))
 127
 128     with open('ue_seg.json', 'w') as outfile:
 129         json.dump(seg, outfile)
 130
 131     # Do a preprocessing, processing and save the model
 132     ps = preprocess(df)
 133     ps.process()
 134     df = ps.data
 135     db = modelling(df)
 136 #    db.dbscan()
 137     db.RandomForest(y)