ad/ad_train.py

   1 import json
   2 import hdbscan
   3 import pandas as pd
   4 import joblib
   5 import os
   6 from ad_model.processing import preprocess
   7 from sklearn.ensemble import RandomForestClassifier
   8 from sklearn.preprocessing import LabelEncoder
   9 from sklearn.model_selection import train_test_split
  10
  11 # Ranges for input features based on excellent, good, average, & poor category
  12 UEKeyList = ['MeasTimestampRF', 'UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', 'S_RSRP', 'S_RSRQ', 'S_SINR', 'UEID']
  13
  14 sigstr = {'S_RSRP': {'Excellent Signal': [-80, 10000000000000000], 'Good Signal': [-90, -80], 'Average Signal': [-100, -90], 'Poor Signal': [-100000000000000000, -100]}, 'S_RSRQ': {'Excellent Signal': [-10, 10000000000000000], 'Good Signal': [-15, -10], 'Average Signal': [-20, -15], 'Poor Signal': [-100000000000000000, -20]}, 'S_SINR': {'Excellent Signal': [20, 10000000000000000], 'Good Signal': [13, 20], 'Average Signal': [0, 13], 'Poor Signal': [-100000000000000000, 0]}}
  15
  16 PRB = {'UEPRBUsageDL': {'Excellent Signal': [25, 10000000000000000], 'Good Signal': [20, 25], 'Average Signal': [10, 20], 'Poor Signal': [-100000000000000000, 10]}, 'UEPRBUsageUL': {'Excellent Signal': [15, 10000000000000000], 'Good Signal': [10, 15], 'Average Signal': [5, 10], 'Poor Signal': [-100000000000000000, 5]}}
  17
  18 tput = {'UEPDCPBytesDL': {'Excellent Signal': [300000, 10000000000000000], 'Good Signal': [200000, 300000], 'Average Signal': [100000, 200000], 'Poor Signal': [-100000000000000000, 100000]}, 'UEPDCPBytesUL': {'Excellent Signal': [125000, 10000000000000000], 'Good Signal': [100000, 125000], 'Average Signal': [10000, 100000], 'Poor Signal': [-100000000000000000, 10000]}}
  19
  20
  21 def category(df, ranges):
  22     # Based on ranges, each sample is return with category(excellent, good, average, & poor category).
  23     data = df.copy()
  24     for block in ranges:
  25         df = data[list(block.keys())].copy()
  26         for key, value in block.items():
  27             temp = data[list(block.keys())].copy()
  28             for cat, bounds in value.items():
  29                 ind = temp[(temp[key] <= bounds[1]) & (temp[key] > bounds[0])].index
  30                 df.loc[ind, key] = cat
  31         data[df.columns] = df
  32     # Maximum category value is considered as final category value.
  33     category = data[['UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', 'S_RSRP', 'S_RSRQ', 'S_SINR']].mode(axis=1)[0]
  34     return category
  35
  36
  37 class modelling(object):
  38     def __init__(self, data):
  39         self.time = data.MeasTimestampRF
  40         self.id = data.UEID
  41         self.data = data.drop(['UEID', 'MeasTimestampRF'], axis=1)
  42
  43     def dbscan(self):
  44         """
  45          Train hdbscan for the input dataframe
  46          save the hdbscan model
  47         """
  48
  49         df = self.data.copy()
  50         hdb = hdbscan.HDBSCAN(min_cluster_size=16000, min_samples=5, prediction_data=True).fit(df)
  51         joblib.dump(hdb, 'ad/hdbscan')
  52         self.data['Category'] = hdb.labels_  # Stores the labels into category field
  53
  54     def RandomForest(self, y):
  55         """
  56          Transform categorical label into numeric(Save the LabelEncoder).
  57          Create Train and Test split for Random Forest Classifier and Save the model
  58         """
  59         df = self.data.copy()
  60         le = LabelEncoder()
  61         y = le.fit_transform(y)
  62         joblib.dump(le, 'ad/LabelEncoder')
  63         X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, stratify=y, random_state=42)
  64         rf = RandomForestClassifier(max_depth=9, random_state=0)
  65         rf.fit(X_train, y_train)  # Fit the RFC model
  66         print("X_train cols:", X_train.columns)
  67         joblib.dump(rf, 'ad/RF')  # Save the RF model
  68
  69
  70 def train():
  71     """
  72      Main function to perform training on input files
  73      Read all the csv file in the current path and create trained model
  74     """
  75     print('Training Starts : ')
  76     path = 'ad/ue_data/'
  77     df = pd.DataFrame()
  78     # Read all the csv files and store the combined data into df
  79     for file in os.listdir(path):
  80         df = df.append(pd.read_csv(path + file))
  81     df = df[UEKeyList]
  82     df.index = range(len(df))
  83     y = category(df, [sigstr, PRB, tput])
  84     seg = {}
  85     # Save the category of each UEID and save it as json file
  86     for ue in df.UEID.unique():
  87         seg[str(ue)] = list(set(y[df[df['UEID'] == ue].index]))
  88
  89     with open('ue_seg.json', 'w') as outfile:
  90         json.dump(seg, outfile)
  91
  92     # Do a preprocessing, processing and save the model
  93     ps = preprocess(df)
  94     ps.process()
  95     df = ps.data
  96     db = modelling(df)
  97     # db.dbscan()
  98     db.RandomForest(y)