# ==================================================================================
#  Copyright (c) 2020 HCL Technologies Limited.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ==================================================================================
"""Train an Isolation Forest anomaly detector and persist the best model."""

import joblib
from ad_model.processing import PREPROCESS
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
from database import DATABASE, DUMMY
import numpy as np


class modelling(object):
    r"""Train an Isolation Forest on the given data and score it on validation data.

    Parameters
    ----------
    data: DataFrame
        input dataset used for training (its ``columns`` are recorded in ``cols``)

    Attributes
    ----------
    data: DataFrame
        the training dataset passed to the constructor
    cols: Index
        column labels of the training dataset
    actual: array
        actual labels of the validation data (set by ``read_test``)
    X: array
        scaled feature values of the validation data (set by ``read_test``)
    """

    def __init__(self, data):
        self.data = data
        self.cols = data.columns

    def read_test(self, db):
        """Read the validation dataset and prepare it for model scoring.

        Parameters
        ----------
        db: object
            data source exposing ``read_data(name)`` and a ``data`` attribute

        Notes
        -----
        Loads a fitted scaler from the file ``'scale'`` with joblib
        (presumably written by the preprocessing step -- confirm) and stores
        the transformed features in ``self.X`` and the ``'Anomaly'`` column in
        ``self.actual``.
        """
        db.read_data('valid')
        valid = db.data
        self.actual = valid['Anomaly']
        features = valid[self.cols]
        scaler = joblib.load('scale')
        self.X = scaler.transform(features)

    def isoforest(self, outliers_fraction=0.05, random_state=42, push_model=False):
        """Train an isolation forest and return its F1 score on the validation data.

        ``read_test`` must have been called first, because scoring reads
        ``self.X`` and ``self.actual``.

        Parameters
        ----------
        outliers_fraction: float between 0.01 to 0.5 (default=0.05)
            expected proportion of anomalous samples in the input data
        random_state: int (default=42)
            seed passed to the isolation forest
        push_model: boolean (default=False)
            if True, additionally save the column list to ``'params'`` and the
            fitted model to ``'model'`` with joblib

        Returns
        -------
        float
            F1 score of the fitted model on the validation data
        """
        iso = IsolationForest(contamination=outliers_fraction, random_state=random_state)
        md = iso.fit(self.data, None)
        if push_model:
            joblib.dump(self.cols, 'params')
            joblib.dump(md, 'model')
        return test(self, md)


def train(thread=False):
    """
    Main function to perform training on input data

    Sweeps the contamination value from 0.01 to 0.39 in steps of 0.01, keeps
    the value with the highest validation F1 score (the first one on ties),
    then retrains with that value and saves the model.

    Parameters
    ----------
    thread: boolean (default=False)
        if True read from the ``DUMMY`` data source, otherwise from
        ``DATABASE('UEData')``
    """
    if thread:
        db = DUMMY()
    else:
        db = DATABASE('UEData')
    db.read_data('train')
    ps = PREPROCESS(db.data)
    ps.process()
    df = ps.data

    mod = modelling(df)
    mod.read_test(db)

    # Build the grid from integer percentages: a float-step arange is subject
    # to rounding in both its length and its values.
    fractions = [percent / 100 for percent in range(1, 40)]
    scores = [mod.isoforest(outliers_fraction=fraction) for fraction in fractions]
    best_fraction = fractions[scores.index(max(scores))]
    mod.isoforest(outliers_fraction=best_fraction, push_model=True)
    print("Optimum value of contamination : {}".format(best_fraction))
    print('Training Ends : ')


def test(self, model):
    """Return the F1 score of ``model`` on the validation data held by ``self``.

    Parameters
    ----------
    self: modelling
        instance on which ``read_test`` has populated ``X`` and ``actual``
    model: fitted estimator
        object whose ``predict`` marks outliers with -1

    Returns
    -------
    float
        F1 score of the predictions against ``self.actual``
    """
    pred = np.asarray(model.predict(self.X))
    # Always remap (-1 -> 1 anomaly, anything else -> 0). Skipping the remap
    # when no outlier is predicted would score the all-inlier output (+1) as
    # "every sample is an anomaly".
    labels = np.where(pred == -1, 1, 0)
    return f1_score(self.actual, labels)