diff --git a/ad/ad_train.py b/ad/ad_train.py
index b6aa843..e6eb73d 100644
--- a/ad/ad_train.py
+++ b/ad/ad_train.py
@@ -13,125 +13,92 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==================================================================================
-import warnings
-import json
-import hdbscan
-import pandas as pd
-import numpy as np
-import joblib, os
-from ad_model.processing import preprocess
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-
-# Ranges for input features based on excellent, good, average, & poor category
-UEKeyList = ['MeasTimestampRF','UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', 'S_RSRP', 'S_RSRQ', 'S_SINR','UEID']
-#UEKeyList = ['S_RSRP', 'S_RSRQ', 'S_SINR','UEID','MeasTimestampRF']
-sigstr = {'S_RSRP': {'Excellent Signal' : [-80, 10000000000000000], 'Good Signal': [-90,-80], 'Average Signal':[-100,-90], 'Poor Signal':[-100000000000000000,-100]}, 'S_RSRQ' : {'Excellent Signal' : [-10, 10000000000000000], 'Good Signal': [-15,-10], 'Average Signal':[-20,-15], 'Poor Signal':[-100000000000000000,-20]}, 'S_SINR' : {'Excellent Signal' : [20, 10000000000000000], 'Good Signal': [13,20], 'Average Signal':[0,13], 'Poor Signal':[-100000000000000000,0]}}
+import joblib
+from ad_model.processing import PREPROCESS
+from sklearn.metrics import f1_score
+from sklearn.ensemble import IsolationForest
+from database import DATABASE, DUMMY
+import numpy as np

-PRB = {'UEPRBUsageDL': {'Excellent Signal' : [25, 10000000000000000], 'Good Signal': [20,25], 'Average Signal':[10,20], 'Poor Signal':[-100000000000000000,10]}, 'UEPRBUsageUL' : {'Excellent Signal' : [15, 10000000000000000], 'Good Signal': [10,15], 'Average Signal':[5,10], 'Poor Signal':[-100000000000000000,5]}}
-tput = {'UEPDCPBytesDL': {'Excellent Signal' : [300000, 10000000000000000], 'Good Signal': [200000,300000], 'Average Signal':[100000,200000], 'Poor Signal':[-100000000000000000,100000]}, 'UEPDCPBytesUL' : {'Excellent Signal' : [125000, 10000000000000000], 'Good Signal': [100000,125000], 'Average Signal':[10000,100000], 'Poor Signal':[-100000000000000000,10000]}}
+class modelling(object):
+    r"""The modelling class takes a dataframe or array as input and trains an Isolation Forest model.
+    Parameters
+    ----------
+    data: DataFrame or array
+        input dataset
+    cols: list
+        list of feature columns in the input dataset

-def category(df,ranges):
+    Attributes
+    ----------
+    actual: array
+        actual labels for the test data
+    X: DataFrame or array
+        transformed values of input data
     """
-    Based on ranges, each sample is return with category(excellent, good, average, & poor category). 
- """ - data = df.copy() - for block in ranges: - df = data[list(block.keys())].copy() - for key, value in block.items(): - temp = data[list(block.keys())].copy() - for cat, bounds in value.items(): - ind = temp[(temp[key] <= bounds[1]) & (temp[key] > bounds[0])].index - df.loc[ind, key] = cat - data[df.columns] = df - category = data[['UEPDCPBytesDL', 'UEPDCPBytesUL', 'UEPRBUsageDL', 'UEPRBUsageUL', - 'S_RSRP', 'S_RSRQ', 'S_SINR']].mode(axis = 1)[0] - return category + def __init__(self, data): + self.data = data + self.cols = data.columns + def read_test(self, db): + """ Read test dataset for model validation""" -class modelling(object): - def __init__(self,data): - self.time = data.MeasTimestampRF - self.id = data.UEID - self.data = data.drop(['UEID', 'MeasTimestampRF'], axis = 1) - - def dbscan(self): - """ - Train hdbscan for the input dataframe - save the hdbscan model - """ - df = self.data.copy() - hdb = hdbscan.HDBSCAN(min_cluster_size=16000, min_samples = 5, prediction_data = True).fit(df) - joblib.dump(hdb, '/tmp/ad/hdbscan') - self.data['Category'] = hdb.labels_ + db.read_data('valid') + test = db.data + self.actual = test['Anomaly'] + X = test[self.cols] + sc = joblib.load('scale') + self.X = sc.transform(X) - def RandomForest(self, y): - """ - Transform categorical label into numeric(Save the LabelEncoder). - Create Train and Test split for Random Forest Classifier and Save the model + def isoforest(self, outliers_fraction=0.05, random_state=42, push_model=False): + """ Train isolation forest + + Parameters + ---------- + outliers_fraction: float between 0.01 to 0.5 (default=0.05) + percentage of anomalous available in input data + push_model: boolean (default=False) + return f_1 score if True else push model into repo + random_state: int (default=42) """ - df = self.data.copy() - le = LabelEncoder() - y = le.fit_transform(y) - joblib.dump(le, '/tmp/ad/LabelEncoder') - X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.20, stratify=y, random_state=42) - rf = RandomForestClassifier(max_depth=9, random_state=0) - rf.fit(X_train, y_train) - - joblib.dump(rf, '/tmp/ad/RF') - print('--------------------------- Training Score------------------------------------') - score(X_test, y_test, rf) - print('--------------------------- Test Score------------------------------------') - test = pd.read_csv('/tmp/ad/ue_test.csv') - test = test[UEKeyList] - y = category(test, [sigstr, PRB, tput]) - y =le.transform(y) - ps = preprocess(test) - ps.process() - test = ps.data.drop(['UEID', 'MeasTimestampRF'], axis = 1) - score(test, y, rf) + iso = IsolationForest(contamination=outliers_fraction, random_state=random_state) + md = iso.fit(self.data, None) + if push_model: + joblib.dump(self.cols, 'params') + joblib.dump(md, 'model') + return test(self, md) -def score(X, y, model): - y_pred = model.predict(X) - print('Accuracy : {}'.format(accuracy_score(y, y_pred))) - print('confusion matrix : {}'.format(confusion_matrix(y, y_pred))) - print('f1-score : {}'.format(f1_score(y, y_pred, average = 'macro'))) +def train(thread=False): + """ + Main function to perform training on input data + """ + if thread: + db = DUMMY() + else: + db = DATABASE('UEData') + db.read_data('train') + ps = PREPROCESS(db.data) + ps.process() + df = ps.data + mod = modelling(df) + mod.read_test(db) -def train(): - """ - Main function to perform training on input files - Read all the csv file in the current path and create trained model - """ - print('Training Starts : ') - path = '/tmp/ad/ue_data/' - df = 
-    # Read all the csv files and store the combined data into df
-    for file in os.listdir(path):
-        df = df.append(pd.read_csv(path + file))
-
-    df = df[UEKeyList]
-    df.index = range(len(df))
-    y = category(df, [sigstr, PRB, tput])
-    seg = {}
+    scores = []
+    for of in np.arange(0.01, 0.4, 0.01):
+        scores.append(mod.isoforest(outliers_fraction=of))
+    opt_f1 = scores.index(max(scores)) + 1  # 1-based index into the 0.01-step contamination grid
+    mod.isoforest(outliers_fraction=opt_f1*0.01, push_model=True)
+    print("Optimum value of contamination : {}".format(opt_f1*0.01))
+    print('Training Ends : ')

-    #Save the category of each UEID and save it as json file
-    for ue in df.UEID.unique():
-        seg[str(ue)] = list(set(y[df[df['UEID'] == ue].index]))
-
-    with open('ue_seg.json', 'w') as outfile:
-        json.dump(seg, outfile)
-    # Do a preprocessing, processing and save the model
-    ps = preprocess(df)
-    ps.process()
-    df = ps.data
-    db = modelling(df)
-#    db.dbscan()
-    db.RandomForest(y)
+def test(self, model):
+    pred = model.predict(self.X)
+    # IsolationForest returns -1 for anomalies and 1 for normal samples; map to 1/0
+    pred = [1 if p == -1 else 0 for p in pred]
+    return f1_score(self.actual, pred)
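The following is a minimal, self-contained sketch of the contamination sweep that the new train() performs. It substitutes synthetic data for the DATABASE/PREPROCESS pipeline, so the data, its dimensions, and the f1_for helper are illustrative assumptions rather than code from this repository:

# Sketch of the contamination grid search used by train() above.
# Synthetic data stands in for the influxdb-backed DATABASE/PREPROCESS
# pipeline; all names and thresholds here are illustrative assumptions.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score

rng = np.random.RandomState(42)
X_train = rng.normal(0, 1, size=(1000, 4))             # "normal" KPI samples
X_valid = np.vstack([rng.normal(0, 1, size=(190, 4)),  # normal validation samples
                     rng.normal(6, 1, size=(10, 4))])  # injected anomalies
y_valid = np.array([0] * 190 + [1] * 10)               # 1 = anomaly

def f1_for(contamination):
    iso = IsolationForest(contamination=contamination, random_state=42).fit(X_train)
    # IsolationForest predicts -1 for anomalies and 1 for normal samples
    pred = (iso.predict(X_valid) == -1).astype(int)
    return f1_score(y_valid, pred)

scores = [f1_for(of) for of in np.arange(0.01, 0.4, 0.01)]
best = (scores.index(max(scores)) + 1) * 0.01          # grid starts at 0.01, step 0.01
print("Optimum value of contamination : {:.2f}".format(best))

As in train(), the index of the best f1 score is mapped back to a contamination value, which would then be used for the final fit before the model and its feature list are persisted with push_model=True.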