1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
18 from ad_model.processing import PREPROCESS
19 from sklearn.metrics import f1_score
20 from sklearn.ensemble import IsolationForest
21 from database import DATABASE, DUMMY
class modelling(object):
    r"""Train an Isolation Forest model and validate it on a labelled test set.

    Parameters
    ----------
    data: DataFrame
        input dataset used to fit the model; it must expose ``columns``

    Attributes
    ----------
    data: DataFrame
        training data handed to the constructor
    cols: list
        list of parameters (column labels) in input dataset
    actual: array
        actual label for test data (set by ``read_test``)
    X: DataFrame or array
        transformed values of the test data (set by ``read_test``)
    """

    def __init__(self, data):
        # isoforest() fits on self.data, so the training set has to be kept.
        self.data = data
        self.cols = data.columns

    def read_test(self, db):
        """Read test dataset for model validation.

        Parameters
        ----------
        db: database handle
            object whose ``data`` attribute holds the dataset that was read

        Sets ``self.actual`` to the 'Anomaly' column and ``self.X`` to the
        training columns transformed by the scaler stored in the file 'scale'.
        """
        # NOTE(review): the statements that load the validation split were
        # missing from the reviewed source; read_data('valid') is a
        # reconstruction -- confirm against the database module.
        db.read_data('valid')
        valid = db.data
        self.actual = valid['Anomaly']
        # Use the same columns the model is trained on.
        # NOTE(review): presumably the stored scaler was fitted on these
        # columns as well -- confirm against the preprocessing step.
        features = valid[self.cols]
        scaler = joblib.load('scale')
        self.X = scaler.transform(features)

    def isoforest(self, outliers_fraction=0.05, random_state=42, push_model=False):
        """Train isolation forest and score it on the validation data.

        ``read_test`` must have been called first, because scoring reads
        ``self.X`` and ``self.actual``.

        Parameters
        ----------
        outliers_fraction: float between 0.01 and 0.5 (default=0.05)
            fraction of anomalous samples expected in the input data
        random_state: int (default=42)
            seed forwarded to IsolationForest
        push_model: boolean (default=False)
            if True, store the column list in 'params' and the fitted model
            in 'model'; the f1 score is returned in either case

        Returns
        -------
        f1 score of the fitted model, as computed by ``test``
        """
        iso = IsolationForest(contamination=outliers_fraction, random_state=random_state)
        md = iso.fit(self.data, None)
        # Persist only on request: train() calls this once per candidate
        # contamination value and must not overwrite the model each time.
        if push_model:
            joblib.dump(self.cols, 'params')
            joblib.dump(md, 'model')
        return test(self, md)
def train(thread=False):
    """
    Main function to perform training on input data.

    Fits one isolation forest per candidate contamination value, keeps the
    value with the highest f1 score, then refits with that value and stores
    the resulting model.

    Parameters
    ----------
    thread: boolean (default=False)
        if True use the DUMMY data source, otherwise DATABASE('UEData')
    """
    # NOTE(review): the reviewed source for this function was incomplete.
    # The use of `thread` to choose DUMMY, and the calls db.read_data('train'),
    # ps.process() and ps.data, are reconstructions -- confirm them against
    # the database and ad_model.processing modules.
    db = DUMMY() if thread else DATABASE('UEData')
    db.read_data('train')
    ps = PREPROCESS(db.data)
    ps.process()

    mod = modelling(ps.data)
    mod.read_test(db)

    # One f1 score per candidate contamination value, in 0.01 steps.
    scores = [mod.isoforest(outliers_fraction=of)
              for of in np.arange(0.01, 0.4, 0.01)]
    # Position i in scores belongs to step i + 1, i.e. contamination (i + 1) * 0.01.
    best_step = scores.index(max(scores)) + 1
    best_fraction = best_step * 0.01
    mod.isoforest(outliers_fraction=best_fraction, push_model=True)
    print("Optimum value of contamination : {}".format(best_fraction))
    print('Training Ends : ')
def test(self, model):
    """Return the f1 score of ``model`` on the validation data held by ``self``.

    Parameters
    ----------
    self: object providing ``X`` (features passed to ``model.predict``)
        and ``actual`` (labels the predictions are scored against)
    model: fitted estimator exposing ``predict``

    Predictions equal to -1 are recoded as 1 and every other value as 0
    before scoring.
    """
    # NOTE(review): presumably self.actual marks anomalous rows with 1,
    # which is why -1 predictions are recoded to 1 -- confirm.
    recoded = [1 if label == -1 else 0 for label in model.predict(self.X)]
    return f1_score(self.actual, recoded)