1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
import joblib
import numpy as np

from mdclogpy import Logger
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV

from processing import PREPROCESS
# Module-level structured logger (mdclogpy), tagged with this module's name.
logger = Logger(name=__name__)
class ModelTraining(object):
    r"""Train an Isolation Forest anomaly-detection model.

    Reads training and validation data through the supplied database
    wrapper, preprocesses it, and fits an Isolation Forest whose anomaly
    predictions are scored with a macro F1 against labelled validation data.

    Attributes
    ----------
    db :
        Database access object; must expose ``read_data`` and a ``data``
        DataFrame (presumably an InfluxDB wrapper -- confirm against caller).
    train_data : DataFrame or array
        Training samples (transformed values of the input data).
    test_data : DataFrame or array
        Validation samples read in the last window.
    actual :
        Binary ground-truth labels for the validation data.
    """
    def __init__(self, db):
        # Will hold the training DataFrame once it has been fetched below.
        self.train_data = None
        # NOTE(review): `self.db` is read below but never assigned in the
        # visible span -- presumably `self.db = db` (and likely a separate
        # `def read_train(self):` boundary) was stripped here; confirm
        # against the full file.
        # Fetch training data; keep polling until at least 1000 complete
        # (NaN-free) rows are available.
        self.db.read_data(train=True)
        while self.db.data is None or len(self.db.data.dropna()) < 1000:
            logger.warning("Check if InfluxDB instance is up / Not sufficient data for Training")
            # NOTE(review): no sleep/backoff visible between polls -- this
            # looks like a busy retry loop; confirm a delay exists upstream.
            self.db.read_data(train=True)
        self.train_data = self.db.data
        logger.debug("Training on {} Samples".format(self.train_data.shape[0]))
        # NOTE(review): the enclosing `def` line (e.g. `read_test`) is not
        # visible in this span -- confirm against the full file.
        """ Read test dataset for model validation"""
        # Poll until at least 300 complete (NaN-free) validation rows exist.
        self.db.read_data(valid=True)
        while self.db.data is None or len(self.db.data.dropna()) < 300:
            logger.warning("Check if InfluxDB instance is up? or Not sufficient data for Validation in last 10 minutes")
            self.db.read_data(valid=True)
        # Keep only complete rows for validation.
        self.test_data = self.db.data.dropna()
        logger.debug("Validation on {} Samples".format(self.test_data.shape[0]))
72 def isoforest(self, outliers_fraction=0.05, random_state=4):
73 """ Train isolation forest
77 outliers_fraction: float between 0.01 to 0.5 (default=0.05)
78 percentage of anomalous available in input data
79 push_model: boolean (default=False)
80 return f_1 score if True else push model into repo
81 random_state: int (default=42)
83 parameter = {'contamination': [of for of in np.arange(0.01, 0.5, 0.02)],
84 'n_estimators': [100*(i+1) for i in range(1, 10)],
85 'max_samples': [0.005, 0.01, 0.1, 0.15, 0.2, 0.3, 0.4]}
86 cv = [(slice(None), slice(None))]
87 iso = IsolationForest(random_state=random_state, bootstrap=True, warm_start=False)
88 model = RandomizedSearchCV(iso, parameter, scoring=self.validate, cv=cv, n_iter=50)
89 md = model.fit(self.train_data.values)
90 f1 = self.validate(md.best_estimator_, self.test_data, True)
91 return f1, md.best_estimator_
93 def validate(self, model, test_data, report=False):
94 pred = model.predict(self.test_data.values)
96 pred = [1 if p == -1 else 0 for p in pred]
97 F1 = f1_score(self.actual, pred, average='macro')
99 logger.debug("classfication report : {} ".format(classification_report(self.actual, pred)))
100 logger.debug("F1 score:{}".format(F1))
        # NOTE(review): the enclosing `def` line (e.g. `train`) and the
        # docstring's quote delimiters are not visible in this span --
        # confirm against the full file.
        """Main function to perform training on input data"""
        logger.debug("Training Starts")
        # Preprocess the raw training data and replace it with the
        # transformed values. NOTE(review): whether PREPROCESS transforms in
        # its constructor or via a stripped method call is not visible here.
        ps = PREPROCESS(self.train_data)
        self.train_data = ps.data
        # Binary ground truth: 1 where the configured anomaly column is
        # positive, 0 otherwise.
        self.actual = (self.test_data[self.db.anomaly] > 0).astype(int)
        # Restrict the validation data to the numeric parameter list
        # persisted at preprocessing time, then transform it the same way.
        num = joblib.load('src/num_params')
        ps = PREPROCESS(self.test_data[num])
        self.test_data = ps.data
        logger.info("Training Isolation Forest")
        f1, model = self.isoforest()
        # NOTE(review): `scores` and `models` are not defined in the visible
        # span -- presumably candidate models/scores collected above were
        # stripped; confirm. Persist the best-scoring model.
        opt = scores.index(max(scores))
        joblib.dump(models[opt], 'src/model')
        logger.info("Optimum f-score : {}".format(scores[opt]))
        logger.info("Training Ends : ")