1 # ==================================================================================
2 # Copyright (c) 2020 HCL Technologies Limited.
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
8 # http://www.apache.org/licenses/LICENSE-2.0
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15 # ==================================================================================
import joblib
import numpy as np

from mdclogpy import Logger
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV

from processing import PREPROCESS
# Module-level structured logger (mdclogpy), tagged with this module's name.
logger = Logger(name=__name__)
class ModelTraining(object):
    r"""Train an Isolation Forest anomaly-detection model.

    Reads training and validation data through the supplied database
    wrapper, preprocesses it, and fits an Isolation Forest whose anomaly
    predictions are scored with a macro F1 against labelled validation data.

    Attributes
    ----------
    db :
        Database access object; must expose ``read_data`` and a ``data``
        DataFrame (presumably an InfluxDB wrapper -- confirm against caller).
    train_data : DataFrame or array
        Training samples (transformed values of the input data).
    test_data : DataFrame or array
        Validation samples read in the last window.
    actual :
        Binary ground-truth labels for the validation data.
    """
    def __init__(self, db):
        # Will hold the training DataFrame once it has been fetched below.
        self.train_data = None
        # NOTE(review): `self.db` is read below but never assigned in the
        # visible span -- presumably `self.db = db` (and likely a separate
        # `def read_train(self):` boundary) was stripped here; confirm
        # against the full file.
        # Fetch training data; keep polling until at least 1000 complete
        # (NaN-free) rows are available.
        self.db.read_data(train=True)
        while self.db.data is None or len(self.db.data.dropna()) < 1000:
            logger.warning("Check if InfluxDB instance is up / Not sufficient data for Training")
            # NOTE(review): no sleep/backoff visible between polls -- this
            # looks like a busy retry loop; confirm a delay exists upstream.
            self.db.read_data(train=True)
        self.train_data = self.db.data
        logger.debug("Training on {} Samples".format(self.train_data.shape[0]))
        # NOTE(review): the enclosing `def` line (e.g. `read_test`) is not
        # visible in this span -- confirm against the full file.
        """ Read test dataset for model validation"""
        # Poll until at least 300 complete (NaN-free) validation rows exist.
        self.db.read_data(valid=True)
        while self.db.data is None or len(self.db.data.dropna()) < 300:
            logger.warning("Check if InfluxDB instance is up? or Not sufficient data for Validation in last 10 minutes")
            self.db.read_data(valid=True)
        # Keep only complete rows for validation.
        self.test_data = self.db.data.dropna()
        logger.debug("Validation on {} Samples".format(self.test_data.shape[0]))
72 def isoforest(self, outliers_fraction=0.05, random_state=4):
73 """ Train isolation forest
77 outliers_fraction: float between 0.01 to 0.5 (default=0.05)
78 percentage of anomalous available in input data
79 push_model: boolean (default=False)
80 return f_1 score if True else push model into repo
81 random_state: int (default=42)
83 parameter = {'contamination': [of for of in np.arange(0.01, 0.5, 0.02)],
84 'n_estimators': [100*(i+1) for i in range(1, 10)],
85 'max_samples': [0.005, 0.01, 0.1, 0.15, 0.2, 0.3, 0.4]}
86 cv = [(slice(None), slice(None))]
87 iso = IsolationForest(random_state=random_state, bootstrap=True, warm_start=False)
88 model = RandomizedSearchCV(iso, parameter, scoring=self.validate, cv=cv, n_iter=50)
89 md = model.fit(self.train_data.values)
90 f1 = self.validate(md.best_estimator_, self.test_data, True)
91 return f1, md.best_estimator_
93 def validate(self, model, test_data, report=False):
94 pred = model.predict(self.test_data.values)
96 pred = [1 if p == -1 else 0 for p in pred]
97 F1 = f1_score(self.actual, pred, average='macro')
99 logger.debug("classfication report : {} ".format(classification_report(self.actual, pred)))
100 logger.debug("F1 score:{}".format(F1))
        # NOTE(review): the enclosing `def` line (e.g. `train`) and the
        # docstring's quote delimiters are not visible in this span --
        # confirm against the full file.
        """Main function to perform training on input data"""
        logger.debug("Training Starts")
        # Preprocess the raw training data and replace it with the
        # transformed values. NOTE(review): whether PREPROCESS transforms in
        # its constructor or via a stripped method call is not visible here.
        ps = PREPROCESS(self.train_data)
        self.train_data = ps.data
        # Binary ground truth: 1 where the configured anomaly column is
        # positive, 0 otherwise.
        self.actual = (self.test_data[self.db.anomaly] > 0).astype(int)
        # Restrict the validation data to the numeric parameter list
        # persisted at preprocessing time, then transform it the same way.
        num = joblib.load('src/num_params')
        ps = PREPROCESS(self.test_data[num])
        self.test_data = ps.data
        logger.info("Training Isolation Forest")
        f1, model = self.isoforest()
        # NOTE(review): `scores` and `models` are not defined in the visible
        # span -- presumably candidate models/scores collected above were
        # stripped; confirm. Persist the best-scoring model.
        opt = scores.index(max(scores))
        joblib.dump(models[opt], 'src/model')
        logger.info("Optimum f-score : {}".format(scores[opt]))
        logger.info("Training Ends : ")