ad/ad_train.py

   1 # ==================================================================================
   2 #  Copyright (c) 2020 HCL Technologies Limited.
   3 #
   4 #  Licensed under the Apache License, Version 2.0 (the "License");
   5 #  you may not use this file except in compliance with the License.
   6 #  You may obtain a copy of the License at
   7 #
   8 #     http://www.apache.org/licenses/LICENSE-2.0
   9 #
  10 #  Unless required by applicable law or agreed to in writing, software
  11 #  distributed under the License is distributed on an "AS IS" BASIS,
  12 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 #  See the License for the specific language governing permissions and
  14 #  limitations under the License.
  15 # ==================================================================================
  16
  17 import joblib
  18 from ad_model.processing import PREPROCESS
  19 from sklearn.metrics import f1_score
  20 from sklearn.ensemble import IsolationForest
  21 from database import DATABASE, DUMMY
  22 import numpy as np
  23
  24
  25 class modelling(object):
  26     r""" The modelling class takes input as dataframe or array and train Isolation Forest model
  27
  28     Paramteres
  29     .........
  30     data: DataFrame or array
  31         input dataset
  32     cols: list
  33         list of parameters in input dataset
  34
  35     Attributes
  36     ----------
  37     actual:array
  38         actual label for test data
  39     X: DataFrame or array
  40         transformed values of input data
  41     """
  42     def __init__(self, data):
  43         self.data = data
  44         self.cols = data.columns
  45
  46     def read_test(self, db):
  47         """ Read test dataset for model validation"""
  48
  49         db.read_data('valid')
  50         test = db.data
  51         self.actual = test['Anomaly']
  52         X = test[self.cols]
  53         sc = joblib.load('scale')
  54         self.X = sc.transform(X)
  55
  56     def isoforest(self, outliers_fraction=0.05, random_state=42, push_model=False):
  57         """ Train isolation forest
  58
  59         Parameters
  60         ----------
  61         outliers_fraction: float between 0.01 to 0.5 (default=0.05)
  62             percentage of anomalous available in input data
  63         push_model: boolean (default=False)
  64             return f_1 score if True else push model into repo
  65         random_state: int (default=42)
  66         """
  67         iso = IsolationForest(contamination=outliers_fraction, random_state=random_state)
  68         md = iso.fit(self.data, None)
  69         if push_model:
  70             joblib.dump(self.cols, 'params')
  71             joblib.dump(md, 'model')
  72         return test(self, md)
  73
  74
  75 def train(thread=False):
  76     """
  77      Main function to perform training on input data
  78     """
  79     if thread:
  80         db = DUMMY()
  81     else:
  82         db = DATABASE('UEData')
  83     db.read_data('train')
  84     ps = PREPROCESS(db.data)
  85     ps.process()
  86     df = ps.data
  87
  88     mod = modelling(df)
  89     mod.read_test(db)
  90
  91     scores = []
  92     for of in np.arange(0.01, 0.4, 0.01):
  93         scores.append(mod.isoforest(outliers_fraction=of))
  94     opt_f1 = scores.index(max(scores)) + 1
  95     mod.isoforest(outliers_fraction=opt_f1*0.01, push_model=True)
  96     print("Optimum value of contamination : {}".format(opt_f1*0.01))
  97     print('Training Ends : ')
  98
  99
 100 def test(self, model):
 101     pred = model.predict(self.X)
 102     if -1 in pred:
 103         pred = [1 if p == -1 else 0 for p in pred]
 104     return f1_score(self.actual, pred)