LN0739_FM_FR9: Add configured delay for raise/clear alarm handling
[ric-plt/alarm-go.git] / manager / cmd / manager.go
index f00a812..f7ad81e 100755 (executable)
 package main
 
 import (
+       "bytes"
        "encoding/json"
        "fmt"
+       "io/ioutil"
+       "net/http"
+       "os"
+       "time"
+
        "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
        app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
        clientruntime "github.com/go-openapi/runtime/client"
@@ -31,9 +37,6 @@ import (
        "github.com/prometheus/alertmanager/api/v2/client/alert"
        "github.com/prometheus/alertmanager/api/v2/models"
        "github.com/spf13/viper"
-       "io/ioutil"
-       "os"
-       "time"
 )
 
 func (a *AlarmManager) StartAlertTimer() {
@@ -71,19 +74,21 @@ func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, err
        }
        app.Logger.Info("newAlarm: %v", m)
 
-       return a.ProcessAlarm(&m)
+       return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}})
 }
 
-func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
+func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
        a.mutex.Lock()
-       if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
+       alarmDef := &alarm.AlarmDefinition{}
+       var ok bool
+       if alarmDef, ok = alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
                app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
                a.mutex.Unlock()
                return nil, nil
        }
 
-       // Suppress duplicate alarms
        idx, found := a.IsMatchFound(m.Alarm)
+       // Suppress duplicate alarms
        if found && m.AlarmAction == alarm.AlarmActionRaise {
                app.Logger.Info("Duplicate alarm found, suppressing ...")
                if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
@@ -99,23 +104,17 @@ func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK,
        // Clear alarm if found from active alarm list
        if m.AlarmAction == alarm.AlarmActionClear {
                if found {
-                       a.alarmHistory = append(a.alarmHistory, *m)
-                       a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
-                       if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)){
-                               app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
-                               histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
-                               histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
-                               a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
-                               a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
-                       }
-                       if ((a.exceededActiveAlarmOn == true) && (m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD)) {
-                               a.exceededActiveAlarmOn = false
-                       }
-                       if ((a.exceededAlarmHistoryOn == true) && (m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD)) {
-                               a.exceededAlarmHistoryOn = false
+                       if a.ProcessClearAlarm(m, alarmDef, idx) == false {
+                               return nil, nil
                        }
                        if a.postClear {
                                a.mutex.Unlock()
+
+                               // Send alarm notification to NOMA, if enabled
+                               if app.Config.GetBool("controls.noma.enabled") {
+                                       m.PerceivedSeverity = alarm.SeverityCleared
+                                       return a.PostAlarm(m)
+                               }
                                return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
                        }
                }
@@ -126,8 +125,13 @@ func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK,
 
        // New alarm -> update active alarms and post to Alert Manager
        if m.AlarmAction == alarm.AlarmActionRaise {
-               a.UpdateAlarmLists(m)
-               a.mutex.Unlock()
+               if a.ProcessRaiseAlarm(m, alarmDef) == false {
+                       return nil, nil
+               }
+               // Send alarm notification to NOMA, if enabled
+               if app.Config.GetBool("controls.noma.enabled") {
+                       return a.PostAlarm(m)
+               }
                return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
        }
 
@@ -135,6 +139,75 @@ func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK,
        return nil, nil
 }
 
+func (a *AlarmManager)ProcessRaiseAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition) bool {
+       app.Logger.Debug("Raise alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
+       // RaiseDelay > 0 in an alarm object in active alarm table indicates that raise delay is still ongoing for the alarm
+       m.AlarmDefinition.RaiseDelay = alarmDef.RaiseDelay
+       a.UpdateAlarmFields(a.GenerateAlarmId(), m)
+       a.UpdateActiveAlarmList(m)
+       a.mutex.Unlock()
+       if alarmDef.RaiseDelay > 0 {
+               timerDelay(alarmDef.RaiseDelay)
+               a.mutex.Lock()
+               // Alarm may have been deleted from active alarms table during delay or table index may have changed
+               idx, found := a.IsMatchFound(m.Alarm)
+               if found {
+                       // Alarm is not showed in active alarms or alarm history via CLI before RaiseDelay has elapsed, i.e the value is 0
+                       a.activeAlarms[idx].AlarmDefinition.RaiseDelay = 0
+                       app.Logger.Debug("Raise after delay alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
+                       a.mutex.Unlock()
+               } else {
+                       app.Logger.Debug("Alarm deleted during raise delay. AlarmNotification = %v", *m)
+                       a.mutex.Unlock()
+                       return false
+               }
+       }
+       m.AlarmDefinition.RaiseDelay = 0
+       a.UpdateAlarmHistoryList(m)
+       a.WriteAlarmInfoToPersistentVolume()    
+       return true
+}
+
+// ProcessClearAlarm moves a cleared alarm from the active alarm list to the
+// alarm history, after waiting for the configured clear delay (if any), and
+// writes the updated lists to the persistent alarm file.
+//
+// idx is the position of the alarm in a.activeAlarms as found by the caller;
+// it is looked up again after a delay because the list may have changed.
+//
+// The caller must hold a.mutex on entry. When the function returns true the
+// mutex is still held; when it returns false (the alarm was no longer active
+// after the delay) the mutex has already been released.
+func (a *AlarmManager) ProcessClearAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition, idx int) bool {
+       app.Logger.Debug("Clear alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
+       if alarmDef.ClearDelay > 0 {
+               // Do not hold the mutex while sleeping.
+               a.mutex.Unlock()
+               timerDelay(alarmDef.ClearDelay)
+               app.Logger.Debug("Clear after delay alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
+               a.mutex.Lock()
+               // Another alarm clear may have happened during delay and active alarms table index changed
+               var found bool
+               idx, found = a.IsMatchFound(m.Alarm)
+               if !found {
+                       app.Logger.Debug("Alarm not anymore in the active alarms table. AlarmNotification = %v", *m)
+                       a.mutex.Unlock()
+                       return false
+               }
+       }
+       a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
+       a.alarmHistory = append(a.alarmHistory, *m)
+       a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
+       if len(a.alarmHistory) >= a.maxAlarmHistory && !a.exceededAlarmHistoryOn {
+               app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
+               // Record that the threshold alarm is on, as UpdateAlarmHistoryList
+               // does; otherwise every subsequent clear would raise it again.
+               a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
+       }
+
+       // Clearing a threshold alarm itself re-arms the corresponding check.
+       if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
+               a.exceededActiveAlarmOn = false
+       }
+
+       if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
+               a.exceededAlarmHistoryOn = false
+       }
+       a.WriteAlarmInfoToPersistentVolume()
+       return true
+}
+
+func timerDelay(delay int) {
+       timer := time.NewTimer(time.Duration(delay) * time.Second)
+       <-timer.C
+}
+
 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
        for i, m := range a.activeAlarms {
                if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
@@ -145,54 +218,102 @@ func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
        return -1, false
 }
 
+// RemoveAlarm logs the entry at index i, shifts the entries after it one
+// position towards the front and returns the slice shortened by one element.
+// The shift is done in place on the slice passed in. i must be a valid index
+// into alarms. listName is used only in the log message.
-func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
+func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification {
        app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
        copy(alarms[i:], alarms[i+1:])
        return alarms[:len(alarms)-1]
 }
 
-func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
-       /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
-          The attempt to raise the alarm next time will be supressed when found as duplicate. */
-       if ((len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false)) {
-               app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
-               actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "active")
-               actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
-               a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
-               a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
-               a.exceededActiveAlarmOn = true
+func (a *AlarmManager) GenerateAlarmId() int {
+       a.uniqueAlarmId++ // @todo: generate a unique ID
+       return a.uniqueAlarmId
+}
+
+func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) {
+       alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
+       newAlarm.AlarmId = alarmId
+       newAlarm.AlarmText = alarmDef.AlarmText
+       newAlarm.EventType = alarmDef.EventType
+}
+
+// GenerateThresholdAlarm raises an internal warning alarm for specific
+// problem sp (with data as its additional information) and appends it to
+// both the active alarm list and the alarm history.
+//
+// It always returns true, which callers assign to the matching "exceeded"
+// flag. It performs no locking of its own.
+func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool {
+       thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data)
+       thresholdMessage := alarm.AlarmMessage{
+               Alarm:       thresholdAlarm,
+               AlarmAction: alarm.AlarmActionRaise,
+               AlarmTime:   time.Now().UnixNano(),
+       }
+
+       // Work on a copy of the registered definition: the map stores pointers,
+       // so writing the generated ID through the pointer would overwrite the
+       // AlarmId of the shared definition itself.
+       var alarmDef alarm.AlarmDefinition
+       if def, ok := alarm.RICAlarmDefinitions[sp]; ok && def != nil {
+               alarmDef = *def
+       } else {
+               app.Logger.Warn("Alarm definition (SP='%d') not found for threshold alarm", sp)
+       }
+       alarmDef.AlarmId = a.GenerateAlarmId()
+
+       notification := AlarmNotification{thresholdMessage, alarmDef}
+       a.activeAlarms = append(a.activeAlarms, notification)
+       a.alarmHistory = append(a.alarmHistory, notification)
+
+       return true
+}
 
-       if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)) {
-               app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
-               histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
-               histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
-               a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
-               a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
-               a.exceededAlarmHistoryOn = true
+// UpdateActiveAlarmList appends newAlarm to the active alarm list, first
+// raising the active-alarm threshold alarm if the list is already full.
+func (a *AlarmManager) UpdateActiveAlarmList(newAlarm *AlarmNotification) {
+       /* If the active alarm list already holds maxActiveAlarms or more entries and the threshold alarm
+          is not on, a warning is logged and the threshold alarm is raised. The flag is set from the
+          result of GenerateThresholdAlarm, so the threshold alarm is not generated again while the
+          flag stays set. The new alarm itself is appended below in either case. */
+       if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
+               app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold")
+               a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active")
        }
 
-       // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
+       // @todo: For now just keep the active alarms in-memory. Use SDL later for persistence
        a.activeAlarms = append(a.activeAlarms, *newAlarm)
+}
+
+// UpdateAlarmHistoryList appends newAlarm to the alarm history, first raising
+// the history threshold alarm if the history is already full.
+func (a *AlarmManager) UpdateAlarmHistoryList(newAlarm *AlarmNotification) {
+       /* If the alarm history already holds maxAlarmHistory or more entries and the threshold alarm
+          is not on, a warning is logged and the threshold alarm is raised. The flag is set from the
+          result of GenerateThresholdAlarm, so the threshold alarm is not generated again while the
+          flag stays set. The new event itself is appended below in either case. */
+
+       if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
+               app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
+               a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
+       }
+
+       // @todo: For now just keep the alarm history in-memory. Use SDL later for persistence
        a.alarmHistory = append(a.alarmHistory, *newAlarm)
 }
 
+func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
+       result, err := json.Marshal(m)
+       if err != nil {
+               app.Logger.Info("json.Marshal failed: %v", err)
+               return nil, err
+       }
+
+       fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl"))
+       app.Logger.Info("Posting alarm to '%s'", fullUrl)
+
+       resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result))
+       if err != nil || resp == nil {
+               app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err)
+       }
+
+       return nil, err
+}
+
 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
        alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
        amLabels := models.LabelSet{
                "status":      string(status),
                "alertname":   alarmDef.AlarmText,
                "severity":    string(newAlarm.PerceivedSeverity),
-               "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
-               "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
+               "service":     fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
+               "system_name": "RIC",
        }
        amAnnotations := models.LabelSet{
-               "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
-               "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
-               "additional_info": newAlarm.AdditionalInfo,
-               "summary":         alarmDef.EventType,
-               "instructions":    alarmDef.OperationInstructions,
-               "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
+               "alarm_id":         fmt.Sprintf("%d", alarmDef.AlarmId),
+               "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem),
+               "event_type":       alarmDef.EventType,
+               "identifying_info": newAlarm.IdentifyingInfo,
+               "additional_info":  newAlarm.AdditionalInfo,
+               "description":      fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
+               "instructions":     alarmDef.OperationInstructions,
+               "timestamp":        fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
        }
 
        return amLabels, amAnnotations
@@ -263,6 +384,8 @@ func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
                                        ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
                                        ricAlarmDefintion.EventType = alarmDefinition.EventType
                                        ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
+                                       ricAlarmDefintion.RaiseDelay = alarmDefinition.RaiseDelay
+                                       ricAlarmDefintion.ClearDelay = alarmDefinition.ClearDelay
                                        alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
                                }
                        }
@@ -274,6 +397,43 @@ func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
        }
 }
 
+// ReadAlarmInfoFromPersistentVolume restores the alarm ID counter, the
+// active alarm list and the alarm history from the JSON file named by
+// a.alarmInfoPvFile. If the file cannot be read or parsed, the error is
+// logged and the manager's current state is left untouched. The method does
+// not lock a.mutex itself.
+func (a *AlarmManager) ReadAlarmInfoFromPersistentVolume() {
+       byteValue, rerr := ioutil.ReadFile(a.alarmInfoPvFile)
+       if rerr != nil {
+               app.Logger.Error("alarminfo.json file read error %v", rerr)
+               return
+       }
+
+       var alarmpersistentinfo AlarmPersistentInfo
+       if err := json.Unmarshal(byteValue, &alarmpersistentinfo); err != nil {
+               app.Logger.Error("alarmpersistentinfo json unmarshal error %v", err)
+               return
+       }
+
+       a.uniqueAlarmId = alarmpersistentinfo.UniqueAlarmId
+       // Copy into freshly allocated slices so the lists are never nil, even
+       // when the file holds no entries.
+       a.activeAlarms = make([]AlarmNotification, len(alarmpersistentinfo.ActiveAlarms))
+       a.alarmHistory = make([]AlarmNotification, len(alarmpersistentinfo.AlarmHistory))
+       copy(a.activeAlarms, alarmpersistentinfo.ActiveAlarms)
+       copy(a.alarmHistory, alarmpersistentinfo.AlarmHistory)
+}
+
+// WriteAlarmInfoToPersistentVolume stores a snapshot of the alarm ID counter,
+// the active alarm list and the alarm history as indented JSON in the file
+// named by a.alarmInfoPvFile. Failures are logged and otherwise ignored. The
+// method does not lock a.mutex itself.
+func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() {
+       var alarmpersistentinfo AlarmPersistentInfo
+       alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId
+       alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms))
+       alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory))
+       copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms)
+       copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory)
+
+       wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ")
+       if err != nil {
+               app.Logger.Error("alarmpersistentinfo json marshal error %v", err)
+               return
+       }
+
+       // A plain data file: readable by all, writable only by the owner, and
+       // not executable. The mode applies only when the file is created.
+       if werr := ioutil.WriteFile(a.alarmInfoPvFile, wdata, 0644); werr != nil {
+               app.Logger.Error("alarminfo.json file write error %v", werr)
+       }
+}
+
 func (a *AlarmManager) Run(sdlcheck bool) {
        app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
        app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
@@ -295,14 +455,15 @@ func (a *AlarmManager) Run(sdlcheck bool) {
        app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
 
        // Start background timer for re-raising alerts
-       a.postClear = sdlcheck
        go a.StartAlertTimer()
        a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
 
+       a.ReadAlarmInfoFromPersistentVolume()
+
        app.RunWithParams(a, sdlcheck)
 }
 
-func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
+func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager {
        if alertInterval == 0 {
                alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
        }
@@ -312,21 +473,24 @@ func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
        }
 
        return &AlarmManager{
-               rmrReady:        false,
-               amHost:          amHost,
-               amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
-               amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
-               alertInterval:   alertInterval,
-               activeAlarms:    make([]alarm.AlarmMessage, 0),
-               alarmHistory:    make([]alarm.AlarmMessage, 0),
-               maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
-               maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
+               rmrReady:               false,
+               postClear:              clearAlarm,
+               amHost:                 amHost,
+               amBaseUrl:              app.Config.GetString("controls.promAlertManager.baseUrl"),
+               amSchemes:              []string{app.Config.GetString("controls.promAlertManager.schemes")},
+               alertInterval:          alertInterval,
+               activeAlarms:           make([]AlarmNotification, 0),
+               alarmHistory:           make([]AlarmNotification, 0),
+               uniqueAlarmId:          0,
+               maxActiveAlarms:        app.Config.GetInt("controls.maxActiveAlarms"),
+               maxAlarmHistory:        app.Config.GetInt("controls.maxAlarmHistory"),
                exceededActiveAlarmOn:  false,
                exceededAlarmHistoryOn: false,
+               alarmInfoPvFile:        app.Config.GetString("controls.alarmInfoPvFile"),
        }
 }
 
 // Main function
 func main() {
+       // A zero alert interval makes NewAlarmManager read the interval from
+       // configuration; the third argument is stored as the manager's
+       // postClear setting.
-       NewAlarmManager("", 0).Run(true)
+       NewAlarmManager("", 0, true).Run(true)
 }