X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=manager%2Fcmd%2Fmanager.go;h=f7ad81e7e5cdaa8b683dbe11a58d2e903e25ea95;hb=18fd03c7dd490347eeec3ed60e176fb3a8ccc3c7;hp=fb30ba7c3cdd26a474869b69b9098f28834f2ce0;hpb=fe07bd1ea2f550225a209f1393fbaf14fe0a46a2;p=ric-plt%2Falarm-go.git diff --git a/manager/cmd/manager.go b/manager/cmd/manager.go index fb30ba7..f7ad81e 100755 --- a/manager/cmd/manager.go +++ b/manager/cmd/manager.go @@ -21,19 +21,22 @@ package main import ( + "bytes" "encoding/json" "fmt" + "io/ioutil" + "net/http" + "os" "time" + "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" + app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" clientruntime "github.com/go-openapi/runtime/client" "github.com/go-openapi/strfmt" "github.com/prometheus/alertmanager/api/v2/client" "github.com/prometheus/alertmanager/api/v2/client/alert" "github.com/prometheus/alertmanager/api/v2/models" "github.com/spf13/viper" - - "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" - app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" ) func (a *AlarmManager) StartAlertTimer() { @@ -71,21 +74,26 @@ func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, err } app.Logger.Info("newAlarm: %v", m) - return a.ProcessAlarm(&m) + return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}}) } -func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) { - if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok { +func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) { + a.mutex.Lock() + alarmDef := &alarm.AlarmDefinition{} + var ok bool + if alarmDef, ok = alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok { app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem) + a.mutex.Unlock() return nil, nil } - // Suppress duplicate alarms idx, found := a.IsMatchFound(m.Alarm) - if found && m.AlarmAction == alarm.AlarmActionRaise { + // Suppress duplicate alarms + if found && m.AlarmAction == alarm.AlarmActionRaise { app.Logger.Info("Duplicate alarm found, suppressing ...") if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity { // Duplicate with same severity found + a.mutex.Unlock() return nil, nil } else { // Remove duplicate with different severity @@ -93,30 +101,113 @@ func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, } } - // Clear alarm if found from active alarm list if m.AlarmAction == alarm.AlarmActionClear { if found { - a.alarmHistory = append(a.alarmHistory, *m) - a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") - + if a.ProcessClearAlarm(m, alarmDef, idx) == false { + return nil, nil + } if a.postClear { + a.mutex.Unlock() + + // Send alarm notification to NOMA, if enabled + if app.Config.GetBool("controls.noma.enabled") { + m.PerceivedSeverity = alarm.SeverityCleared + return a.PostAlarm(m) + } return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime)) } } app.Logger.Info("No matching active alarm found, suppressing ...") + a.mutex.Unlock() return nil, nil } // New alarm -> update active alarms and post to Alert Manager if m.AlarmAction == alarm.AlarmActionRaise { - a.UpdateAlarmLists(m) + if a.ProcessRaiseAlarm(m, alarmDef) == false { + return nil, nil + } + // Send alarm notification to NOMA, if enabled + if app.Config.GetBool("controls.noma.enabled") { + return a.PostAlarm(m) + } return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime)) } + a.mutex.Unlock() return nil, nil } +func (a *AlarmManager)ProcessRaiseAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition) bool { + app.Logger.Debug("Raise alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m) + // RaiseDelay > 0 in an alarm object in active alarm table indicates that raise delay is still ongoing for the alarm + m.AlarmDefinition.RaiseDelay = alarmDef.RaiseDelay + a.UpdateAlarmFields(a.GenerateAlarmId(), m) + a.UpdateActiveAlarmList(m) + a.mutex.Unlock() + if alarmDef.RaiseDelay > 0 { + timerDelay(alarmDef.RaiseDelay) + a.mutex.Lock() + // Alarm may have been deleted from active alarms table during delay or table index may have changed + idx, found := a.IsMatchFound(m.Alarm) + if found { + // Alarm is not showed in active alarms or alarm history via CLI before RaiseDelay has elapsed, i.e the value is 0 + a.activeAlarms[idx].AlarmDefinition.RaiseDelay = 0 + app.Logger.Debug("Raise after delay alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m) + a.mutex.Unlock() + } else { + app.Logger.Debug("Alarm deleted during raise delay. AlarmNotification = %v", *m) + a.mutex.Unlock() + return false + } + } + m.AlarmDefinition.RaiseDelay = 0 + a.UpdateAlarmHistoryList(m) + a.WriteAlarmInfoToPersistentVolume() + return true +} + +func (a *AlarmManager)ProcessClearAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition, idx int) bool { + app.Logger.Debug("Clear alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m) + if alarmDef.ClearDelay > 0 { + a.mutex.Unlock() + timerDelay(alarmDef.ClearDelay) + app.Logger.Debug("Clear after delay alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m) + a.mutex.Lock() + // Another alarm clear may have happened during delay and active alarms table index changed + var found bool + idx, found = a.IsMatchFound(m.Alarm) + if !found { + app.Logger.Debug("Alarm not anymore in the active alarms table. AlarmNotification = %v", *m) + a.mutex.Unlock() + return false + } + } + a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m) + a.alarmHistory = append(a.alarmHistory, *m) + a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") + if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) { + app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold") + a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history") + } + + if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD { + a.exceededActiveAlarmOn = false + } + + if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD { + a.exceededAlarmHistoryOn = false + } + a.WriteAlarmInfoToPersistentVolume() + return true +} + +func timerDelay(delay int) { + timer := time.NewTimer(time.Duration(delay) * time.Second) + <-timer.C +} + func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) { for i, m := range a.activeAlarms { if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId && @@ -127,58 +218,102 @@ func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) { return -1, false } -func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage { - a.mutex.Lock() - defer a.mutex.Unlock() - +func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification { app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName) copy(alarms[i:], alarms[i+1:]) return alarms[:len(alarms)-1] } -func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) { - a.mutex.Lock() - defer a.mutex.Unlock() +func (a *AlarmManager) GenerateAlarmId() int { + a.uniqueAlarmId++ // @todo: generate a unique ID + return a.uniqueAlarmId +} - /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised. - The attempt to raise the alarm next time will be supressed when found as duplicate. */ - if len(a.activeAlarms) >= a.maxActiveAlarms { - app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold") - actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full") - actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} - a.activeAlarms = append(a.activeAlarms, actAlarmMessage) - a.alarmHistory = append(a.alarmHistory, actAlarmMessage) +func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) { + alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem] + newAlarm.AlarmId = alarmId + newAlarm.AlarmText = alarmDef.AlarmText + newAlarm.EventType = alarmDef.EventType +} + +func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool { + thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data) + thresholdMessage := alarm.AlarmMessage{ + Alarm: thresholdAlarm, + AlarmAction: alarm.AlarmActionRaise, + AlarmTime: (time.Now().UnixNano()), } + alarmDef := alarm.RICAlarmDefinitions[sp] + alarmId := a.GenerateAlarmId() + alarmDef.AlarmId = alarmId + a.activeAlarms = append(a.activeAlarms, AlarmNotification{thresholdMessage, *alarmDef}) + a.alarmHistory = append(a.alarmHistory, AlarmNotification{thresholdMessage, *alarmDef}) - if len(a.alarmHistory) >= a.maxAlarmHistory { - app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold") - histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full") - histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} - a.activeAlarms = append(a.activeAlarms, histAlarmMessage) - a.alarmHistory = append(a.alarmHistory, histAlarmMessage) + return true +} + +func (a *AlarmManager) UpdateActiveAlarmList(newAlarm *AlarmNotification) { + /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised. + The attempt to raise the alarm next time will be suppressed when found as duplicate. */ + if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) { + app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold") + a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active") } - // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence + // @todo: For now just keep the active alarms in-memory. Use SDL later for persistence a.activeAlarms = append(a.activeAlarms, *newAlarm) +} + +func (a *AlarmManager) UpdateAlarmHistoryList(newAlarm *AlarmNotification) { + /* If maximum number of events in alarm history is reached, an error log writing is made, + and new alarm indicating the problem is raised. The attempt to add new event time will + be suppressed */ + + if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) { + app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold") + a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history") + } + + // @todo: For now just keep the alarms history in-memory. Use SDL later for persistence a.alarmHistory = append(a.alarmHistory, *newAlarm) } +func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) { + result, err := json.Marshal(m) + if err != nil { + app.Logger.Info("json.Marshal failed: %v", err) + return nil, err + } + + fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl")) + app.Logger.Info("Posting alarm to '%s'", fullUrl) + + resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result)) + if err != nil || resp == nil { + app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err) + } + + return nil, err +} + func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) { alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem] amLabels := models.LabelSet{ "status": string(status), "alertname": alarmDef.AlarmText, "severity": string(newAlarm.PerceivedSeverity), - "service": fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId), - "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId), + "service": fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId), + "system_name": "RIC", } amAnnotations := models.LabelSet{ - "alarm_id": fmt.Sprintf("%d", alarmDef.AlarmId), - "description": fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo), - "additional_info": newAlarm.AdditionalInfo, - "summary": alarmDef.EventType, - "instructions": alarmDef.OperationInstructions, - "timestamp": fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")), + "alarm_id": fmt.Sprintf("%d", alarmDef.AlarmId), + "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem), + "event_type": alarmDef.EventType, + "identifying_info": newAlarm.IdentifyingInfo, + "additional_info": newAlarm.AdditionalInfo, + "description": fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo), + "instructions": alarmDef.OperationInstructions, + "timestamp": fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")), } return amLabels, amAnnotations @@ -219,35 +354,116 @@ func (a *AlarmManager) ConfigChangeCB(configparam string) { a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms") a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory") + a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval") + a.amHost = viper.GetString("controls.promAlertManager.address") app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms) app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory) + app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval) + app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost) return } +func (a *AlarmManager) ReadAlarmDefinitionFromJson() { + + filename := os.Getenv("DEF_FILE") + file, err := ioutil.ReadFile(filename) + if err == nil { + data := RicAlarmDefinitions{} + err = json.Unmarshal([]byte(file), &data) + if err == nil { + for _, alarmDefinition := range data.AlarmDefinitions { + _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] + if exists { + app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId) + } else { + app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm %v", alarmDefinition.AlarmId) + ricAlarmDefintion := new(alarm.AlarmDefinition) + ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId + ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText + ricAlarmDefintion.EventType = alarmDefinition.EventType + ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions + ricAlarmDefintion.RaiseDelay = alarmDefinition.RaiseDelay + ricAlarmDefintion.ClearDelay = alarmDefinition.ClearDelay + alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion + } + } + } else { + app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err) + } + } else { + app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err) + } +} + +func (a *AlarmManager) ReadAlarmInfoFromPersistentVolume() { + var alarmpersistentinfo AlarmPersistentInfo + byteValue, rerr := ioutil.ReadFile(a.alarmInfoPvFile) + if rerr != nil { + app.Logger.Error("ararminfo.json file read error %v", rerr) + } else { + err := json.Unmarshal(byteValue, &alarmpersistentinfo) + if err != nil { + app.Logger.Error("alarmpersistentinfo json unmarshal error %v", err) + } else { + a.uniqueAlarmId = alarmpersistentinfo.UniqueAlarmId + a.activeAlarms = make([]AlarmNotification, len(alarmpersistentinfo.ActiveAlarms)) + a.alarmHistory = make([]AlarmNotification, len(alarmpersistentinfo.AlarmHistory)) + copy(a.activeAlarms, alarmpersistentinfo.ActiveAlarms) + copy(a.alarmHistory, alarmpersistentinfo.AlarmHistory) + } + } +} + +func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() { + var alarmpersistentinfo AlarmPersistentInfo + alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId + alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms)) + alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory)) + copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms) + copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory) + wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ") + if err != nil { + app.Logger.Error("alarmpersistentinfo json marshal error %v", err) + } else { + werr := ioutil.WriteFile(a.alarmInfoPvFile, wdata, 0777) + if werr != nil { + app.Logger.Error("alarminfo.json file write error %v", werr) + } + } +} + func (a *AlarmManager) Run(sdlcheck bool) { app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash)) app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true) app.Resource.InjectStatusCb(a.StatusCB) app.AddConfigChangeListener(a.ConfigChangeCB) + alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition) + a.ReadAlarmDefinitionFromJson() + app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST") app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE") app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET") app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET") app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST") app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST") + app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE") + app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET") // Start background timer for re-raising alerts - a.postClear = sdlcheck go a.StartAlertTimer() a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER") + a.ReadAlarmInfoFromPersistentVolume() + app.RunWithParams(a, sdlcheck) } -func NewAlarmManager(amHost string, alertInterval int) *AlarmManager { +func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager { if alertInterval == 0 { alertInterval = viper.GetInt("controls.promAlertManager.alertInterval") } @@ -257,19 +473,24 @@ func NewAlarmManager(amHost string, alertInterval int) *AlarmManager { } return &AlarmManager{ - rmrReady: false, - amHost: amHost, - amBaseUrl: viper.GetString("controls.promAlertManager.baseUrl"), - amSchemes: []string{viper.GetString("controls.promAlertManager.schemes")}, - alertInterval: alertInterval, - activeAlarms: make([]alarm.AlarmMessage, 0), - alarmHistory: make([]alarm.AlarmMessage, 0), - maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"), - maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"), + rmrReady: false, + postClear: clearAlarm, + amHost: amHost, + amBaseUrl: app.Config.GetString("controls.promAlertManager.baseUrl"), + amSchemes: []string{app.Config.GetString("controls.promAlertManager.schemes")}, + alertInterval: alertInterval, + activeAlarms: make([]AlarmNotification, 0), + alarmHistory: make([]AlarmNotification, 0), + uniqueAlarmId: 0, + maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"), + maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"), + exceededActiveAlarmOn: false, + exceededAlarmHistoryOn: false, + alarmInfoPvFile: app.Config.GetString("controls.alarmInfoPvFile"), } } // Main function func main() { - NewAlarmManager("", 0).Run(true) + NewAlarmManager("", 0, true).Run(true) }