X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;ds=sidebyside;f=manager%2Fcmd%2Fmanager.go;h=f00a812c534c063e08b9431daa07962370da567d;hb=refs%2Fchanges%2F11%2F4811%2F4;hp=d74c62d894361c9615abaab491551f6c552bce02;hpb=121e8b6026904eb245c6f8ac805efa7c3dad3ae5;p=ric-plt%2Falarm-go.git diff --git a/manager/cmd/manager.go b/manager/cmd/manager.go index d74c62d..f00a812 100755 --- a/manager/cmd/manager.go +++ b/manager/cmd/manager.go @@ -23,17 +23,17 @@ package main import ( "encoding/json" "fmt" - "time" - + "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" + app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" clientruntime "github.com/go-openapi/runtime/client" "github.com/go-openapi/strfmt" "github.com/prometheus/alertmanager/api/v2/client" "github.com/prometheus/alertmanager/api/v2/client/alert" "github.com/prometheus/alertmanager/api/v2/models" "github.com/spf13/viper" - - "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" - app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" + "io/ioutil" + "os" + "time" ) func (a *AlarmManager) StartAlertTimer() { @@ -42,7 +42,7 @@ func (a *AlarmManager) StartAlertTimer() { a.mutex.Lock() for _, m := range a.activeAlarms { app.Logger.Info("Re-raising alarm: %v", m) - a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive)) + a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime)) } a.mutex.Unlock() } @@ -75,37 +75,63 @@ func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, err } func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) { + a.mutex.Lock() if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok { app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem) + a.mutex.Unlock() return nil, nil } // Suppress duplicate alarms idx, found := a.IsMatchFound(m.Alarm) - if found && m.AlarmAction != alarm.AlarmActionClear { + if found && m.AlarmAction == alarm.AlarmActionRaise { app.Logger.Info("Duplicate alarm found, suppressing ...") - return nil, nil + if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity { + // Duplicate with same severity found + a.mutex.Unlock() + return nil, nil + } else { + // Remove duplicate with different severity + a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") + } } // Clear alarm if found from active alarm list if m.AlarmAction == alarm.AlarmActionClear { if found { + a.alarmHistory = append(a.alarmHistory, *m) a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active") - + if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)){ + app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold") + histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history") + histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} + a.activeAlarms = append(a.activeAlarms, histAlarmMessage) + a.alarmHistory = append(a.alarmHistory, histAlarmMessage) + } + if ((a.exceededActiveAlarmOn == true) && (m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD)) { + a.exceededActiveAlarmOn = false + } + if ((a.exceededAlarmHistoryOn == true) && (m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD)) { + a.exceededAlarmHistoryOn = false + } if a.postClear { - return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved)) + a.mutex.Unlock() + return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime)) } } app.Logger.Info("No matching active alarm found, suppressing ...") + a.mutex.Unlock() return nil, nil } // New alarm -> update active alarms and post to Alert Manager if m.AlarmAction == alarm.AlarmActionRaise { a.UpdateAlarmLists(m) - return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive)) + a.mutex.Unlock() + return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime)) } + a.mutex.Unlock() return nil, nil } @@ -120,25 +146,30 @@ func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) { } func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage { - a.mutex.Lock() - defer a.mutex.Unlock() - app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName) copy(alarms[i:], alarms[i+1:]) return alarms[:len(alarms)-1] } func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) { - a.mutex.Lock() - defer a.mutex.Unlock() - - // If maximum number of active alarms is reached, purge the oldest alarm - if len(a.activeAlarms) >= viper.GetInt("controls.maxActiveAlarms") { - a.activeAlarms = a.RemoveAlarm(a.activeAlarms, 0, "active") + /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised. + The attempt to raise the alarm next time will be supressed when found as duplicate. */ + if ((len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false)) { + app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold") + actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "active") + actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} + a.activeAlarms = append(a.activeAlarms, actAlarmMessage) + a.alarmHistory = append(a.alarmHistory, actAlarmMessage) + a.exceededActiveAlarmOn = true } - if len(a.alarmHistory) >= viper.GetInt("controls.maxAlarmHistory") { - a.alarmHistory = a.RemoveAlarm(a.alarmHistory, 0, "history") + if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)) { + app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold") + histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history") + histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} + a.activeAlarms = append(a.activeAlarms, histAlarmMessage) + a.alarmHistory = append(a.alarmHistory, histAlarmMessage) + a.exceededAlarmHistoryOn = true } // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence @@ -146,7 +177,7 @@ func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) { a.alarmHistory = append(a.alarmHistory, *newAlarm) } -func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus) (models.LabelSet, models.LabelSet) { +func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) { alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem] amLabels := models.LabelSet{ "status": string(status), @@ -161,6 +192,7 @@ func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertSta "additional_info": newAlarm.AdditionalInfo, "summary": alarmDef.EventType, "instructions": alarmDef.OperationInstructions, + "timestamp": fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")), } return amLabels, amAnnotations @@ -197,19 +229,75 @@ func (a *AlarmManager) StatusCB() bool { return a.rmrReady } +func (a *AlarmManager) ConfigChangeCB(configparam string) { + + a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms") + a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory") + a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval") + a.amHost = viper.GetString("controls.promAlertManager.address") + + app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms) + app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory) + app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval) + app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost) + + return +} + +func (a *AlarmManager) ReadAlarmDefinitionFromJson() { + + filename := os.Getenv("DEF_FILE") + file, err := ioutil.ReadFile(filename) + if err == nil { + data := RicAlarmDefinitions{} + err = json.Unmarshal([]byte(file), &data) + if err == nil { + for _, alarmDefinition := range data.AlarmDefinitions { + _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] + if exists { + app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId) + } else { + app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm %v", alarmDefinition.AlarmId) + ricAlarmDefintion := new(alarm.AlarmDefinition) + ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId + ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText + ricAlarmDefintion.EventType = alarmDefinition.EventType + ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions + alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion + } + } + } else { + app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err) + } + } else { + app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err) + } +} + func (a *AlarmManager) Run(sdlcheck bool) { app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash)) app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true) app.Resource.InjectStatusCb(a.StatusCB) + app.AddConfigChangeListener(a.ConfigChangeCB) + + alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition) + a.ReadAlarmDefinitionFromJson() app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST") app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE") app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET") app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST") + app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST") + app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE") + app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET") // Start background timer for re-raising alerts a.postClear = sdlcheck go a.StartAlertTimer() + a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER") app.RunWithParams(a, sdlcheck) } @@ -224,13 +312,17 @@ func NewAlarmManager(amHost string, alertInterval int) *AlarmManager { } return &AlarmManager{ - rmrReady: false, - amHost: amHost, - amBaseUrl: viper.GetString("controls.promAlertManager.baseUrl"), - amSchemes: []string{viper.GetString("controls.promAlertManager.schemes")}, - alertInterval: alertInterval, - activeAlarms: make([]alarm.AlarmMessage, 0), - alarmHistory: make([]alarm.AlarmMessage, 0), + rmrReady: false, + amHost: amHost, + amBaseUrl: viper.GetString("controls.promAlertManager.baseUrl"), + amSchemes: []string{viper.GetString("controls.promAlertManager.schemes")}, + alertInterval: alertInterval, + activeAlarms: make([]alarm.AlarmMessage, 0), + alarmHistory: make([]alarm.AlarmMessage, 0), + maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"), + maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"), + exceededActiveAlarmOn: false, + exceededAlarmHistoryOn: false, } }