diff --git a/manager/cmd/manager.go b/manager/cmd/manager.go
index 2b5af17..da689dc 100755
--- a/manager/cmd/manager.go
+++ b/manager/cmd/manager.go
@@ -24,6 +24,11 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"time"
+
 	"gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
 	app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
 	clientruntime "github.com/go-openapi/runtime/client"
@@ -32,19 +37,51 @@ import (
 	"github.com/prometheus/alertmanager/api/v2/client/alert"
 	"github.com/prometheus/alertmanager/api/v2/models"
 	"github.com/spf13/viper"
-	"io/ioutil"
-	"net/http"
-	"os"
-	"time"
 )
 
+func (a *AlarmManager) ClearExpiredAlarms(m AlarmNotification, idx int, mLocked bool) bool {
+	d, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]
+	if !ok || d.TimeToLive == 0 {
+		return false
+	}
+
+	elapsed := (time.Now().UnixNano() - m.AlarmTime) / 1e9
+	if int(elapsed) >= d.TimeToLive {
+		app.Logger.Info("Alarm (sp=%d id=%d) with TTL=%d expired, clearing ...", m.Alarm.SpecificProblem, m.AlarmId, d.TimeToLive)
+
+		m.AlarmAction = alarm.AlarmActionClear
+		m.AlarmTime = time.Now().UnixNano()
+
+		if !mLocked { // For testing purposes
+			a.mutex.Lock()
+		}
+		a.ProcessClearAlarm(&m, d, idx)
+		return true
+	}
+	return false
+}
+
+func (a *AlarmManager) StartTTLTimer(interval int) {
+	tick := time.Tick(time.Duration(interval) * time.Second)
+	for range tick {
+		a.mutex.Lock()
+		for idx, m := range a.activeAlarms {
+			if a.ClearExpiredAlarms(m, idx, true) {
+				a.mutex.Lock() // ClearExpiredAlarms unlocks the mutex, so re-lock here
+				continue
+			}
+		}
+		a.mutex.Unlock()
+	}
+}
+
 func (a *AlarmManager) StartAlertTimer() {
 	tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
 	for range tick {
 		a.mutex.Lock()
 		for _, m := range a.activeAlarms {
 			app.Logger.Info("Re-raising alarm: %v", m)
-			a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
+			a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime))
 		}
 		a.mutex.Unlock()
 	}
@@ -78,15 +115,16 @@ func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, err
 
 func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
 	a.mutex.Lock()
-
-	if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
+	alarmDef := &alarm.AlarmDefinition{}
+	var ok bool
+	if alarmDef, ok = alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
 		app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
 		a.mutex.Unlock()
 		return nil, nil
 	}
 
-	// Suppress duplicate alarms
 	idx, found := a.IsMatchFound(m.Alarm)
+	// Suppress duplicate alarms
 	if found && m.AlarmAction == alarm.AlarmActionRaise {
 		app.Logger.Info("Duplicate alarm found, suppressing ...")
 		if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
@@ -100,60 +138,101 @@ func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK,
 	}
 
 	// Clear alarm if found from active alarm list
-	if m.AlarmAction == alarm.AlarmActionClear {
-		if found {
-			a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
-			a.alarmHistory = append(a.alarmHistory, *m)
-			a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
-			if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
-				app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
-				a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
-			}
+	if found && m.AlarmAction == alarm.AlarmActionClear {
+		return a.ProcessClearAlarm(m, alarmDef, idx)
+	}
 
-			if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
-				a.exceededActiveAlarmOn = false
-			}
+	// New alarm -> update active alarms and post to Alert Manager
+	if m.AlarmAction == alarm.AlarmActionRaise {
+		return a.ProcessRaiseAlarm(m, alarmDef)
+	}
 
-			if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
-				a.exceededAlarmHistoryOn = false
-			}
+	a.mutex.Unlock()
+	return nil, nil
+}
 
-			a.WriteAlarmInfoToPersistentVolume()
+func (a *AlarmManager) ProcessRaiseAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition) (*alert.PostAlertsOK, error) {
+	app.Logger.Debug("Raise alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
 
-			if a.postClear {
-				a.mutex.Unlock()
+	// RaiseDelay > 0 in an alarm object in the active alarm table indicates that the raise delay is still ongoing for the alarm
+	m.AlarmDefinition.RaiseDelay = alarmDef.RaiseDelay
+	a.UpdateAlarmFields(a.GenerateAlarmId(), m)
+	a.UpdateActiveAlarmList(m)
+	a.mutex.Unlock()
 
-				// Send alarm notification to NOMA, if enabled
-				if app.Config.GetBool("controls.noma.enabled") {
-					m.PerceivedSeverity = alarm.SeverityCleared
-					return a.PostAlarm(m)
-				}
-				return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
-			}
+	if alarmDef.RaiseDelay > 0 {
+		timerDelay(alarmDef.RaiseDelay)
+		a.mutex.Lock()
+		// The alarm may have been deleted from the active alarm table during the delay, or the table index may have changed
+		idx, found := a.IsMatchFound(m.Alarm)
+		if found {
+			// The alarm is not shown in active alarms or alarm history via the CLI until RaiseDelay has elapsed, i.e. the value is 0
+			a.activeAlarms[idx].AlarmDefinition.RaiseDelay = 0
+			app.Logger.Debug("Raise after delay alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
+			a.mutex.Unlock()
+		} else {
+			app.Logger.Debug("Alarm deleted during raise delay. AlarmNotification = %v", *m)
+			a.mutex.Unlock()
+			return nil, nil
 		}
-		app.Logger.Info("No matching active alarm found, suppressing ...")
-		a.mutex.Unlock()
-		return nil, nil
 	}
 
-	// New alarm -> update active alarms and post to Alert Manager
-	if m.AlarmAction == alarm.AlarmActionRaise {
-		a.UpdateAlarmFields(a.GenerateAlarmId(), m)
-		a.UpdateAlarmLists(m)
-		a.WriteAlarmInfoToPersistentVolume()
-		a.mutex.Unlock()
+	m.AlarmDefinition.RaiseDelay = 0
+	a.UpdateAlarmHistoryList(m)
+	a.WriteAlarmInfoToPersistentVolume()
+
+	// Send alarm notification to NOMA, if enabled
+	if app.Config.GetBool("controls.noma.enabled") {
+		return a.PostAlarm(m)
+	}
+	return a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime))
+}
 
-		// Send alarm notification to NOMA, if enabled
-		if app.Config.GetBool("controls.noma.enabled") {
-			return a.PostAlarm(m)
+func (a *AlarmManager) ProcessClearAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition, idx int) (*alert.PostAlertsOK, error) {
+	app.Logger.Debug("Clear alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
+	if alarmDef.ClearDelay > 0 {
+		a.mutex.Unlock()
+		timerDelay(alarmDef.ClearDelay)
+		app.Logger.Debug("Clear after delay alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
+		a.mutex.Lock()
+		// Another alarm clear may have happened during the delay and the active alarm table index may have changed
+		var found bool
+		idx, found = a.IsMatchFound(m.Alarm)
+		if !found {
+			a.mutex.Unlock()
+			return nil, nil
 		}
-		return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
 	}
+	a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
+	a.alarmHistory = append(a.alarmHistory, *m)
+	a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
+	if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
+		app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
+		a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
+	}
+
+	if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
+		a.exceededActiveAlarmOn = false
+	}
+
+	if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
+		a.exceededAlarmHistoryOn = false
+	}
 
+	a.WriteAlarmInfoToPersistentVolume()
 	a.mutex.Unlock()
+	if a.postClear && app.Config.GetBool("controls.noma.enabled") {
+		m.PerceivedSeverity = alarm.SeverityCleared
+		return a.PostAlarm(m)
+	}
 	return nil, nil
 }
 
+func timerDelay(delay int) {
+	timer := time.NewTimer(time.Duration(delay) * time.Second)
+	<-timer.C
+}
+
 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
 	for i, m := range a.activeAlarms {
 		if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
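Note (illustrative sketch, not part of the patch): both delay paths above release the manager mutex before sleeping in timerDelay() and must re-resolve the table index afterwards, because other goroutines may mutate activeAlarms in the meantime. The shape of that unlock/sleep/relock/re-validate pattern, reduced to a self-contained toy example with hypothetical types:

	package main

	import (
		"fmt"
		"sync"
		"time"
	)

	type table struct {
		mu    sync.Mutex
		items []int
	}

	// clearAfterDelay is entered with t.mu held, mirroring ProcessClearAlarm.
	func (t *table) clearAfterDelay(v int, delay time.Duration) {
		t.mu.Unlock()     // cf. a.mutex.Unlock() before timerDelay(alarmDef.ClearDelay)
		time.Sleep(delay) // cf. timerDelay()
		t.mu.Lock()       // cf. a.mutex.Lock() after the delay
		// Re-validate: any index looked up before the delay may be stale.
		for i, x := range t.items { // cf. idx, found = a.IsMatchFound(m.Alarm)
			if x == v {
				t.items = append(t.items[:i], t.items[i+1:]...)
				break
			}
		}
		t.mu.Unlock()
	}

	func main() {
		t := &table{items: []int{1, 2, 3}}
		t.mu.Lock() // the caller holds the lock, as ProcessAlarm does
		t.clearAfterDelay(2, 10*time.Millisecond)
		fmt.Println(t.items) // prints [1 3]
	}
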
@@ -187,7 +266,7 @@ func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool {
 	thresholdMessage := alarm.AlarmMessage{
 		Alarm:       thresholdAlarm,
 		AlarmAction: alarm.AlarmActionRaise,
-		AlarmTime:   (time.Now().UnixNano()),
+		AlarmTime:   time.Now().UnixNano(),
 	}
 	alarmDef := alarm.RICAlarmDefinitions[sp]
 	alarmId := a.GenerateAlarmId()
@@ -198,21 +277,29 @@
 	return true
 }
 
-func (a *AlarmManager) UpdateAlarmLists(newAlarm *AlarmNotification) {
+func (a *AlarmManager) UpdateActiveAlarmList(newAlarm *AlarmNotification) {
 	/* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
-	The attempt to raise the alarm next time will be supressed when found as duplicate. */
+	The attempt to raise the alarm next time will be suppressed when found as duplicate. */
 	if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
 		app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold")
 		a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active")
 	}
+	// @todo: For now just keep the active alarms in-memory. Use SDL later for persistence
+	a.activeAlarms = append(a.activeAlarms, *newAlarm)
+}
+
+func (a *AlarmManager) UpdateAlarmHistoryList(newAlarm *AlarmNotification) {
+	/* If maximum number of events in alarm history is reached, an error log writing is made,
+	   and new alarm indicating the problem is raised. The attempt to add a new event next time
+	   will be suppressed */
+
 	if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
 		app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
 		a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
 	}
-	// @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
-	a.activeAlarms = append(a.activeAlarms, *newAlarm)
+	// @todo: For now just keep the alarm history in-memory. Use SDL later for persistence
 	a.alarmHistory = append(a.alarmHistory, *newAlarm)
 }
 
@@ -234,7 +321,7 @@ func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, err
 	return nil, err
 }
 
-func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
+func (a *AlarmManager) GenerateAlertLabels(alarmId int, newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
 	alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
 	amLabels := models.LabelSet{
 		"status":      string(status),
@@ -244,7 +331,7 @@ func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertSta
 		"system_name": "RIC",
 	}
 	amAnnotations := models.LabelSet{
-		"alarm_id":         fmt.Sprintf("%d", alarmDef.AlarmId),
+		"alarm_id":         fmt.Sprintf("%d", alarmId),
 		"specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem),
 		"event_type":       alarmDef.EventType,
 		"identifying_info": newAlarm.IdentifyingInfo,
@@ -284,14 +371,20 @@ func (a *AlarmManager) StatusCB() bool {
 	if !a.rmrReady {
 		app.Logger.Info("RMR not ready yet!")
 	}
-
 	return a.rmrReady
 }
 
 func (a *AlarmManager) ConfigChangeCB(configparam string) {
-	a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
+	if a.maxActiveAlarms == 0 {
+		a.maxActiveAlarms = 5000
+	}
+
 	a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
+	if a.maxAlarmHistory == 0 {
+		a.maxAlarmHistory = 20000
+	}
+
 	a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
 
 	a.amHost = viper.GetString("controls.promAlertManager.address")
@@ -322,6 +415,9 @@ func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
 			ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
 			ricAlarmDefintion.EventType = alarmDefinition.EventType
 			ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
+			ricAlarmDefintion.RaiseDelay = alarmDefinition.RaiseDelay
+			ricAlarmDefintion.ClearDelay = alarmDefinition.ClearDelay
+			ricAlarmDefintion.TimeToLive = alarmDefinition.TimeToLive
 			alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
 		}
 	}
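Note (illustrative sketch, not part of the patch): RaiseDelay, ClearDelay and TimeToLive are the three definition fields this change starts honoring, copied from JSON in the hunk above. A definition carrying them could be registered like this; the specific problem number and all field values here are made up, but the field set matches what ReadAlarmDefinitionFromJson assigns:

	package main

	import "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"

	// registerExampleDefinition registers a hypothetical alarm definition that
	// exercises the three new delay/TTL fields.
	func registerExampleDefinition() {
		if alarm.RICAlarmDefinitions == nil {
			alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
		}
		def := &alarm.AlarmDefinition{
			AlarmId:               8007, // hypothetical specific problem number
			AlarmText:             "EXAMPLE CONNECTIVITY LOST",
			EventType:             "communicationsAlarm",
			OperationInstructions: "Not defined",
			RaiseDelay:            1,  // seconds to wait before the raise takes effect
			ClearDelay:            1,  // seconds to wait before the clear takes effect
			TimeToLive:            60, // seconds before StartTTLTimer auto-clears the alarm
		}
		alarm.RICAlarmDefinitions[def.AlarmId] = def
	}

	func main() { registerExampleDefinition() }
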
@@ -357,8 +453,10 @@ func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() {
 	alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId
 	alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms))
 	alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory))
+
 	copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms)
 	copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory)
+
 	wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ")
 	if err != nil {
 		app.Logger.Error("alarmpersistentinfo json marshal error %v", err)
@@ -370,7 +468,7 @@ func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() {
 	}
 }
 
-func (a *AlarmManager) Run(sdlcheck bool) {
+func (a *AlarmManager) Run(sdlcheck bool, ttlInterval int) {
 	app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
 	app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
 	app.Resource.InjectStatusCb(a.StatusCB)
@@ -379,19 +477,12 @@
 	alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
 	a.ReadAlarmDefinitionFromJson()
 
-	app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
-	app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
-	app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
-	app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
-	app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
-	app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
-	app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
-	app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
-	app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
-	app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
+	a.InjectRoutes()
 
 	// Start background timer for re-raising alerts
 	go a.StartAlertTimer()
+	go a.StartTTLTimer(ttlInterval)
+
 	a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
 
 	a.ReadAlarmInfoFromPersistentVolume()
@@ -408,6 +499,16 @@ func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmMa
 		amHost = viper.GetString("controls.promAlertManager.address")
 	}
 
+	maxActiveAlarms := app.Config.GetInt("controls.maxActiveAlarms")
+	if maxActiveAlarms == 0 {
+		maxActiveAlarms = 5000
+	}
+
+	maxAlarmHistory := app.Config.GetInt("controls.maxAlarmHistory")
+	if maxAlarmHistory == 0 {
+		maxAlarmHistory = 20000
+	}
+
 	return &AlarmManager{
 		rmrReady:  false,
 		postClear: clearAlarm,
@@ -418,8 +519,8 @@
 		activeAlarms:           make([]AlarmNotification, 0),
 		alarmHistory:           make([]AlarmNotification, 0),
 		uniqueAlarmId:          0,
-		maxActiveAlarms:        app.Config.GetInt("controls.maxActiveAlarms"),
-		maxAlarmHistory:        app.Config.GetInt("controls.maxAlarmHistory"),
+		maxActiveAlarms:        maxActiveAlarms,
+		maxAlarmHistory:        maxAlarmHistory,
 		exceededActiveAlarmOn:  false,
 		exceededAlarmHistoryOn: false,
 		alarmInfoPvFile:        app.Config.GetString("controls.alarmInfoPvFile"),
@@ -428,5 +529,5 @@
 
 // Main function
 func main() {
-	NewAlarmManager("", 0, true).Run(true)
+	NewAlarmManager("", 0, true).Run(true, 10)
 }
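Note (illustrative sketch, not part of the patch): Run now takes the TTL sweep interval as a second argument, in seconds, which it forwards to StartTTLTimer; main() passes 10. Inside the manager package, a hypothetical test helper could shorten the sweep so TimeToLive expiry is observable quickly:

	// runManagerForTest is a made-up helper, not present in the patch.
	func runManagerForTest() {
		// Empty amHost and zero alertInterval fall back to config values,
		// as in main(). sdlcheck=false, TTL sweep every 1 second.
		mgr := NewAlarmManager("", 0, true)
		go mgr.Run(false, 1)
	}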