/*
 *  Copyright (c) 2020 AT&T Intellectual Property.
 *  Copyright (c) 2020 Nokia.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 * This source code is part of the near-RT RIC (RAN Intelligent Controller)
 * platform project (RICP).
 */
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"time"

	"gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
	app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
	clientruntime "github.com/go-openapi/runtime/client"
	"github.com/go-openapi/strfmt"
	"github.com/prometheus/alertmanager/api/v2/client"
	"github.com/prometheus/alertmanager/api/v2/client/alert"
	"github.com/prometheus/alertmanager/api/v2/models"
	"github.com/spf13/viper"
)
42 func (a *AlarmManager) ClearExpiredAlarms(m AlarmNotification, idx int, mLocked bool) bool {
43 d, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]
44 if !ok || d.TimeToLive == 0 {
48 elapsed := (time.Now().UnixNano() - m.AlarmTime) / 1e9
49 if int(elapsed) >= d.TimeToLive {
50 app.Logger.Info("Alarm (sp=%d id=%d) with TTL=%d expired, clearing ...", m.Alarm.SpecificProblem, m.AlarmId, d.TimeToLive)
52 m.AlarmAction = alarm.AlarmActionClear
53 m.AlarmTime = time.Now().UnixNano()
55 if !mLocked { // For testing purpose
58 a.ProcessClearAlarm(&m, d, idx)
64 func (a *AlarmManager) StartTTLTimer(interval int) {
65 tick := time.Tick(time.Duration(interval) * time.Second)
68 for idx, m := range a.activeAlarms {
69 if a.ClearExpiredAlarms(m, idx, true) {
70 a.mutex.Lock() // ClearExpiredAlarms unlocks the mutex, so re-lock here
78 func (a *AlarmManager) StartAlertTimer() {
79 tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
82 for _, m := range a.activeAlarms {
83 app.Logger.Info("Re-raising alarm: %v", m)
84 a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime))
90 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
91 app.Logger.Info("Message received!")
93 defer app.Rmr.Free(rp.Mbuf)
95 case alarm.RIC_ALARM_UPDATE:
98 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
104 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
105 var m alarm.AlarmMessage
106 app.Logger.Info("Received JSON: %s", rp.Payload)
107 if err := json.Unmarshal(rp.Payload, &m); err != nil {
108 app.Logger.Error("json.Unmarshal failed: %v", err)
111 app.Logger.Info("newAlarm: %v", m)
113 return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}})
116 func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
118 alarmDef := &alarm.AlarmDefinition{}
120 if alarmDef, ok = alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
121 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
126 idx, found := a.IsMatchFound(m.Alarm)
127 // Suppress duplicate alarms
128 if found && m.AlarmAction == alarm.AlarmActionRaise {
129 app.Logger.Info("Duplicate alarm found, suppressing ...")
130 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
131 // Duplicate with same severity found
135 // Remove duplicate with different severity
136 a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
140 // Clear alarm if found from active alarm list
141 if found && m.AlarmAction == alarm.AlarmActionClear {
142 return a.ProcessClearAlarm(m, alarmDef, idx)
145 // New alarm -> update active alarms and post to Alert Manager
146 if m.AlarmAction == alarm.AlarmActionRaise {
147 return a.ProcessRaiseAlarm(m, alarmDef)
154 func (a *AlarmManager) ProcessRaiseAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition) (*alert.PostAlertsOK, error) {
155 app.Logger.Debug("Raise alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
157 // RaiseDelay > 0 in an alarm object in active alarm table indicates that raise delay is still ongoing for the alarm
158 m.AlarmDefinition.RaiseDelay = alarmDef.RaiseDelay
159 a.UpdateAlarmFields(a.GenerateAlarmId(), m)
160 a.UpdateActiveAlarmList(m)
163 if alarmDef.RaiseDelay > 0 {
164 timerDelay(alarmDef.RaiseDelay)
166 // Alarm may have been deleted from active alarms table during delay or table index may have changed
167 idx, found := a.IsMatchFound(m.Alarm)
169 // Alarm is not showed in active alarms or alarm history via CLI before RaiseDelay has elapsed, i.e the value is 0
170 a.activeAlarms[idx].AlarmDefinition.RaiseDelay = 0
171 app.Logger.Debug("Raise after delay alarmDef.RaiseDelay = %v, AlarmNotification = %v", alarmDef.RaiseDelay, *m)
174 app.Logger.Debug("Alarm deleted during raise delay. AlarmNotification = %v", *m)
180 m.AlarmDefinition.RaiseDelay = 0
181 a.UpdateAlarmHistoryList(m)
182 a.WriteAlarmInfoToPersistentVolume()
184 // Send alarm notification to NOMA, if enabled
185 if app.Config.GetBool("controls.noma.enabled") {
186 return a.PostAlarm(m)
188 return a.PostAlert(a.GenerateAlertLabels(m.AlarmId, m.Alarm, AlertStatusActive, m.AlarmTime))
191 func (a *AlarmManager) ProcessClearAlarm(m *AlarmNotification, alarmDef *alarm.AlarmDefinition, idx int) (*alert.PostAlertsOK, error) {
192 app.Logger.Debug("Clear alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
193 if alarmDef.ClearDelay > 0 {
195 timerDelay(alarmDef.ClearDelay)
196 app.Logger.Debug("Clear after delay alarmDef.ClearDelay = %v, AlarmNotification = %v", alarmDef.ClearDelay, *m)
198 // Another alarm clear may have happened during delay and active alarms table index changed
200 idx, found = a.IsMatchFound(m.Alarm)
206 a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
207 a.alarmHistory = append(a.alarmHistory, *m)
208 a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
209 if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
210 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
211 a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
214 if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
215 a.exceededActiveAlarmOn = false
218 if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
219 a.exceededAlarmHistoryOn = false
221 a.WriteAlarmInfoToPersistentVolume()
224 if a.postClear && app.Config.GetBool("controls.noma.enabled") {
225 m.PerceivedSeverity = alarm.SeverityCleared
226 return a.PostAlarm(m)
// timerDelay blocks the calling goroutine for delay seconds. A zero or
// negative delay returns immediately.
func timerDelay(delay int) {
	time.Sleep(time.Duration(delay) * time.Second)
}
236 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
237 for i, m := range a.activeAlarms {
238 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
239 m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
246 func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification {
247 app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
248 copy(alarms[i:], alarms[i+1:])
249 return alarms[:len(alarms)-1]
252 func (a *AlarmManager) GenerateAlarmId() int {
253 a.uniqueAlarmId++ // @todo: generate a unique ID
254 return a.uniqueAlarmId
257 func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) {
258 alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
259 newAlarm.AlarmId = alarmId
260 newAlarm.AlarmText = alarmDef.AlarmText
261 newAlarm.EventType = alarmDef.EventType
264 func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool {
265 thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data)
266 thresholdMessage := alarm.AlarmMessage{
267 Alarm: thresholdAlarm,
268 AlarmAction: alarm.AlarmActionRaise,
269 AlarmTime: time.Now().UnixNano(),
271 alarmDef := alarm.RICAlarmDefinitions[sp]
272 alarmId := a.GenerateAlarmId()
273 alarmDef.AlarmId = alarmId
274 a.activeAlarms = append(a.activeAlarms, AlarmNotification{thresholdMessage, *alarmDef})
275 a.alarmHistory = append(a.alarmHistory, AlarmNotification{thresholdMessage, *alarmDef})
280 func (a *AlarmManager) UpdateActiveAlarmList(newAlarm *AlarmNotification) {
281 /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
282 The attempt to raise the alarm next time will be suppressed when found as duplicate. */
283 if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
284 app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold")
285 a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active")
288 // @todo: For now just keep the active alarms in-memory. Use SDL later for persistence
289 a.activeAlarms = append(a.activeAlarms, *newAlarm)
292 func (a *AlarmManager) UpdateAlarmHistoryList(newAlarm *AlarmNotification) {
293 /* If maximum number of events in alarm history is reached, an error log writing is made,
294 and new alarm indicating the problem is raised. The attempt to add new event time will
297 if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
298 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
299 a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
302 // @todo: For now just keep the alarms history in-memory. Use SDL later for persistence
303 a.alarmHistory = append(a.alarmHistory, *newAlarm)
306 func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
307 result, err := json.Marshal(m)
309 app.Logger.Info("json.Marshal failed: %v", err)
313 fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl"))
314 app.Logger.Info("Posting alarm to '%s'", fullUrl)
316 resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result))
317 if err != nil || resp == nil {
318 app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err)
324 func (a *AlarmManager) GenerateAlertLabels(alarmId int, newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
325 alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
326 amLabels := models.LabelSet{
327 "status": string(status),
328 "alertname": alarmDef.AlarmText,
329 "severity": string(newAlarm.PerceivedSeverity),
330 "service": fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
331 "info": newAlarm.IdentifyingInfo,
332 "system_name": "RIC",
334 amAnnotations := models.LabelSet{
335 "alarm_id": fmt.Sprintf("%d", alarmId),
336 "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem),
337 "event_type": alarmDef.EventType,
338 "identifying_info": newAlarm.IdentifyingInfo,
339 "additional_info": newAlarm.AdditionalInfo,
340 "description": fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
341 "instructions": alarmDef.OperationInstructions,
342 "timestamp": fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
345 return amLabels, amAnnotations
348 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
349 cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
350 return client.New(cr, strfmt.Default)
353 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
354 pa := &models.PostableAlert{
356 GeneratorURL: strfmt.URI(""),
359 Annotations: amAnnotations,
361 alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
363 app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
364 ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
366 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
371 func (a *AlarmManager) StatusCB() bool {
373 app.Logger.Info("RMR not ready yet!")
378 func (a *AlarmManager) ConfigChangeCB(configparam string) {
379 a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
380 if a.maxActiveAlarms == 0 {
381 a.maxActiveAlarms = 5000
384 a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
385 if a.maxAlarmHistory == 0 {
386 a.maxAlarmHistory = 20000
389 a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
390 a.amHost = viper.GetString("controls.promAlertManager.address")
392 app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
393 app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
394 app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
395 app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
400 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
402 filename := os.Getenv("DEF_FILE")
403 file, err := ioutil.ReadFile(filename)
405 data := RicAlarmDefinitions{}
406 err = json.Unmarshal([]byte(file), &data)
408 for _, alarmDefinition := range data.AlarmDefinitions {
409 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
411 app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
413 app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm %v", alarmDefinition.AlarmId)
414 ricAlarmDefintion := new(alarm.AlarmDefinition)
415 ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
416 ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
417 ricAlarmDefintion.EventType = alarmDefinition.EventType
418 ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
419 ricAlarmDefintion.RaiseDelay = alarmDefinition.RaiseDelay
420 ricAlarmDefintion.ClearDelay = alarmDefinition.ClearDelay
421 ricAlarmDefintion.TimeToLive = alarmDefinition.TimeToLive
422 alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
426 app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err)
429 app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err)
433 func (a *AlarmManager) ReadAlarmInfoFromPersistentVolume() {
434 var alarmpersistentinfo AlarmPersistentInfo
435 byteValue, rerr := ioutil.ReadFile(a.alarmInfoPvFile)
437 app.Logger.Error("ararminfo.json file read error %v", rerr)
439 err := json.Unmarshal(byteValue, &alarmpersistentinfo)
441 app.Logger.Error("alarmpersistentinfo json unmarshal error %v", err)
443 a.uniqueAlarmId = alarmpersistentinfo.UniqueAlarmId
444 a.activeAlarms = make([]AlarmNotification, len(alarmpersistentinfo.ActiveAlarms))
445 a.alarmHistory = make([]AlarmNotification, len(alarmpersistentinfo.AlarmHistory))
446 copy(a.activeAlarms, alarmpersistentinfo.ActiveAlarms)
447 copy(a.alarmHistory, alarmpersistentinfo.AlarmHistory)
452 func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() {
453 var alarmpersistentinfo AlarmPersistentInfo
454 alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId
455 alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms))
456 alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory))
458 copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms)
459 copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory)
461 wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ")
463 app.Logger.Error("alarmpersistentinfo json marshal error %v", err)
465 werr := ioutil.WriteFile(a.alarmInfoPvFile, wdata, 0777)
467 app.Logger.Error("alarminfo.json file write error %v", werr)
472 func (a *AlarmManager) Run(sdlcheck bool, ttlInterval int) {
473 app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
474 app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
475 app.Resource.InjectStatusCb(a.StatusCB)
476 app.AddConfigChangeListener(a.ConfigChangeCB)
478 alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
479 a.ReadAlarmDefinitionFromJson()
483 // Start background timer for re-raising alerts
484 go a.StartAlertTimer()
485 go a.StartTTLTimer(ttlInterval)
487 a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
489 a.ReadAlarmInfoFromPersistentVolume()
491 app.RunWithParams(a, sdlcheck)
494 func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager {
495 if alertInterval == 0 {
496 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
500 amHost = viper.GetString("controls.promAlertManager.address")
503 maxActiveAlarms := app.Config.GetInt("controls.maxActiveAlarms")
504 if maxActiveAlarms == 0 {
505 maxActiveAlarms = 5000
508 maxAlarmHistory := app.Config.GetInt("controls.maxAlarmHistory")
509 if maxAlarmHistory == 0 {
510 maxAlarmHistory = 20000
513 return &AlarmManager{
515 postClear: clearAlarm,
517 amBaseUrl: app.Config.GetString("controls.promAlertManager.baseUrl"),
518 amSchemes: []string{app.Config.GetString("controls.promAlertManager.schemes")},
519 alertInterval: alertInterval,
520 activeAlarms: make([]AlarmNotification, 0),
521 alarmHistory: make([]AlarmNotification, 0),
523 maxActiveAlarms: maxActiveAlarms,
524 maxAlarmHistory: maxAlarmHistory,
525 exceededActiveAlarmOn: false,
526 exceededAlarmHistoryOn: false,
527 alarmInfoPvFile: app.Config.GetString("controls.alarmInfoPvFile"),
533 NewAlarmManager("", 0, true).Run(true, 10)