enabled persistence storage for active alarms and alarm history
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "bytes"
25         "encoding/json"
26         "fmt"
27         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
28         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
29         clientruntime "github.com/go-openapi/runtime/client"
30         "github.com/go-openapi/strfmt"
31         "github.com/prometheus/alertmanager/api/v2/client"
32         "github.com/prometheus/alertmanager/api/v2/client/alert"
33         "github.com/prometheus/alertmanager/api/v2/models"
34         "github.com/spf13/viper"
35         "io/ioutil"
36         "net/http"
37         "os"
38         "time"
39 )
40
41 func (a *AlarmManager) StartAlertTimer() {
42         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
43         for range tick {
44                 a.mutex.Lock()
45                 for _, m := range a.activeAlarms {
46                         app.Logger.Info("Re-raising alarm: %v", m)
47                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
48                 }
49                 a.mutex.Unlock()
50         }
51 }
52
53 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
54         app.Logger.Info("Message received!")
55
56         defer app.Rmr.Free(rp.Mbuf)
57         switch rp.Mtype {
58         case alarm.RIC_ALARM_UPDATE:
59                 a.HandleAlarms(rp)
60         default:
61                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
62         }
63
64         return nil
65 }
66
67 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
68         var m alarm.AlarmMessage
69         app.Logger.Info("Received JSON: %s", rp.Payload)
70         if err := json.Unmarshal(rp.Payload, &m); err != nil {
71                 app.Logger.Error("json.Unmarshal failed: %v", err)
72                 return nil, err
73         }
74         app.Logger.Info("newAlarm: %v", m)
75
76         return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}})
77 }
78
79 func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
80         a.mutex.Lock()
81
82         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
83                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
84                 a.mutex.Unlock()
85                 return nil, nil
86         }
87
88         // Suppress duplicate alarms
89         idx, found := a.IsMatchFound(m.Alarm)
90         if found && m.AlarmAction == alarm.AlarmActionRaise {
91                 app.Logger.Info("Duplicate alarm found, suppressing ...")
92                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
93                         // Duplicate with same severity found
94                         a.mutex.Unlock()
95                         return nil, nil
96                 } else {
97                         // Remove duplicate with different severity
98                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
99                 }
100         }
101
102         // Clear alarm if found from active alarm list
103         if m.AlarmAction == alarm.AlarmActionClear {
104                 if found {
105                         a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
106                         a.alarmHistory = append(a.alarmHistory, *m)
107                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
108                         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
109                                 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
110                                 a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
111                         }
112
113                         if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
114                                 a.exceededActiveAlarmOn = false
115                         }
116
117                         if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
118                                 a.exceededAlarmHistoryOn = false
119                         }
120
121                         a.WriteAlarmInfoToPersistentVolume()
122
123                         if a.postClear {
124                                 a.mutex.Unlock()
125
126                                 // Send alarm notification to NOMA, if enabled
127                                 if app.Config.GetBool("controls.noma.enabled") {
128                                         m.PerceivedSeverity = alarm.SeverityCleared
129                                         return a.PostAlarm(m)
130                                 }
131                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
132                         }
133                 }
134                 app.Logger.Info("No matching active alarm found, suppressing ...")
135                 a.mutex.Unlock()
136                 return nil, nil
137         }
138
139         // New alarm -> update active alarms and post to Alert Manager
140         if m.AlarmAction == alarm.AlarmActionRaise {
141                 a.UpdateAlarmFields(a.GenerateAlarmId(), m)
142                 a.UpdateAlarmLists(m)
143                 a.WriteAlarmInfoToPersistentVolume()
144                 a.mutex.Unlock()
145
146                 // Send alarm notification to NOMA, if enabled
147                 if app.Config.GetBool("controls.noma.enabled") {
148                         return a.PostAlarm(m)
149                 }
150                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
151         }
152
153         a.mutex.Unlock()
154         return nil, nil
155 }
156
157 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
158         for i, m := range a.activeAlarms {
159                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
160                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
161                         return i, true
162                 }
163         }
164         return -1, false
165 }
166
167 func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification {
168         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
169         copy(alarms[i:], alarms[i+1:])
170         return alarms[:len(alarms)-1]
171 }
172
173 func (a *AlarmManager) GenerateAlarmId() int {
174         a.uniqueAlarmId++ // @todo: generate a unique ID
175         return a.uniqueAlarmId
176 }
177
178 func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) {
179         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
180         newAlarm.AlarmId = alarmId
181         newAlarm.AlarmText = alarmDef.AlarmText
182         newAlarm.EventType = alarmDef.EventType
183 }
184
185 func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool {
186         thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data)
187         thresholdMessage := alarm.AlarmMessage{
188                 Alarm:       thresholdAlarm,
189                 AlarmAction: alarm.AlarmActionRaise,
190                 AlarmTime:   (time.Now().UnixNano()),
191         }
192         alarmDef := alarm.RICAlarmDefinitions[sp]
193         alarmId := a.GenerateAlarmId()
194         alarmDef.AlarmId = alarmId
195         a.activeAlarms = append(a.activeAlarms, AlarmNotification{thresholdMessage, *alarmDef})
196         a.alarmHistory = append(a.alarmHistory, AlarmNotification{thresholdMessage, *alarmDef})
197
198         return true
199 }
200
201 func (a *AlarmManager) UpdateAlarmLists(newAlarm *AlarmNotification) {
202         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
203            The attempt to raise the alarm next time will be supressed when found as duplicate. */
204         if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
205                 app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold")
206                 a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active")
207         }
208
209         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
210                 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
211                 a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
212         }
213
214         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
215         a.activeAlarms = append(a.activeAlarms, *newAlarm)
216         a.alarmHistory = append(a.alarmHistory, *newAlarm)
217 }
218
219 func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
220         result, err := json.Marshal(m)
221         if err != nil {
222                 app.Logger.Info("json.Marshal failed: %v", err)
223                 return nil, err
224         }
225
226         fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl"))
227         app.Logger.Info("Posting alarm to '%s'", fullUrl)
228
229         resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result))
230         if err != nil || resp == nil {
231                 app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err)
232         }
233
234         return nil, err
235 }
236
237 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
238         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
239         amLabels := models.LabelSet{
240                 "status":      string(status),
241                 "alertname":   alarmDef.AlarmText,
242                 "severity":    string(newAlarm.PerceivedSeverity),
243                 "service":     fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
244                 "system_name": "RIC",
245         }
246         amAnnotations := models.LabelSet{
247                 "alarm_id":         fmt.Sprintf("%d", alarmDef.AlarmId),
248                 "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem),
249                 "event_type":       alarmDef.EventType,
250                 "identifying_info": newAlarm.IdentifyingInfo,
251                 "additional_info":  newAlarm.AdditionalInfo,
252                 "description":      fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
253                 "instructions":     alarmDef.OperationInstructions,
254                 "timestamp":        fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
255         }
256
257         return amLabels, amAnnotations
258 }
259
260 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
261         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
262         return client.New(cr, strfmt.Default)
263 }
264
265 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
266         pa := &models.PostableAlert{
267                 Alert: models.Alert{
268                         GeneratorURL: strfmt.URI(""),
269                         Labels:       amLabels,
270                 },
271                 Annotations: amAnnotations,
272         }
273         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
274
275         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
276         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
277         if err != nil {
278                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
279         }
280         return ok, err
281 }
282
283 func (a *AlarmManager) StatusCB() bool {
284         if !a.rmrReady {
285                 app.Logger.Info("RMR not ready yet!")
286         }
287
288         return a.rmrReady
289 }
290
291 func (a *AlarmManager) ConfigChangeCB(configparam string) {
292
293         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
294         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
295         a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
296         a.amHost = viper.GetString("controls.promAlertManager.address")
297
298         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
299         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
300         app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
301         app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
302
303         return
304 }
305
306 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
307
308         filename := os.Getenv("DEF_FILE")
309         file, err := ioutil.ReadFile(filename)
310         if err == nil {
311                 data := RicAlarmDefinitions{}
312                 err = json.Unmarshal([]byte(file), &data)
313                 if err == nil {
314                         for _, alarmDefinition := range data.AlarmDefinitions {
315                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
316                                 if exists {
317                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
318                                 } else {
319                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
320                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
321                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
322                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
323                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
324                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
325                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
326                                 }
327                         }
328                 } else {
329                         app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err)
330                 }
331         } else {
332                 app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err)
333         }
334 }
335
336 func (a *AlarmManager) ReadAlarmInfoFromPersistentVolume() {
337         var alarmpersistentinfo AlarmPersistentInfo
338         byteValue, rerr := ioutil.ReadFile(a.alarmInfoPvFile)
339         if rerr != nil {
340                 app.Logger.Error("ararminfo.json file read error %v", rerr)
341         } else {
342                 err := json.Unmarshal(byteValue, &alarmpersistentinfo)
343                 if err != nil {
344                         app.Logger.Error("alarmpersistentinfo json unmarshal error %v", err)
345                 } else {
346                         a.uniqueAlarmId = alarmpersistentinfo.UniqueAlarmId
347                         a.activeAlarms = make([]AlarmNotification, len(alarmpersistentinfo.ActiveAlarms))
348                         a.alarmHistory = make([]AlarmNotification, len(alarmpersistentinfo.AlarmHistory))
349                         copy(a.activeAlarms, alarmpersistentinfo.ActiveAlarms)
350                         copy(a.alarmHistory, alarmpersistentinfo.AlarmHistory)
351                 }
352         }
353 }
354
355 func (a *AlarmManager) WriteAlarmInfoToPersistentVolume() {
356         var alarmpersistentinfo AlarmPersistentInfo
357         alarmpersistentinfo.UniqueAlarmId = a.uniqueAlarmId
358         alarmpersistentinfo.ActiveAlarms = make([]AlarmNotification, len(a.activeAlarms))
359         alarmpersistentinfo.AlarmHistory = make([]AlarmNotification, len(a.alarmHistory))
360         copy(alarmpersistentinfo.ActiveAlarms, a.activeAlarms)
361         copy(alarmpersistentinfo.AlarmHistory, a.alarmHistory)
362         wdata, err := json.MarshalIndent(alarmpersistentinfo, "", " ")
363         if err != nil {
364                 app.Logger.Error("alarmpersistentinfo json marshal error %v", err)
365         } else {
366                 werr := ioutil.WriteFile(a.alarmInfoPvFile, wdata, 0777)
367                 if werr != nil {
368                         app.Logger.Error("alarminfo.json file write error %v", werr)
369                 }
370         }
371 }
372
373 func (a *AlarmManager) Run(sdlcheck bool) {
374         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
375         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
376         app.Resource.InjectStatusCb(a.StatusCB)
377         app.AddConfigChangeListener(a.ConfigChangeCB)
378
379         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
380         a.ReadAlarmDefinitionFromJson()
381
382         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
383         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
384         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
385         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
386         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
387         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
388         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
389         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
390         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
391         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
392
393         // Start background timer for re-raising alerts
394         go a.StartAlertTimer()
395         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
396
397         a.ReadAlarmInfoFromPersistentVolume()
398
399         app.RunWithParams(a, sdlcheck)
400 }
401
402 func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager {
403         if alertInterval == 0 {
404                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
405         }
406
407         if amHost == "" {
408                 amHost = viper.GetString("controls.promAlertManager.address")
409         }
410
411         return &AlarmManager{
412                 rmrReady:               false,
413                 postClear:              clearAlarm,
414                 amHost:                 amHost,
415                 amBaseUrl:              app.Config.GetString("controls.promAlertManager.baseUrl"),
416                 amSchemes:              []string{app.Config.GetString("controls.promAlertManager.schemes")},
417                 alertInterval:          alertInterval,
418                 activeAlarms:           make([]AlarmNotification, 0),
419                 alarmHistory:           make([]AlarmNotification, 0),
420                 uniqueAlarmId:          0,
421                 maxActiveAlarms:        app.Config.GetInt("controls.maxActiveAlarms"),
422                 maxAlarmHistory:        app.Config.GetInt("controls.maxAlarmHistory"),
423                 exceededActiveAlarmOn:  false,
424                 exceededAlarmHistoryOn: false,
425                 alarmInfoPvFile:        app.Config.GetString("controls.alarmInfoPvFile"),
426         }
427 }
428
429 // Main function
430 func main() {
431         NewAlarmManager("", 0, true).Run(true)
432 }