Additional alarm information
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
27         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
28         clientruntime "github.com/go-openapi/runtime/client"
29         "github.com/go-openapi/strfmt"
30         "github.com/prometheus/alertmanager/api/v2/client"
31         "github.com/prometheus/alertmanager/api/v2/client/alert"
32         "github.com/prometheus/alertmanager/api/v2/models"
33         "github.com/spf13/viper"
34         "io/ioutil"
35         "os"
36         "time"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&AlarmInformation{m, alarm.AlarmDefinition{}})
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *AlarmInformation) (*alert.PostAlertsOK, error) {
78         a.mutex.Lock()
79         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
80                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
81                 a.mutex.Unlock()
82                 return nil, nil
83         }
84
85         // Suppress duplicate alarms
86         idx, found := a.IsMatchFound(m.Alarm)
87         if found && m.AlarmAction == alarm.AlarmActionRaise {
88                 app.Logger.Info("Duplicate alarm found, suppressing ...")
89                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
90                         // Duplicate with same severity found
91                         a.mutex.Unlock()
92                         return nil, nil
93                 } else {
94                         // Remove duplicate with different severity
95                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
96                 }
97         }
98
99         // Clear alarm if found from active alarm list
100         if m.AlarmAction == alarm.AlarmActionClear {
101                 if found {
102                         a.alarmHistory = append(a.alarmHistory, *m)
103                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
104                         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
105                                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
106                                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
107                                 am := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
108                                 histAlarmMessage := AlarmInformation{am, alarm.AlarmDefinition{}}
109                                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
110                                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
111                         }
112                         if (a.exceededActiveAlarmOn == true) && (m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD) {
113                                 a.exceededActiveAlarmOn = false
114                         }
115                         if (a.exceededAlarmHistoryOn == true) && (m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD) {
116                                 a.exceededAlarmHistoryOn = false
117                         }
118                         if a.postClear {
119                                 a.mutex.Unlock()
120                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
121                         }
122                 }
123                 app.Logger.Info("No matching active alarm found, suppressing ...")
124                 a.mutex.Unlock()
125                 return nil, nil
126         }
127
128         // New alarm -> update active alarms and post to Alert Manager
129         if m.AlarmAction == alarm.AlarmActionRaise {
130                 a.UpdateAlarmLists(m)
131                 a.mutex.Unlock()
132                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
133         }
134
135         a.mutex.Unlock()
136         return nil, nil
137 }
138
139 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
140         for i, m := range a.activeAlarms {
141                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
142                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
143                         return i, true
144                 }
145         }
146         return -1, false
147 }
148
149 func (a *AlarmManager) RemoveAlarm(alarms []AlarmInformation, i int, listName string) []AlarmInformation {
150         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
151         copy(alarms[i:], alarms[i+1:])
152         return alarms[:len(alarms)-1]
153 }
154
155 func (a *AlarmManager) UpdateAlarmFields(newAlarm *AlarmInformation) {
156         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
157         newAlarm.AlarmId = a.uniqueAlarmId
158         a.uniqueAlarmId++ // @todo: generate a unique ID
159         newAlarm.AlarmText = alarmDef.AlarmText
160         newAlarm.EventType = alarmDef.EventType
161 }
162
163 func (a *AlarmManager) UpdateAlarmLists(newAlarm *AlarmInformation) {
164         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
165            The attempt to raise the alarm next time will be supressed when found as duplicate. */
166         if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
167                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
168                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "active")
169                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
170                 a.activeAlarms = append(a.activeAlarms, AlarmInformation{actAlarmMessage, alarm.AlarmDefinition{}})
171                 a.alarmHistory = append(a.alarmHistory, AlarmInformation{actAlarmMessage, alarm.AlarmDefinition{}})
172                 a.exceededActiveAlarmOn = true
173         }
174
175         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
176                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
177                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
178                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
179                 a.activeAlarms = append(a.activeAlarms, AlarmInformation{histAlarmMessage, alarm.AlarmDefinition{}})
180                 a.alarmHistory = append(a.alarmHistory, AlarmInformation{histAlarmMessage, alarm.AlarmDefinition{}})
181                 a.exceededAlarmHistoryOn = true
182         }
183
184         a.UpdateAlarmFields(newAlarm)
185
186         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
187         a.activeAlarms = append(a.activeAlarms, *newAlarm)
188         a.alarmHistory = append(a.alarmHistory, *newAlarm)
189 }
190
191 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
192         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
193         amLabels := models.LabelSet{
194                 "status":      string(status),
195                 "alertname":   alarmDef.AlarmText,
196                 "severity":    string(newAlarm.PerceivedSeverity),
197                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
198                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
199         }
200         amAnnotations := models.LabelSet{
201                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
202                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
203                 "additional_info": newAlarm.AdditionalInfo,
204                 "summary":         alarmDef.EventType,
205                 "instructions":    alarmDef.OperationInstructions,
206                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
207         }
208
209         return amLabels, amAnnotations
210 }
211
212 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
213         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
214         return client.New(cr, strfmt.Default)
215 }
216
217 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
218         pa := &models.PostableAlert{
219                 Alert: models.Alert{
220                         GeneratorURL: strfmt.URI(""),
221                         Labels:       amLabels,
222                 },
223                 Annotations: amAnnotations,
224         }
225         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
226
227         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
228         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
229         if err != nil {
230                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
231         }
232         return ok, err
233 }
234
235 func (a *AlarmManager) StatusCB() bool {
236         if !a.rmrReady {
237                 app.Logger.Info("RMR not ready yet!")
238         }
239
240         return a.rmrReady
241 }
242
243 func (a *AlarmManager) ConfigChangeCB(configparam string) {
244
245         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
246         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
247         a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
248         a.amHost = viper.GetString("controls.promAlertManager.address")
249
250         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
251         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
252         app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
253         app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
254
255         return
256 }
257
258 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
259
260         filename := os.Getenv("DEF_FILE")
261         file, err := ioutil.ReadFile(filename)
262         if err == nil {
263                 data := RicAlarmDefinitions{}
264                 err = json.Unmarshal([]byte(file), &data)
265                 if err == nil {
266                         for _, alarmDefinition := range data.AlarmDefinitions {
267                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
268                                 if exists {
269                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
270                                 } else {
271                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
272                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
273                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
274                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
275                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
276                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
277                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
278                                 }
279                         }
280                 } else {
281                         app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err)
282                 }
283         } else {
284                 app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err)
285         }
286 }
287
288 func (a *AlarmManager) Run(sdlcheck bool) {
289         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
290         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
291         app.Resource.InjectStatusCb(a.StatusCB)
292         app.AddConfigChangeListener(a.ConfigChangeCB)
293
294         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
295         a.ReadAlarmDefinitionFromJson()
296
297         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
298         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
299         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
300         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
301         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
302         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
303         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
304         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
305         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
306         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
307
308         // Start background timer for re-raising alerts
309         a.postClear = sdlcheck
310         go a.StartAlertTimer()
311         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
312
313         app.RunWithParams(a, sdlcheck)
314 }
315
316 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
317         if alertInterval == 0 {
318                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
319         }
320
321         if amHost == "" {
322                 amHost = viper.GetString("controls.promAlertManager.address")
323         }
324
325         return &AlarmManager{
326                 rmrReady:               false,
327                 amHost:                 amHost,
328                 amBaseUrl:              viper.GetString("controls.promAlertManager.baseUrl"),
329                 amSchemes:              []string{viper.GetString("controls.promAlertManager.schemes")},
330                 alertInterval:          alertInterval,
331                 activeAlarms:           make([]AlarmInformation, 0),
332                 alarmHistory:           make([]AlarmInformation, 0),
333                 uniqueAlarmId:          1,
334                 maxActiveAlarms:        app.Config.GetInt("controls.maxActiveAlarms"),
335                 maxAlarmHistory:        app.Config.GetInt("controls.maxAlarmHistory"),
336                 exceededActiveAlarmOn:  false,
337                 exceededAlarmHistoryOn: false,
338         }
339 }
340
341 // Main function
342 func main() {
343         NewAlarmManager("", 0).Run(true)
344 }