1a950db297f2c6cfb8e0882436a21e6b8e634935
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "bytes"
25         "encoding/json"
26         "fmt"
27         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
28         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
29         clientruntime "github.com/go-openapi/runtime/client"
30         "github.com/go-openapi/strfmt"
31         "github.com/prometheus/alertmanager/api/v2/client"
32         "github.com/prometheus/alertmanager/api/v2/client/alert"
33         "github.com/prometheus/alertmanager/api/v2/models"
34         "github.com/spf13/viper"
35         "io/ioutil"
36         "net/http"
37         "os"
38         "time"
39 )
40
41 func (a *AlarmManager) StartAlertTimer() {
42         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
43         for range tick {
44                 a.mutex.Lock()
45                 for _, m := range a.activeAlarms {
46                         app.Logger.Info("Re-raising alarm: %v", m)
47                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
48                 }
49                 a.mutex.Unlock()
50         }
51 }
52
53 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
54         app.Logger.Info("Message received!")
55
56         defer app.Rmr.Free(rp.Mbuf)
57         switch rp.Mtype {
58         case alarm.RIC_ALARM_UPDATE:
59                 a.HandleAlarms(rp)
60         default:
61                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
62         }
63
64         return nil
65 }
66
67 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
68         var m alarm.AlarmMessage
69         app.Logger.Info("Received JSON: %s", rp.Payload)
70         if err := json.Unmarshal(rp.Payload, &m); err != nil {
71                 app.Logger.Error("json.Unmarshal failed: %v", err)
72                 return nil, err
73         }
74         app.Logger.Info("newAlarm: %v", m)
75
76         return a.ProcessAlarm(&AlarmNotification{m, alarm.AlarmDefinition{}})
77 }
78
79 func (a *AlarmManager) ProcessAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
80         a.mutex.Lock()
81         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
82                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
83                 a.mutex.Unlock()
84                 return nil, nil
85         }
86
87         // Suppress duplicate alarms
88         idx, found := a.IsMatchFound(m.Alarm)
89         if found && m.AlarmAction == alarm.AlarmActionRaise {
90                 app.Logger.Info("Duplicate alarm found, suppressing ...")
91                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
92                         // Duplicate with same severity found
93                         a.mutex.Unlock()
94                         return nil, nil
95                 } else {
96                         // Remove duplicate with different severity
97                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
98                 }
99         }
100
101         // Clear alarm if found from active alarm list
102         if m.AlarmAction == alarm.AlarmActionClear {
103                 if found {
104                         a.UpdateAlarmFields(a.activeAlarms[idx].AlarmId, m)
105                         a.alarmHistory = append(a.alarmHistory, *m)
106                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
107                         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
108                                 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
109                                 a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
110                         }
111
112                         if a.exceededActiveAlarmOn && m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD {
113                                 a.exceededActiveAlarmOn = false
114                         }
115
116                         if a.exceededAlarmHistoryOn && m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD {
117                                 a.exceededAlarmHistoryOn = false
118                         }
119
120                         if a.postClear {
121                                 a.mutex.Unlock()
122
123                                 // Send alarm notification to NOMA, if enabled
124                                 if app.Config.GetBool("controls.noma.enabled") {
125                                         m.PerceivedSeverity = alarm.SeverityCleared
126                                         return a.PostAlarm(m)
127                                 }
128                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
129                         }
130                 }
131                 app.Logger.Info("No matching active alarm found, suppressing ...")
132                 a.mutex.Unlock()
133                 return nil, nil
134         }
135
136         // New alarm -> update active alarms and post to Alert Manager
137         if m.AlarmAction == alarm.AlarmActionRaise {
138                 a.UpdateAlarmFields(a.GenerateAlarmId(), m)
139                 a.UpdateAlarmLists(m)
140                 a.mutex.Unlock()
141
142                 // Send alarm notification to NOMA, if enabled
143                 if app.Config.GetBool("controls.noma.enabled") {
144                         return a.PostAlarm(m)
145                 }
146                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
147         }
148
149         a.mutex.Unlock()
150         return nil, nil
151 }
152
153 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
154         for i, m := range a.activeAlarms {
155                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
156                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
157                         return i, true
158                 }
159         }
160         return -1, false
161 }
162
163 func (a *AlarmManager) RemoveAlarm(alarms []AlarmNotification, i int, listName string) []AlarmNotification {
164         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
165         copy(alarms[i:], alarms[i+1:])
166         return alarms[:len(alarms)-1]
167 }
168
169 func (a *AlarmManager) GenerateAlarmId() int {
170         a.uniqueAlarmId++ // @todo: generate a unique ID
171         return a.uniqueAlarmId
172 }
173
174 func (a *AlarmManager) UpdateAlarmFields(alarmId int, newAlarm *AlarmNotification) {
175         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
176         newAlarm.AlarmId = alarmId
177         newAlarm.AlarmText = alarmDef.AlarmText
178         newAlarm.EventType = alarmDef.EventType
179 }
180
181 func (a *AlarmManager) GenerateThresholdAlarm(sp int, data string) bool {
182         thresholdAlarm := a.alarmClient.NewAlarm(sp, alarm.SeverityWarning, "threshold", data)
183         thresholdMessage := alarm.AlarmMessage{
184                 Alarm:       thresholdAlarm,
185                 AlarmAction: alarm.AlarmActionRaise,
186                 AlarmTime:   (time.Now().UnixNano()),
187         }
188         a.activeAlarms = append(a.activeAlarms, AlarmNotification{thresholdMessage, alarm.AlarmDefinition{}})
189         a.alarmHistory = append(a.alarmHistory, AlarmNotification{thresholdMessage, alarm.AlarmDefinition{}})
190
191         return true
192 }
193
194 func (a *AlarmManager) UpdateAlarmLists(newAlarm *AlarmNotification) {
195         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
196            The attempt to raise the alarm next time will be supressed when found as duplicate. */
197         if (len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false) {
198                 app.Logger.Warn("active alarm count exceeded maxActiveAlarms threshold")
199                 a.exceededActiveAlarmOn = a.GenerateThresholdAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, "active")
200         }
201
202         if (len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false) {
203                 app.Logger.Warn("alarm history count exceeded maxAlarmHistory threshold")
204                 a.exceededAlarmHistoryOn = a.GenerateThresholdAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, "history")
205         }
206
207         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
208         a.activeAlarms = append(a.activeAlarms, *newAlarm)
209         a.alarmHistory = append(a.alarmHistory, *newAlarm)
210 }
211
212 func (a *AlarmManager) PostAlarm(m *AlarmNotification) (*alert.PostAlertsOK, error) {
213         result, err := json.Marshal(m)
214         if err != nil {
215                 app.Logger.Info("json.Marshal failed: %v", err)
216                 return nil, err
217         }
218
219         fullUrl := fmt.Sprintf("%s/%s", app.Config.GetString("controls.noma.host"), app.Config.GetString("controls.noma.alarmUrl"))
220         app.Logger.Info("Posting alarm to '%s'", fullUrl)
221
222         resp, err := http.Post(fullUrl, "application/json", bytes.NewReader(result))
223         if err != nil || resp == nil {
224                 app.Logger.Info("Unable to post alarm to '%s': %v", fullUrl, err)
225         }
226
227         return nil, err
228 }
229
230 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
231         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
232         amLabels := models.LabelSet{
233                 "status":      string(status),
234                 "alertname":   alarmDef.AlarmText,
235                 "severity":    string(newAlarm.PerceivedSeverity),
236                 "service":     fmt.Sprintf("%s/%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
237                 "system_name": "RIC",
238         }
239         amAnnotations := models.LabelSet{
240                 "alarm_id":         fmt.Sprintf("%d", alarmDef.AlarmId),
241                 "specific_problem": fmt.Sprintf("%d", newAlarm.SpecificProblem),
242                 "event_type":       alarmDef.EventType,
243                 "identifying_info": newAlarm.IdentifyingInfo,
244                 "additional_info":  newAlarm.AdditionalInfo,
245                 "description":      fmt.Sprintf("%s:%s", newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
246                 "instructions":     alarmDef.OperationInstructions,
247                 "timestamp":        fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
248         }
249
250         return amLabels, amAnnotations
251 }
252
253 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
254         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
255         return client.New(cr, strfmt.Default)
256 }
257
258 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
259         pa := &models.PostableAlert{
260                 Alert: models.Alert{
261                         GeneratorURL: strfmt.URI(""),
262                         Labels:       amLabels,
263                 },
264                 Annotations: amAnnotations,
265         }
266         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
267
268         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
269         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
270         if err != nil {
271                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
272         }
273         return ok, err
274 }
275
276 func (a *AlarmManager) StatusCB() bool {
277         if !a.rmrReady {
278                 app.Logger.Info("RMR not ready yet!")
279         }
280
281         return a.rmrReady
282 }
283
284 func (a *AlarmManager) ConfigChangeCB(configparam string) {
285
286         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
287         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
288         a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
289         a.amHost = viper.GetString("controls.promAlertManager.address")
290
291         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
292         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
293         app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
294         app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
295
296         return
297 }
298
299 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
300
301         filename := os.Getenv("DEF_FILE")
302         file, err := ioutil.ReadFile(filename)
303         if err == nil {
304                 data := RicAlarmDefinitions{}
305                 err = json.Unmarshal([]byte(file), &data)
306                 if err == nil {
307                         for _, alarmDefinition := range data.AlarmDefinitions {
308                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
309                                 if exists {
310                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
311                                 } else {
312                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
313                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
314                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
315                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
316                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
317                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
318                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
319                                 }
320                         }
321                 } else {
322                         app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err)
323                 }
324         } else {
325                 app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err)
326         }
327 }
328
329 func (a *AlarmManager) Run(sdlcheck bool) {
330         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
331         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
332         app.Resource.InjectStatusCb(a.StatusCB)
333         app.AddConfigChangeListener(a.ConfigChangeCB)
334
335         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
336         a.ReadAlarmDefinitionFromJson()
337
338         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
339         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
340         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
341         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
342         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
343         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
344         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
345         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
346         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
347         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
348
349         // Start background timer for re-raising alerts
350         go a.StartAlertTimer()
351         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
352
353         app.RunWithParams(a, sdlcheck)
354 }
355
356 func NewAlarmManager(amHost string, alertInterval int, clearAlarm bool) *AlarmManager {
357         if alertInterval == 0 {
358                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
359         }
360
361         if amHost == "" {
362                 amHost = viper.GetString("controls.promAlertManager.address")
363         }
364
365         return &AlarmManager{
366                 rmrReady:               false,
367                 postClear:              clearAlarm,
368                 amHost:                 amHost,
369                 amBaseUrl:              app.Config.GetString("controls.promAlertManager.baseUrl"),
370                 amSchemes:              []string{app.Config.GetString("controls.promAlertManager.schemes")},
371                 alertInterval:          alertInterval,
372                 activeAlarms:           make([]AlarmNotification, 0),
373                 alarmHistory:           make([]AlarmNotification, 0),
374                 uniqueAlarmId:          0,
375                 maxActiveAlarms:        app.Config.GetInt("controls.maxActiveAlarms"),
376                 maxAlarmHistory:        app.Config.GetInt("controls.maxAlarmHistory"),
377                 exceededActiveAlarmOn:  false,
378                 exceededAlarmHistoryOn: false,
379         }
380 }
381
382 // Main function
383 func main() {
384         NewAlarmManager("", 0, true).Run(true)
385 }