f00a812c534c063e08b9431daa07962370da567d
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
27         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
28         clientruntime "github.com/go-openapi/runtime/client"
29         "github.com/go-openapi/strfmt"
30         "github.com/prometheus/alertmanager/api/v2/client"
31         "github.com/prometheus/alertmanager/api/v2/client/alert"
32         "github.com/prometheus/alertmanager/api/v2/models"
33         "github.com/spf13/viper"
34         "io/ioutil"
35         "os"
36         "time"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&m)
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
78         a.mutex.Lock()
79         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
80                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
81                 a.mutex.Unlock()
82                 return nil, nil
83         }
84
85         // Suppress duplicate alarms
86         idx, found := a.IsMatchFound(m.Alarm)
87         if found && m.AlarmAction == alarm.AlarmActionRaise {
88                 app.Logger.Info("Duplicate alarm found, suppressing ...")
89                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
90                         // Duplicate with same severity found
91                         a.mutex.Unlock()
92                         return nil, nil
93                 } else {
94                         // Remove duplicate with different severity
95                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
96                 }
97         }
98
99         // Clear alarm if found from active alarm list
100         if m.AlarmAction == alarm.AlarmActionClear {
101                 if found {
102                         a.alarmHistory = append(a.alarmHistory, *m)
103                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
104                         if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)){
105                                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
106                                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
107                                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
108                                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
109                                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
110                         }
111                         if ((a.exceededActiveAlarmOn == true) && (m.Alarm.SpecificProblem == alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD)) {
112                                 a.exceededActiveAlarmOn = false
113                         }
114                         if ((a.exceededAlarmHistoryOn == true) && (m.Alarm.SpecificProblem == alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD)) {
115                                 a.exceededAlarmHistoryOn = false
116                         }
117                         if a.postClear {
118                                 a.mutex.Unlock()
119                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
120                         }
121                 }
122                 app.Logger.Info("No matching active alarm found, suppressing ...")
123                 a.mutex.Unlock()
124                 return nil, nil
125         }
126
127         // New alarm -> update active alarms and post to Alert Manager
128         if m.AlarmAction == alarm.AlarmActionRaise {
129                 a.UpdateAlarmLists(m)
130                 a.mutex.Unlock()
131                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
132         }
133
134         a.mutex.Unlock()
135         return nil, nil
136 }
137
138 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
139         for i, m := range a.activeAlarms {
140                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
141                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
142                         return i, true
143                 }
144         }
145         return -1, false
146 }
147
148 func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
149         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
150         copy(alarms[i:], alarms[i+1:])
151         return alarms[:len(alarms)-1]
152 }
153
154 func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
155         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
156            The attempt to raise the alarm next time will be supressed when found as duplicate. */
157         if ((len(a.activeAlarms) >= a.maxActiveAlarms) && (a.exceededActiveAlarmOn == false)) {
158                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
159                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "active")
160                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
161                 a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
162                 a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
163                 a.exceededActiveAlarmOn = true
164         }
165
166         if ((len(a.alarmHistory) >= a.maxAlarmHistory) && (a.exceededAlarmHistoryOn == false)) {
167                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
168                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "threshold", "history")
169                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
170                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
171                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
172                 a.exceededAlarmHistoryOn = true
173         }
174
175         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
176         a.activeAlarms = append(a.activeAlarms, *newAlarm)
177         a.alarmHistory = append(a.alarmHistory, *newAlarm)
178 }
179
180 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
181         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
182         amLabels := models.LabelSet{
183                 "status":      string(status),
184                 "alertname":   alarmDef.AlarmText,
185                 "severity":    string(newAlarm.PerceivedSeverity),
186                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
187                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
188         }
189         amAnnotations := models.LabelSet{
190                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
191                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
192                 "additional_info": newAlarm.AdditionalInfo,
193                 "summary":         alarmDef.EventType,
194                 "instructions":    alarmDef.OperationInstructions,
195                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
196         }
197
198         return amLabels, amAnnotations
199 }
200
201 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
202         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
203         return client.New(cr, strfmt.Default)
204 }
205
206 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
207         pa := &models.PostableAlert{
208                 Alert: models.Alert{
209                         GeneratorURL: strfmt.URI(""),
210                         Labels:       amLabels,
211                 },
212                 Annotations: amAnnotations,
213         }
214         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
215
216         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
217         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
218         if err != nil {
219                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
220         }
221         return ok, err
222 }
223
224 func (a *AlarmManager) StatusCB() bool {
225         if !a.rmrReady {
226                 app.Logger.Info("RMR not ready yet!")
227         }
228
229         return a.rmrReady
230 }
231
232 func (a *AlarmManager) ConfigChangeCB(configparam string) {
233
234         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
235         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
236         a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
237         a.amHost = viper.GetString("controls.promAlertManager.address")
238
239         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
240         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
241         app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
242         app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
243
244         return
245 }
246
247 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
248
249         filename := os.Getenv("DEF_FILE")
250         file, err := ioutil.ReadFile(filename)
251         if err == nil {
252                 data := RicAlarmDefinitions{}
253                 err = json.Unmarshal([]byte(file), &data)
254                 if err == nil {
255                         for _, alarmDefinition := range data.AlarmDefinitions {
256                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
257                                 if exists {
258                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
259                                 } else {
260                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
261                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
262                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
263                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
264                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
265                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
266                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
267                                 }
268                         }
269                 } else {
270                         app.Logger.Error("ReadAlarmDefinitionFromJson: json.Unmarshal failed with error %v", err)
271                 }
272         } else {
273                 app.Logger.Error("ReadAlarmDefinitionFromJson: ioutil.ReadFile failed with error %v", err)
274         }
275 }
276
277 func (a *AlarmManager) Run(sdlcheck bool) {
278         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
279         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
280         app.Resource.InjectStatusCb(a.StatusCB)
281         app.AddConfigChangeListener(a.ConfigChangeCB)
282
283         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
284         a.ReadAlarmDefinitionFromJson()
285
286         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
287         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
288         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
289         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
290         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
291         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
292         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
293         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
294         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
295         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
296
297         // Start background timer for re-raising alerts
298         a.postClear = sdlcheck
299         go a.StartAlertTimer()
300         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
301
302         app.RunWithParams(a, sdlcheck)
303 }
304
305 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
306         if alertInterval == 0 {
307                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
308         }
309
310         if amHost == "" {
311                 amHost = viper.GetString("controls.promAlertManager.address")
312         }
313
314         return &AlarmManager{
315                 rmrReady:        false,
316                 amHost:          amHost,
317                 amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
318                 amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
319                 alertInterval:   alertInterval,
320                 activeAlarms:    make([]alarm.AlarmMessage, 0),
321                 alarmHistory:    make([]alarm.AlarmMessage, 0),
322                 maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
323                 maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
324                 exceededActiveAlarmOn:  false,
325                 exceededAlarmHistoryOn: false,
326         }
327 }
328
329 // Main function
330 func main() {
331         NewAlarmManager("", 0).Run(true)
332 }