LN0739_FM_FR5: support functionality to escalate alarm by changing severity
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "time"
27
28         clientruntime "github.com/go-openapi/runtime/client"
29         "github.com/go-openapi/strfmt"
30         "github.com/prometheus/alertmanager/api/v2/client"
31         "github.com/prometheus/alertmanager/api/v2/client/alert"
32         "github.com/prometheus/alertmanager/api/v2/models"
33         "github.com/spf13/viper"
34
35         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
36         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&m)
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
78         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
79                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
80                 return nil, nil
81         }
82
83         // Suppress duplicate alarms
84         idx, found := a.IsMatchFound(m.Alarm)
85         if found && m.AlarmAction == alarm.AlarmActionRaise  {
86                 app.Logger.Info("Duplicate alarm found, suppressing ...")
87                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
88                         // Duplicate with same severity found
89                         return nil, nil
90                 } else {
91                         // Remove duplicate with different severity
92                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
93                 }
94         }
95
96
97         // Clear alarm if found from active alarm list
98         if m.AlarmAction == alarm.AlarmActionClear {
99                 if found {
100                         a.alarmHistory = append(a.alarmHistory, *m)
101                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
102
103                         if a.postClear {
104                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
105                         }
106                 }
107                 app.Logger.Info("No matching active alarm found, suppressing ...")
108                 return nil, nil
109         }
110
111         // New alarm -> update active alarms and post to Alert Manager
112         if m.AlarmAction == alarm.AlarmActionRaise {
113                 a.UpdateAlarmLists(m)
114                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
115         }
116
117         return nil, nil
118 }
119
120 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
121         for i, m := range a.activeAlarms {
122                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
123                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
124                         return i, true
125                 }
126         }
127         return -1, false
128 }
129
130 func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
131         a.mutex.Lock()
132         defer a.mutex.Unlock()
133
134         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
135         copy(alarms[i:], alarms[i+1:])
136         return alarms[:len(alarms)-1]
137 }
138
139 func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
140         a.mutex.Lock()
141         defer a.mutex.Unlock()
142
143         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
144            The attempt to raise the alarm next time will be supressed when found as duplicate. */
145         if len(a.activeAlarms) >= a.maxActiveAlarms {
146                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
147                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full")
148                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
149                 a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
150                 a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
151         }
152
153         if len(a.alarmHistory) >= a.maxAlarmHistory {
154                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
155                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full")
156                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
157                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
158                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
159         }
160
161         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
162         a.activeAlarms = append(a.activeAlarms, *newAlarm)
163         a.alarmHistory = append(a.alarmHistory, *newAlarm)
164 }
165
166 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
167         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
168         amLabels := models.LabelSet{
169                 "status":      string(status),
170                 "alertname":   alarmDef.AlarmText,
171                 "severity":    string(newAlarm.PerceivedSeverity),
172                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
173                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
174         }
175         amAnnotations := models.LabelSet{
176                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
177                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
178                 "additional_info": newAlarm.AdditionalInfo,
179                 "summary":         alarmDef.EventType,
180                 "instructions":    alarmDef.OperationInstructions,
181                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
182         }
183
184         return amLabels, amAnnotations
185 }
186
187 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
188         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
189         return client.New(cr, strfmt.Default)
190 }
191
192 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
193         pa := &models.PostableAlert{
194                 Alert: models.Alert{
195                         GeneratorURL: strfmt.URI(""),
196                         Labels:       amLabels,
197                 },
198                 Annotations: amAnnotations,
199         }
200         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
201
202         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
203         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
204         if err != nil {
205                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
206         }
207         return ok, err
208 }
209
210 func (a *AlarmManager) StatusCB() bool {
211         if !a.rmrReady {
212                 app.Logger.Info("RMR not ready yet!")
213         }
214
215         return a.rmrReady
216 }
217
218 func (a *AlarmManager) ConfigChangeCB(configparam string) {
219
220         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
221         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
222
223         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
224         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
225
226         return
227 }
228
229 func (a *AlarmManager) Run(sdlcheck bool) {
230         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
231         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
232         app.Resource.InjectStatusCb(a.StatusCB)
233         app.AddConfigChangeListener(a.ConfigChangeCB)
234
235         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
236         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
237         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
238         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
239         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
240         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
241
242         // Start background timer for re-raising alerts
243         a.postClear = sdlcheck
244         go a.StartAlertTimer()
245         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
246
247         app.RunWithParams(a, sdlcheck)
248 }
249
250 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
251         if alertInterval == 0 {
252                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
253         }
254
255         if amHost == "" {
256                 amHost = viper.GetString("controls.promAlertManager.address")
257         }
258
259         return &AlarmManager{
260                 rmrReady:        false,
261                 amHost:          amHost,
262                 amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
263                 amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
264                 alertInterval:   alertInterval,
265                 activeAlarms:    make([]alarm.AlarmMessage, 0),
266                 alarmHistory:    make([]alarm.AlarmMessage, 0),
267                 maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
268                 maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
269         }
270 }
271
272 // Main function
273 func main() {
274         NewAlarmManager("", 0).Run(true)
275 }