Merge "LN0739_FM_FR12: support for options to dynamically create the AlarmDefinitions...
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "time"
27
28         clientruntime "github.com/go-openapi/runtime/client"
29         "github.com/go-openapi/strfmt"
30         "github.com/prometheus/alertmanager/api/v2/client"
31         "github.com/prometheus/alertmanager/api/v2/client/alert"
32         "github.com/prometheus/alertmanager/api/v2/models"
33         "github.com/spf13/viper"
34
35         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
36         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&m)
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
78         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
79                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
80                 return nil, nil
81         }
82
83         // Suppress duplicate alarms
84         idx, found := a.IsMatchFound(m.Alarm)
85         if found && m.AlarmAction == alarm.AlarmActionRaise  {
86                 app.Logger.Info("Duplicate alarm found, suppressing ...")
87                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
88                         // Duplicate with same severity found
89                         return nil, nil
90                 } else {
91                         // Remove duplicate with different severity
92                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
93                 }
94         }
95
96
97         // Clear alarm if found from active alarm list
98         if m.AlarmAction == alarm.AlarmActionClear {
99                 if found {
100                         a.alarmHistory = append(a.alarmHistory, *m)
101                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
102
103                         if a.postClear {
104                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
105                         }
106                 }
107                 app.Logger.Info("No matching active alarm found, suppressing ...")
108                 return nil, nil
109         }
110
111         // New alarm -> update active alarms and post to Alert Manager
112         if m.AlarmAction == alarm.AlarmActionRaise {
113                 a.UpdateAlarmLists(m)
114                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
115         }
116
117         return nil, nil
118 }
119
120 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
121         for i, m := range a.activeAlarms {
122                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
123                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
124                         return i, true
125                 }
126         }
127         return -1, false
128 }
129
130 func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
131         a.mutex.Lock()
132         defer a.mutex.Unlock()
133
134         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
135         copy(alarms[i:], alarms[i+1:])
136         return alarms[:len(alarms)-1]
137 }
138
139 func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
140         a.mutex.Lock()
141         defer a.mutex.Unlock()
142
143         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
144            The attempt to raise the alarm next time will be supressed when found as duplicate. */
145         if len(a.activeAlarms) >= a.maxActiveAlarms {
146                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
147                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full")
148                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
149                 a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
150                 a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
151         }
152
153         if len(a.alarmHistory) >= a.maxAlarmHistory {
154                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
155                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full")
156                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
157                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
158                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
159         }
160
161         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
162         a.activeAlarms = append(a.activeAlarms, *newAlarm)
163         a.alarmHistory = append(a.alarmHistory, *newAlarm)
164 }
165
166 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
167         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
168         amLabels := models.LabelSet{
169                 "status":      string(status),
170                 "alertname":   alarmDef.AlarmText,
171                 "severity":    string(newAlarm.PerceivedSeverity),
172                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
173                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
174         }
175         amAnnotations := models.LabelSet{
176                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
177                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
178                 "additional_info": newAlarm.AdditionalInfo,
179                 "summary":         alarmDef.EventType,
180                 "instructions":    alarmDef.OperationInstructions,
181                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
182         }
183
184         return amLabels, amAnnotations
185 }
186
187 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
188         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
189         return client.New(cr, strfmt.Default)
190 }
191
192 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
193         pa := &models.PostableAlert{
194                 Alert: models.Alert{
195                         GeneratorURL: strfmt.URI(""),
196                         Labels:       amLabels,
197                 },
198                 Annotations: amAnnotations,
199         }
200         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
201
202         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
203         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
204         if err != nil {
205                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
206         }
207         return ok, err
208 }
209
210 func (a *AlarmManager) StatusCB() bool {
211         if !a.rmrReady {
212                 app.Logger.Info("RMR not ready yet!")
213         }
214
215         return a.rmrReady
216 }
217
218 func (a *AlarmManager) ConfigChangeCB(configparam string) {
219
220         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
221         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
222
223         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
224         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
225
226         return
227 }
228
229 func (a *AlarmManager) Run(sdlcheck bool) {
230         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
231         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
232         app.Resource.InjectStatusCb(a.StatusCB)
233         app.AddConfigChangeListener(a.ConfigChangeCB)
234
235         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
236
237         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
238         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
239         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
240         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
241         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
242         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
243         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
244         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
245         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
246
247         // Start background timer for re-raising alerts
248         a.postClear = sdlcheck
249         go a.StartAlertTimer()
250         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
251
252         app.RunWithParams(a, sdlcheck)
253 }
254
255 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
256         if alertInterval == 0 {
257                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
258         }
259
260         if amHost == "" {
261                 amHost = viper.GetString("controls.promAlertManager.address")
262         }
263
264         return &AlarmManager{
265                 rmrReady:        false,
266                 amHost:          amHost,
267                 amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
268                 amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
269                 alertInterval:   alertInterval,
270                 activeAlarms:    make([]alarm.AlarmMessage, 0),
271                 alarmHistory:    make([]alarm.AlarmMessage, 0),
272                 maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
273                 maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
274         }
275 }
276
277 // Main function
278 func main() {
279         NewAlarmManager("", 0).Run(true)
280 }