Merge "LN0739_FM_FR13: support interface to external Prometheus alert manager"
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "time"
27         "os"
28         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
29         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
30         clientruntime "github.com/go-openapi/runtime/client"
31         "github.com/go-openapi/strfmt"
32         "github.com/prometheus/alertmanager/api/v2/client"
33         "github.com/prometheus/alertmanager/api/v2/client/alert"
34         "github.com/prometheus/alertmanager/api/v2/models"
35         "github.com/spf13/viper"
36         "io/ioutil"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&m)
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
78         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
79                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
80                 return nil, nil
81         }
82
83         // Suppress duplicate alarms
84         idx, found := a.IsMatchFound(m.Alarm)
85         if found && m.AlarmAction == alarm.AlarmActionRaise {
86                 app.Logger.Info("Duplicate alarm found, suppressing ...")
87                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
88                         // Duplicate with same severity found
89                         return nil, nil
90                 } else {
91                         // Remove duplicate with different severity
92                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
93                 }
94         }
95
96         // Clear alarm if found from active alarm list
97         if m.AlarmAction == alarm.AlarmActionClear {
98                 if found {
99                         a.alarmHistory = append(a.alarmHistory, *m)
100                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
101
102                         if a.postClear {
103                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
104                         }
105                 }
106                 app.Logger.Info("No matching active alarm found, suppressing ...")
107                 return nil, nil
108         }
109
110         // New alarm -> update active alarms and post to Alert Manager
111         if m.AlarmAction == alarm.AlarmActionRaise {
112                 a.UpdateAlarmLists(m)
113                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
114         }
115
116         return nil, nil
117 }
118
119 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
120         for i, m := range a.activeAlarms {
121                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
122                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
123                         return i, true
124                 }
125         }
126         return -1, false
127 }
128
129 func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
130         a.mutex.Lock()
131         defer a.mutex.Unlock()
132
133         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
134         copy(alarms[i:], alarms[i+1:])
135         return alarms[:len(alarms)-1]
136 }
137
138 func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
139         a.mutex.Lock()
140         defer a.mutex.Unlock()
141
142         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
143            The attempt to raise the alarm next time will be supressed when found as duplicate. */
144         if len(a.activeAlarms) >= a.maxActiveAlarms {
145                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
146                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full")
147                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
148                 a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
149                 a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
150         }
151
152         if len(a.alarmHistory) >= a.maxAlarmHistory {
153                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
154                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full")
155                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
156                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
157                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
158         }
159
160         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
161         a.activeAlarms = append(a.activeAlarms, *newAlarm)
162         a.alarmHistory = append(a.alarmHistory, *newAlarm)
163 }
164
165 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
166         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
167         amLabels := models.LabelSet{
168                 "status":      string(status),
169                 "alertname":   alarmDef.AlarmText,
170                 "severity":    string(newAlarm.PerceivedSeverity),
171                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
172                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
173         }
174         amAnnotations := models.LabelSet{
175                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
176                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
177                 "additional_info": newAlarm.AdditionalInfo,
178                 "summary":         alarmDef.EventType,
179                 "instructions":    alarmDef.OperationInstructions,
180                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
181         }
182
183         return amLabels, amAnnotations
184 }
185
186 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
187         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
188         return client.New(cr, strfmt.Default)
189 }
190
191 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
192         pa := &models.PostableAlert{
193                 Alert: models.Alert{
194                         GeneratorURL: strfmt.URI(""),
195                         Labels:       amLabels,
196                 },
197                 Annotations: amAnnotations,
198         }
199         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
200
201         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
202         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
203         if err != nil {
204                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
205         }
206         return ok, err
207 }
208
209 func (a *AlarmManager) StatusCB() bool {
210         if !a.rmrReady {
211                 app.Logger.Info("RMR not ready yet!")
212         }
213
214         return a.rmrReady
215 }
216
217 func (a *AlarmManager) ConfigChangeCB(configparam string) {
218
219         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
220         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
221         a.alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
222         a.amHost = viper.GetString("controls.promAlertManager.address")
223
224         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
225         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
226         app.Logger.Debug("ConfigChangeCB: alertInterval %v", a.alertInterval)
227         app.Logger.Debug("ConfigChangeCB: amHost = %v", a.amHost)
228
229         return
230 }
231
232 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
233
234         filename := os.Getenv("DEF_FILE")
235         file, err := ioutil.ReadFile(filename)
236         if err == nil {
237                 data := RicAlarmDefinitions{}
238                 err = json.Unmarshal([]byte(file), &data)
239                 if err == nil {
240                         for _, alarmDefinition := range data.AlarmDefinitions {
241                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
242                                 if exists {
243                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
244                                 } else {
245                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
246                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
247                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
248                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
249                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
250                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
251                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
252                                 }
253                         }
254                 } else {
255                         app.Logger.Error("json.Unmarshal failed with error %v", err)
256                 }
257         } else {
258                 app.Logger.Error("ioutil.ReadFile failed with error %v", err)
259         }
260 }
261
262 func (a *AlarmManager) Run(sdlcheck bool) {
263         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
264         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
265         app.Resource.InjectStatusCb(a.StatusCB)
266         app.AddConfigChangeListener(a.ConfigChangeCB)
267
268         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
269         a.ReadAlarmDefinitionFromJson()
270
271         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
272         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
273         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
274         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
275         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
276         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
277         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
278         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
279         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
280         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
281
282         // Start background timer for re-raising alerts
283         a.postClear = sdlcheck
284         go a.StartAlertTimer()
285         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
286
287         app.RunWithParams(a, sdlcheck)
288 }
289
290 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
291         if alertInterval == 0 {
292                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
293         }
294
295         if amHost == "" {
296                 amHost = viper.GetString("controls.promAlertManager.address")
297         }
298
299         return &AlarmManager{
300                 rmrReady:        false,
301                 amHost:          amHost,
302                 amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
303                 amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
304                 alertInterval:   alertInterval,
305                 activeAlarms:    make([]alarm.AlarmMessage, 0),
306                 alarmHistory:    make([]alarm.AlarmMessage, 0),
307                 maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
308                 maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
309         }
310 }
311
312 // Main function
313 func main() {
314         NewAlarmManager("", 0).Run(true)
315 }