Merge "alarm manager to manage predefined platform alarm definitions in a json file...
[ric-plt/alarm-go.git] / manager / cmd / manager.go
1 /*
2  *  Copyright (c) 2020 AT&T Intellectual Property.
3  *  Copyright (c) 2020 Nokia.
4  *
5  *  Licensed under the Apache License, Version 2.0 (the "License");
6  *  you may not use this file except in compliance with the License.
7  *  You may obtain a copy of the License at
8  *
9  *     http://www.apache.org/licenses/LICENSE-2.0
10  *
11  *  Unless required by applicable law or agreed to in writing, software
12  *  distributed under the License is distributed on an "AS IS" BASIS,
13  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  *  See the License for the specific language governing permissions and
15  *  limitations under the License.
16  *
17  * This source code is part of the near-RT RIC (RAN Intelligent Controller)
18  * platform project (RICP).
19  */
20
21 package main
22
23 import (
24         "encoding/json"
25         "fmt"
26         "time"
27         "os"
28         "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm"
29         app "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp"
30         clientruntime "github.com/go-openapi/runtime/client"
31         "github.com/go-openapi/strfmt"
32         "github.com/prometheus/alertmanager/api/v2/client"
33         "github.com/prometheus/alertmanager/api/v2/client/alert"
34         "github.com/prometheus/alertmanager/api/v2/models"
35         "github.com/spf13/viper"
36         "io/ioutil"
37 )
38
39 func (a *AlarmManager) StartAlertTimer() {
40         tick := time.Tick(time.Duration(a.alertInterval) * time.Millisecond)
41         for range tick {
42                 a.mutex.Lock()
43                 for _, m := range a.activeAlarms {
44                         app.Logger.Info("Re-raising alarm: %v", m)
45                         a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
46                 }
47                 a.mutex.Unlock()
48         }
49 }
50
51 func (a *AlarmManager) Consume(rp *app.RMRParams) (err error) {
52         app.Logger.Info("Message received!")
53
54         defer app.Rmr.Free(rp.Mbuf)
55         switch rp.Mtype {
56         case alarm.RIC_ALARM_UPDATE:
57                 a.HandleAlarms(rp)
58         default:
59                 app.Logger.Info("Unknown Message Type '%d', discarding", rp.Mtype)
60         }
61
62         return nil
63 }
64
65 func (a *AlarmManager) HandleAlarms(rp *app.RMRParams) (*alert.PostAlertsOK, error) {
66         var m alarm.AlarmMessage
67         app.Logger.Info("Received JSON: %s", rp.Payload)
68         if err := json.Unmarshal(rp.Payload, &m); err != nil {
69                 app.Logger.Error("json.Unmarshal failed: %v", err)
70                 return nil, err
71         }
72         app.Logger.Info("newAlarm: %v", m)
73
74         return a.ProcessAlarm(&m)
75 }
76
77 func (a *AlarmManager) ProcessAlarm(m *alarm.AlarmMessage) (*alert.PostAlertsOK, error) {
78         if _, ok := alarm.RICAlarmDefinitions[m.Alarm.SpecificProblem]; !ok {
79                 app.Logger.Warn("Alarm (SP='%d') not recognized, suppressing ...", m.Alarm.SpecificProblem)
80                 return nil, nil
81         }
82
83         // Suppress duplicate alarms
84         idx, found := a.IsMatchFound(m.Alarm)
85         if found && m.AlarmAction == alarm.AlarmActionRaise {
86                 app.Logger.Info("Duplicate alarm found, suppressing ...")
87                 if m.PerceivedSeverity == a.activeAlarms[idx].PerceivedSeverity {
88                         // Duplicate with same severity found
89                         return nil, nil
90                 } else {
91                         // Remove duplicate with different severity
92                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
93                 }
94         }
95
96         // Clear alarm if found from active alarm list
97         if m.AlarmAction == alarm.AlarmActionClear {
98                 if found {
99                         a.alarmHistory = append(a.alarmHistory, *m)
100                         a.activeAlarms = a.RemoveAlarm(a.activeAlarms, idx, "active")
101
102                         if a.postClear {
103                                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusResolved, m.AlarmTime))
104                         }
105                 }
106                 app.Logger.Info("No matching active alarm found, suppressing ...")
107                 return nil, nil
108         }
109
110         // New alarm -> update active alarms and post to Alert Manager
111         if m.AlarmAction == alarm.AlarmActionRaise {
112                 a.UpdateAlarmLists(m)
113                 return a.PostAlert(a.GenerateAlertLabels(m.Alarm, AlertStatusActive, m.AlarmTime))
114         }
115
116         return nil, nil
117 }
118
119 func (a *AlarmManager) IsMatchFound(newAlarm alarm.Alarm) (int, bool) {
120         for i, m := range a.activeAlarms {
121                 if m.ManagedObjectId == newAlarm.ManagedObjectId && m.ApplicationId == newAlarm.ApplicationId &&
122                         m.SpecificProblem == newAlarm.SpecificProblem && m.IdentifyingInfo == newAlarm.IdentifyingInfo {
123                         return i, true
124                 }
125         }
126         return -1, false
127 }
128
129 func (a *AlarmManager) RemoveAlarm(alarms []alarm.AlarmMessage, i int, listName string) []alarm.AlarmMessage {
130         a.mutex.Lock()
131         defer a.mutex.Unlock()
132
133         app.Logger.Info("Alarm '%+v' deleted from the '%s' list", alarms[i], listName)
134         copy(alarms[i:], alarms[i+1:])
135         return alarms[:len(alarms)-1]
136 }
137
138 func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) {
139         a.mutex.Lock()
140         defer a.mutex.Unlock()
141
142         /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised.
143            The attempt to raise the alarm next time will be supressed when found as duplicate. */
144         if len(a.activeAlarms) >= a.maxActiveAlarms {
145                 app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold")
146                 actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full")
147                 actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
148                 a.activeAlarms = append(a.activeAlarms, actAlarmMessage)
149                 a.alarmHistory = append(a.alarmHistory, actAlarmMessage)
150         }
151
152         if len(a.alarmHistory) >= a.maxAlarmHistory {
153                 app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold")
154                 histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full")
155                 histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())}
156                 a.activeAlarms = append(a.activeAlarms, histAlarmMessage)
157                 a.alarmHistory = append(a.alarmHistory, histAlarmMessage)
158         }
159
160         // @todo: For now just keep the alarms (both active and history) in-memory. Use SDL later for persistence
161         a.activeAlarms = append(a.activeAlarms, *newAlarm)
162         a.alarmHistory = append(a.alarmHistory, *newAlarm)
163 }
164
165 func (a *AlarmManager) GenerateAlertLabels(newAlarm alarm.Alarm, status AlertStatus, alarmTime int64) (models.LabelSet, models.LabelSet) {
166         alarmDef := alarm.RICAlarmDefinitions[newAlarm.SpecificProblem]
167         amLabels := models.LabelSet{
168                 "status":      string(status),
169                 "alertname":   alarmDef.AlarmText,
170                 "severity":    string(newAlarm.PerceivedSeverity),
171                 "service":     fmt.Sprintf("%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
172                 "system_name": fmt.Sprintf("RIC:%s:%s", newAlarm.ManagedObjectId, newAlarm.ApplicationId),
173         }
174         amAnnotations := models.LabelSet{
175                 "alarm_id":        fmt.Sprintf("%d", alarmDef.AlarmId),
176                 "description":     fmt.Sprintf("%d:%s:%s", newAlarm.SpecificProblem, newAlarm.IdentifyingInfo, newAlarm.AdditionalInfo),
177                 "additional_info": newAlarm.AdditionalInfo,
178                 "summary":         alarmDef.EventType,
179                 "instructions":    alarmDef.OperationInstructions,
180                 "timestamp":       fmt.Sprintf("%s", time.Unix(0, alarmTime).Format("02/01/2006, 15:04:05")),
181         }
182
183         return amLabels, amAnnotations
184 }
185
186 func (a *AlarmManager) NewAlertmanagerClient() *client.Alertmanager {
187         cr := clientruntime.New(a.amHost, a.amBaseUrl, a.amSchemes)
188         return client.New(cr, strfmt.Default)
189 }
190
191 func (a *AlarmManager) PostAlert(amLabels, amAnnotations models.LabelSet) (*alert.PostAlertsOK, error) {
192         pa := &models.PostableAlert{
193                 Alert: models.Alert{
194                         GeneratorURL: strfmt.URI(""),
195                         Labels:       amLabels,
196                 },
197                 Annotations: amAnnotations,
198         }
199         alertParams := alert.NewPostAlertsParams().WithAlerts(models.PostableAlerts{pa})
200
201         app.Logger.Info("Posting alerts: labels: %+v, annotations: %+v", amLabels, amAnnotations)
202         ok, err := a.NewAlertmanagerClient().Alert.PostAlerts(alertParams)
203         if err != nil {
204                 app.Logger.Error("Posting alerts to '%s/%s' failed with error: %v", a.amHost, a.amBaseUrl, err)
205         }
206         return ok, err
207 }
208
209 func (a *AlarmManager) StatusCB() bool {
210         if !a.rmrReady {
211                 app.Logger.Info("RMR not ready yet!")
212         }
213
214         return a.rmrReady
215 }
216
217 func (a *AlarmManager) ConfigChangeCB(configparam string) {
218
219         a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms")
220         a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory")
221
222         app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms)
223         app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory)
224
225         return
226 }
227
228 func (a *AlarmManager) ReadAlarmDefinitionFromJson() {
229
230         filename := os.Getenv("DEF_FILE")
231         file, err := ioutil.ReadFile(filename)
232         if err == nil {
233                 data := RicAlarmDefinitions{}
234                 err = json.Unmarshal([]byte(file), &data)
235                 if err == nil {
236                         for _, alarmDefinition := range data.AlarmDefinitions {
237                                 _, exists := alarm.RICAlarmDefinitions[alarmDefinition.AlarmId]
238                                 if exists {
239                                         app.Logger.Error("ReadAlarmDefinitionFromJson: alarm definition already exists for %v", alarmDefinition.AlarmId)
240                                 } else {
241                                         app.Logger.Debug("ReadAlarmDefinitionFromJson: alarm  %v", alarmDefinition.AlarmId)
242                                         ricAlarmDefintion := new(alarm.AlarmDefinition)
243                                         ricAlarmDefintion.AlarmId = alarmDefinition.AlarmId
244                                         ricAlarmDefintion.AlarmText = alarmDefinition.AlarmText
245                                         ricAlarmDefintion.EventType = alarmDefinition.EventType
246                                         ricAlarmDefintion.OperationInstructions = alarmDefinition.OperationInstructions
247                                         alarm.RICAlarmDefinitions[alarmDefinition.AlarmId] = ricAlarmDefintion
248                                 }
249                         }
250                 } else {
251                         app.Logger.Error("json.Unmarshal failed with error %v", err)
252                 }
253         } else {
254                 app.Logger.Error("ioutil.ReadFile failed with error %v", err)
255         }
256 }
257
258 func (a *AlarmManager) Run(sdlcheck bool) {
259         app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash))
260         app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true)
261         app.Resource.InjectStatusCb(a.StatusCB)
262         app.AddConfigChangeListener(a.ConfigChangeCB)
263
264         alarm.RICAlarmDefinitions = make(map[int]*alarm.AlarmDefinition)
265         a.ReadAlarmDefinitionFromJson()
266
267         app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST")
268         app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE")
269         app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET")
270         app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET")
271         app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST")
272         app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET")
273         app.Resource.InjectRoute("/ric/v1/alarms/define", a.SetAlarmDefinition, "POST")
274         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.DeleteAlarmDefinition, "DELETE")
275         app.Resource.InjectRoute("/ric/v1/alarms/define", a.GetAlarmDefinition, "GET")
276         app.Resource.InjectRoute("/ric/v1/alarms/define/{alarmId}", a.GetAlarmDefinition, "GET")
277
278         // Start background timer for re-raising alerts
279         a.postClear = sdlcheck
280         go a.StartAlertTimer()
281         a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER")
282
283         app.RunWithParams(a, sdlcheck)
284 }
285
286 func NewAlarmManager(amHost string, alertInterval int) *AlarmManager {
287         if alertInterval == 0 {
288                 alertInterval = viper.GetInt("controls.promAlertManager.alertInterval")
289         }
290
291         if amHost == "" {
292                 amHost = viper.GetString("controls.promAlertManager.address")
293         }
294
295         return &AlarmManager{
296                 rmrReady:        false,
297                 amHost:          amHost,
298                 amBaseUrl:       viper.GetString("controls.promAlertManager.baseUrl"),
299                 amSchemes:       []string{viper.GetString("controls.promAlertManager.schemes")},
300                 alertInterval:   alertInterval,
301                 activeAlarms:    make([]alarm.AlarmMessage, 0),
302                 alarmHistory:    make([]alarm.AlarmMessage, 0),
303                 maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"),
304                 maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"),
305         }
306 }
307
308 // Main function
309 func main() {
310         NewAlarmManager("", 0).Run(true)
311 }