From 541eb50ea18ab50528420dfe724fa3d12dc24914 Mon Sep 17 00:00:00 2001 From: vipin Date: Tue, 22 Sep 2020 12:04:59 +0000 Subject: [PATCH] LN0739_FM_FR8: relaxing the active alarm and alarm history restrictions - new alarms can still be added if the max active alarm threshold or the alarm history threshold is reached. The alarm manager raises a new alarm in such situations. - Review comments closed. - Review comments closed. Change-Id: I885418dcc19c587d1139f8251eda735b4a2bba00 Signed-off-by: vipin --- alarm/types.go | 19 ++++++++++++++++ build/build_ubuntu.sh | 4 ++-- cli/alarm-cli.go | 33 +++++++++++++++++++++++++++ go.mod | 3 ++- manager/cmd/manager.go | 50 +++++++++++++++++++++++++++++++---------- manager/cmd/manager_test.go | 54 ++++++++++++++++++++++++++++++++++++++++++++- manager/cmd/restapi.go | 23 +++++++++++++++++++ manager/cmd/types.go | 22 ++++++++++-------- schemas/alarm-schema.json | 2 +- 9 files changed, 184 insertions(+), 26 deletions(-) diff --git a/alarm/types.go b/alarm/types.go index dc77d43..149671a 100755 --- a/alarm/types.go +++ b/alarm/types.go @@ -67,6 +67,11 @@ type AlarmMessage struct { AlarmTime int64 } +type AlarmConfigParams struct { + MaxActiveAlarms int `json:"maxactivealarms"` + MaxAlarmHistory int `json:"maxalarmhistory"` +} + // RICAlarm is an alarm instance type RICAlarm struct { moId string @@ -88,6 +93,8 @@ const ( TCP_CONNECTIVITY_LOST_TO_DBAAS int = 8005 E2_CONNECTIVITY_LOST_TO_GNODEB int = 8006 E2_CONNECTIVITY_LOST_TO_ENODEB int = 8007 + ACTIVE_ALARM_EXCEED_MAX_THRESHOLD int = 8008 + ALARM_HISTORY_EXCEED_MAX_THRESHOLD int = 8009 ) type AlarmDefinition struct { @@ -122,6 +129,18 @@ var RICAlarmDefinitions = map[int]AlarmDefinition{ EventType: "Communication error", OperationInstructions: "Not defined", }, + ACTIVE_ALARM_EXCEED_MAX_THRESHOLD: { + AlarmId: ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, + AlarmText: "ACTIVE ALARM EXCEED MAX THRESHOLD", + EventType: "Warning", + OperationInstructions: "Not defined", + }, + 
ALARM_HISTORY_EXCEED_MAX_THRESHOLD: { + AlarmId: ALARM_HISTORY_EXCEED_MAX_THRESHOLD, + AlarmText: "ALARM HISTORY EXCEED MAX THRESHOLD", + EventType: "Warning", + OperationInstructions: "Not defined", + }, } const ( diff --git a/build/build_ubuntu.sh b/build/build_ubuntu.sh index 3e94016..29b9d77 100755 --- a/build/build_ubuntu.sh +++ b/build/build_ubuntu.sh @@ -22,11 +22,11 @@ set -eux echo "--> build_ubuntu.sh starts" # Install RMR from deb packages at packagecloud.io -rmr=rmr_4.0.2_amd64.deb +rmr=rmr_4.1.2_amd64.deb wget --content-disposition https://packagecloud.io/o-ran-sc/release/packages/debian/stretch/$rmr/download.deb sudo dpkg -i $rmr rm $rmr -rmrdev=rmr-dev_4.0.2_amd64.deb +rmrdev=rmr-dev_4.1.2_amd64.deb wget --content-disposition https://packagecloud.io/o-ran-sc/release/packages/debian/stretch/$rmrdev/download.deb sudo dpkg -i $rmrdev rm $rmrdev diff --git a/cli/alarm-cli.go b/cli/alarm-cli.go index 665f968..97ccb91 100755 --- a/cli/alarm-cli.go +++ b/cli/alarm-cli.go @@ -74,6 +74,18 @@ func main() { postAlarm(flags, readAlarmParams(flags, true), alarm.AlarmActionClear) }) + // Configure an alarm manager + commando. + Register("configure"). + SetShortDescription("Configure alarm manager with given parameters"). + AddFlag("mal", "max active alarms", commando.Int, nil). + AddFlag("mah", "max alarm history", commando.Int, nil). + AddFlag("host", "Alarm manager host address", commando.String, "localhost"). + AddFlag("port", "Alarm manager host address", commando.String, "8080"). 
+ SetAction(func(args map[string]commando.ArgValue, flags map[string]commando.FlagValue) { + postAlarmConfig(flags) + }) + // parse command-line arguments commando.Parse(nil) } @@ -162,3 +174,24 @@ func displayAlarms(alarms []alarm.AlarmMessage, isHistory bool) { t.SetStyle(table.StyleColoredBright) t.Render() } + +func postAlarmConfig(flags map[string]commando.FlagValue) { + host, _ := flags["host"].GetString() + port, _ := flags["port"].GetString() + maxactivealarms, _ := flags["mal"].GetInt() + maxalarmhistory, _ := flags["mah"].GetInt() + targetUrl := fmt.Sprintf("http://%s:%s/ric/v1/alarms/config", host, port) + + m := alarm.AlarmConfigParams{MaxActiveAlarms: maxactivealarms, MaxAlarmHistory: maxalarmhistory} + jsonData, err := json.Marshal(m) + if err != nil { + fmt.Println("json.Marshal failed: %v", err) + return + } + + resp, err := http.Post(targetUrl, "application/json", bytes.NewBuffer(jsonData)) + if err != nil || resp == nil { + fmt.Println("Couldn't fetch post alarm configuration due to error: %v", err) + return + } +} diff --git a/go.mod b/go.mod index 7d78729..2827b3e 100644 --- a/go.mod +++ b/go.mod @@ -16,7 +16,8 @@ require ( gerrit.o-ran-sc.org/r/ric-plt/xapp-frame v0.0.0-00010101000000-000000000000 github.com/go-openapi/runtime v0.19.11 github.com/go-openapi/strfmt v0.19.4 - github.com/jedib0t/go-pretty v4.3.0+incompatible + github.com/gorilla/mux v1.7.1 + github.com/jedib0t/go-pretty v4.3.0+incompatible // indirect github.com/mattn/go-runewidth v0.0.9 // indirect github.com/prometheus/alertmanager v0.20.0 github.com/spf13/viper v1.6.2 diff --git a/manager/cmd/manager.go b/manager/cmd/manager.go index 3ca2d84..9a42187 100755 --- a/manager/cmd/manager.go +++ b/manager/cmd/manager.go @@ -133,13 +133,22 @@ func (a *AlarmManager) UpdateAlarmLists(newAlarm *alarm.AlarmMessage) { a.mutex.Lock() defer a.mutex.Unlock() - // If maximum number of active alarms is reached, purge the oldest alarm - if len(a.activeAlarms) >= 
viper.GetInt("controls.maxActiveAlarms") { - a.activeAlarms = a.RemoveAlarm(a.activeAlarms, 0, "active") + /* If maximum number of active alarms is reached, an error log writing is made, and new alarm indicating the problem is raised. + The attempt to raise the alarm next time will be supressed when found as duplicate. */ + if len(a.activeAlarms) >= a.maxActiveAlarms { + app.Logger.Error("active alarm count exceeded maxActiveAlarms threshold") + actAlarm := a.alarmClient.NewAlarm(alarm.ACTIVE_ALARM_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "active alarms full") + actAlarmMessage := alarm.AlarmMessage{Alarm: actAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} + a.activeAlarms = append(a.activeAlarms, actAlarmMessage) + a.alarmHistory = append(a.alarmHistory, actAlarmMessage) } - if len(a.alarmHistory) >= viper.GetInt("controls.maxAlarmHistory") { - a.alarmHistory = a.RemoveAlarm(a.alarmHistory, 0, "history") + if len(a.alarmHistory) >= a.maxAlarmHistory { + app.Logger.Error("alarm history count exceeded maxAlarmHistory threshold") + histAlarm := a.alarmClient.NewAlarm(alarm.ALARM_HISTORY_EXCEED_MAX_THRESHOLD, alarm.SeverityWarning, "clear alarms or raise threshold", "alarm history full") + histAlarmMessage := alarm.AlarmMessage{Alarm: histAlarm, AlarmAction: alarm.AlarmActionRaise, AlarmTime: (time.Now().UnixNano())} + a.activeAlarms = append(a.activeAlarms, histAlarmMessage) + a.alarmHistory = append(a.alarmHistory, histAlarmMessage) } // @todo: For now just keep the alarms (both active and history) in-memory. 
Use SDL later for persistence @@ -199,19 +208,34 @@ func (a *AlarmManager) StatusCB() bool { return a.rmrReady } +func (a *AlarmManager) ConfigChangeCB(configparam string) { + + a.maxActiveAlarms = app.Config.GetInt("controls.maxActiveAlarms") + a.maxAlarmHistory = app.Config.GetInt("controls.maxAlarmHistory") + + app.Logger.Debug("ConfigChangeCB: maxActiveAlarms %v", a.maxActiveAlarms) + app.Logger.Debug("ConfigChangeCB: maxAlarmHistory = %v", a.maxAlarmHistory) + + return +} + func (a *AlarmManager) Run(sdlcheck bool) { app.Logger.SetMdc("alarmManager", fmt.Sprintf("%s:%s", Version, Hash)) app.SetReadyCB(func(d interface{}) { a.rmrReady = true }, true) app.Resource.InjectStatusCb(a.StatusCB) + app.AddConfigChangeListener(a.ConfigChangeCB) app.Resource.InjectRoute("/ric/v1/alarms", a.RaiseAlarm, "POST") app.Resource.InjectRoute("/ric/v1/alarms", a.ClearAlarm, "DELETE") app.Resource.InjectRoute("/ric/v1/alarms/active", a.GetActiveAlarms, "GET") app.Resource.InjectRoute("/ric/v1/alarms/history", a.GetAlarmHistory, "GET") + app.Resource.InjectRoute("/ric/v1/alarms/config", a.SetAlarmConfig, "POST") + app.Resource.InjectRoute("/ric/v1/alarms/config", a.GetAlarmConfig, "GET") // Start background timer for re-raising alerts a.postClear = sdlcheck go a.StartAlertTimer() + a.alarmClient, _ = alarm.InitAlarm("SEP", "ALARMMANAGER") app.RunWithParams(a, sdlcheck) } @@ -226,13 +250,15 @@ func NewAlarmManager(amHost string, alertInterval int) *AlarmManager { } return &AlarmManager{ - rmrReady: false, - amHost: amHost, - amBaseUrl: viper.GetString("controls.promAlertManager.baseUrl"), - amSchemes: []string{viper.GetString("controls.promAlertManager.schemes")}, - alertInterval: alertInterval, - activeAlarms: make([]alarm.AlarmMessage, 0), - alarmHistory: make([]alarm.AlarmMessage, 0), + rmrReady: false, + amHost: amHost, + amBaseUrl: viper.GetString("controls.promAlertManager.baseUrl"), + amSchemes: []string{viper.GetString("controls.promAlertManager.schemes")}, + alertInterval: 
alertInterval, + activeAlarms: make([]alarm.AlarmMessage, 0), + alarmHistory: make([]alarm.AlarmMessage, 0), + maxActiveAlarms: app.Config.GetInt("controls.maxActiveAlarms"), + maxAlarmHistory: app.Config.GetInt("controls.maxAlarmHistory"), } } diff --git a/manager/cmd/manager_test.go b/manager/cmd/manager_test.go index c048f4b..26cf5c5 100755 --- a/manager/cmd/manager_test.go +++ b/manager/cmd/manager_test.go @@ -33,7 +33,7 @@ import ( "strings" "testing" "time" - + "github.com/gorilla/mux" "gerrit.o-ran-sc.org/r/ric-plt/alarm-go/alarm" "gerrit.o-ran-sc.org/r/ric-plt/xapp-frame/pkg/xapp" "github.com/prometheus/alertmanager/api/v2/models" @@ -56,6 +56,7 @@ func TestMain(M *testing.M) { } alarmer, _ = alarm.InitAlarm("my-pod", "my-app") + alarmManager.alarmClient = alarmer time.Sleep(time.Duration(5) * time.Second) eventChan = make(chan string) @@ -130,8 +131,10 @@ func TestAlarmsSuppresedSucess(t *testing.T) { assert.Nil(t, alarmer.Raise(a), "raise failed") VerifyAlarm(t, a, 1) + assert.Nil(t, alarmer.Clear(a), "clear failed") } + func TestInvalidAlarms(t *testing.T) { a := alarmer.NewAlarm(1111, alarm.SeverityMajor, "Some App data", "eth 0 1") assert.Nil(t, alarmer.Raise(a), "raise failed") @@ -153,6 +156,38 @@ func TestStatusCallback(t *testing.T) { assert.Equal(t, true, alarmManager.StatusCB()) } +func TestActiveAlarmMaxThresholds(t *testing.T) { + xapp.Logger.Info("TestActiveAlarmMaxThresholds") + ts := CreatePromAlertSimulator(t, "POST", "/api/v2/alerts", http.StatusOK, models.LabelSet{}) + alarmManager.maxActiveAlarms = 0 + alarmManager.maxAlarmHistory = 10 + + a := alarmer.NewAlarm(alarm.E2_CONNECTIVITY_LOST_TO_GNODEB, alarm.SeverityCritical, "Some Application data", "eth 0 2") + assert.Nil(t, alarmer.Raise(a), "raise failed") + + var alarmConfigParams alarm.AlarmConfigParams + req, _ := http.NewRequest("GET", "/ric/v1/alarms/config", nil) + req = mux.SetURLVars(req, nil) + handleFunc := http.HandlerFunc(alarmManager.GetAlarmConfig) + response := 
executeRequest(req, handleFunc) + + // Check HTTP Status Code + checkResponseCode(t, http.StatusOK, response.Code) + + // Decode the json output from handler + json.NewDecoder(response.Body).Decode(&alarmConfigParams) + if alarmConfigParams.MaxActiveAlarms != 0 || alarmConfigParams.MaxAlarmHistory != 10 { + t.Errorf("Incorrect alarm thresholds") + } + + time.Sleep(time.Duration(1) * time.Second) + alarmManager.maxActiveAlarms = 5000 + alarmManager.maxAlarmHistory = 20000 + VerifyAlarm(t, a, 2) + VerifyAlarm(t, a, 2) + ts.Close() +} + func VerifyAlarm(t *testing.T, a alarm.Alarm, expectedCount int) string { receivedAlert := waitForEvent() @@ -204,3 +239,20 @@ func fireEvent(t *testing.T, body io.ReadCloser) { eventChan <- fmt.Sprintf("%s", reqBody) } + +func executeRequest(req *http.Request, handleR http.HandlerFunc) *httptest.ResponseRecorder { + rr := httptest.NewRecorder() + + handleR.ServeHTTP(rr, req) + + return rr +} + +func checkResponseCode(t *testing.T, expected, actual int) bool { + if expected != actual { + t.Errorf("Expected response code %d. 
Got %d\n", expected, actual) + return false + } + return true +} + diff --git a/manager/cmd/restapi.go b/manager/cmd/restapi.go index 6463a2c..c455e87 100755 --- a/manager/cmd/restapi.go +++ b/manager/cmd/restapi.go @@ -104,3 +104,26 @@ func (a *AlarmManager) HandleViaRmr(d alarm.Alarm, isRaiseAlarm bool) error { return nil } + +func (a *AlarmManager) SetAlarmConfig(w http.ResponseWriter, r *http.Request) { + var m alarm.AlarmConfigParams + if err := json.NewDecoder(r.Body).Decode(&m); err != nil { + app.Logger.Error("json.NewDecoder failed: %v", err) + } else { + a.maxActiveAlarms = m.MaxActiveAlarms + a.maxAlarmHistory = m.MaxAlarmHistory + app.Logger.Debug("new maxActiveAlarms = %v", a.maxActiveAlarms) + app.Logger.Debug("new maxAlarmHistory = %v", a.maxAlarmHistory) + a.respondWithJSON(w, http.StatusOK, err) + } +} + +func (a *AlarmManager) GetAlarmConfig(w http.ResponseWriter, r *http.Request) { + var m alarm.AlarmConfigParams + + m.MaxActiveAlarms = a.maxActiveAlarms + m.MaxAlarmHistory = a.maxAlarmHistory + + a.respondWithJSON(w, http.StatusOK, m) + return +} diff --git a/manager/cmd/types.go b/manager/cmd/types.go index 68b9e0d..40a110f 100755 --- a/manager/cmd/types.go +++ b/manager/cmd/types.go @@ -27,15 +27,18 @@ import ( ) type AlarmManager struct { - amHost string - amBaseUrl string - amSchemes []string - alertInterval int - activeAlarms []alarm.AlarmMessage - alarmHistory []alarm.AlarmMessage - mutex sync.Mutex - rmrReady bool - postClear bool + amHost string + amBaseUrl string + amSchemes []string + alertInterval int + activeAlarms []alarm.AlarmMessage + alarmHistory []alarm.AlarmMessage + mutex sync.Mutex + rmrReady bool + postClear bool + maxActiveAlarms int + maxAlarmHistory int + alarmClient *alarm.RICAlarm } type AlertStatus string @@ -47,3 +50,4 @@ const ( var Version string var Hash string + diff --git a/schemas/alarm-schema.json b/schemas/alarm-schema.json index 5e999ad..ca01b1c 100644 --- a/schemas/alarm-schema.json +++ 
b/schemas/alarm-schema.json @@ -70,7 +70,7 @@ "identifyingInfo": { "type": "string", "title": "The identifyingInfo schema", - "description": "Identifying additional information, which is part of alarm identity.", + "description": "Identifying additional information which is part of alarm identity.", "default": "" }, "AlarmAction": { -- 2.16.6