From 0719e9e18b09451bee9a6d81c4faccf69d425740 Mon Sep 17 00:00:00 2001
From: Tommy Carpenter
Date: Thu, 16 Jan 2020 11:23:02 -0500
Subject: [PATCH] Cleanups only (no code changes)

Documentation/comment/whitespace changes
Also updates the helm commands in the int tests to helm v3

Change-Id: I54bfd44a8e2a4bd1f79e300c32b18b8a0abfc635
Signed-off-by: Tommy Carpenter
---
 a1/a1rmr.py         |  8 ++++----
 a1/controller.py    |  5 +----
 a1/data.py          |  9 +--------
 docs/overview.rst   | 20 ++++++++++++++++++++
 tox-integration.ini | 15 +++++++--------
 5 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/a1/a1rmr.py b/a1/a1rmr.py
index 2e5dace..58ec1c0 100644
--- a/a1/a1rmr.py
+++ b/a1/a1rmr.py
@@ -2,8 +2,8 @@
 a1s rmr functionality
 """
 # ==================================================================================
-# Copyright (c) 2019 Nokia
-# Copyright (c) 2018-2019 AT&T Intellectual Property.
+# Copyright (c) 2019-2020 Nokia
+# Copyright (c) 2018-2020 AT&T Intellectual Property.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,8 +31,6 @@ mdc_logger = Logger(name=__name__)

 RETRY_TIMES = int(os.environ.get("A1_RMR_RETRY_TIMES", 4))

-
-
 A1_POLICY_REQUEST = 20010
 A1_POLICY_RESPONSE = 20011
 A1_POLICY_QUERY = 20012
@@ -53,6 +51,8 @@ class _RmrLoop:
         self.keep_going = True
         self.rcv_func = None
         self.last_ran = time.time()
+
+        # see docs/overview#resiliency for a discussion of this
         self.instance_send_queue = queue.Queue()  # thread safe queue https://docs.python.org/3/library/queue.html

         # intialize rmr context
diff --git a/a1/controller.py b/a1/controller.py
index 2cb9283..b4d85df 100644
--- a/a1/controller.py
+++ b/a1/controller.py
@@ -61,6 +61,7 @@ def get_healthcheck():
     1. whether the a1 webserver is up (if it isn't, this won't even be called, so even entering this function confirms it is)
     2. checks whether the rmr thread is running and has completed a loop recently
     TODO: make "seconds" to pass in a configurable parameter?
+    TODO: I've requested that SDL provide an "I'm connected to the backend" healthcheck that can be integrated here
     """
     if a1rmr.healthcheck_rmr_thread():
         return "", 200
@@ -172,10 +173,6 @@ def delete_policy_instance(policy_type_id, policy_instance_id):
     """

     def delete_instance_handler():
-        """
-        here we send out the DELETEs but we don't delete the instance until a GET is called where we check the statuses
-        We also set the status as deleted which would be reflected in a GET to ../status (before the DELETE completes)
-        """
         data.delete_policy_instance(policy_type_id, policy_instance_id)

         # queue rmr send (best effort)
diff --git a/a1/data.py b/a1/data.py
index d737680..300f19d 100644
--- a/a1/data.py
+++ b/a1/data.py
@@ -1,10 +1,5 @@
 """
 Represents A1s database and database access functions.
-In the future, this may change to use a different backend, possibly dramatically.
-Hopefully, the access functions are a good api so nothing else has to change when this happens
-
-For now, the database is in memory.
-We use dict data structures (KV) with the expectation of having to move this into Redis
 """
 # ==================================================================================
 # Copyright (c) 2019-2020 Nokia
@@ -27,9 +22,7 @@ import time
 from threading import Thread
 import msgpack
 from mdclogpy import Logger
-
 from ricsdl.syncstorage import SyncStorage
-
 from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound, PolicyTypeAlreadyExists, CantDeleteNonEmptyType

 mdc_logger = Logger(name=__name__)
@@ -272,7 +265,7 @@ def get_instance_list(policy_type_id):

 def delete_policy_instance(policy_type_id, policy_instance_id):
     """
-    initially sets has_been_deleted
+    initially sets has_been_deleted in the status
     then launches a thread that waits until the relevent timer expires, and finally deletes the instance
     """
     _instance_is_valid(policy_type_id, policy_instance_id)
diff --git a/docs/overview.rst b/docs/overview.rst
index 5ba3e31..d686fd9 100644
--- a/docs/overview.rst
+++ b/docs/overview.rst
@@ -73,3 +73,23 @@ In some cases, the spec is deficient and we are "ahead", in other cases this doe
 7. [Spec is ahead] The spec defines that a query of all policy instances should return the full bodies, however right now the RIC A1m returns a list of IDs (assuming subsequent queries can fetch the bodies).

 8. [?] The spec document details some very specific "types", but the RIC A1m allows these to be loaded in (see #1). For example, spec section 4.2.6.2. We believe this should be removed from the spec and rather defined as a type. Xapps can be created that define new types, so the spec will quickly become "stale" if "types" are defined in the spec.
+
+
+Resiliency
+----------
+
+A1 is resilient to the majority of failures, but not yet to all of them (though a solution is known).
+
+A1 uses the RIC SDL library to persist all policy state information: this includes the policy types, policy instances, and policy statuses.
+If state is built up in A1, and A1 fails (where Kubernetes will then restart it), none of this state is lost.
+
+The tiny bit of state that *is currently* volatile in A1 is its "next second" job queue.
+Specifically, when policy instances are created or deleted, A1 creates jobs in an in-memory job queue.
+An rmr thread polls that queue every second, dequeues the jobs, and performs them.
+
+If A1 were killed at *exactly* the wrong time, jobs could be lost, meaning the PUT or DELETE of an instance wouldn't actually take effect.
+This isn't drastic, as the operations are idempotent and could always be re-performed.
+
+In order for A1 to be considered completely resilient, this job queue would need to be moved into SDL.
+SDL uses Redis as a backend, and Redis natively supports queues via its LIST type (LPUSH, RPOP).
+I've asked the SDL team to consider an extension to SDL to support these Redis operations.
diff --git a/tox-integration.ini b/tox-integration.ini
index 49d8f65..62670ef 100644
--- a/tox-integration.ini
+++ b/tox-integration.ini
@@ -33,9 +33,10 @@ changedir=integration_tests
 commands_pre=
     echo "WARNING: make sure you're running with latest docker builds!"
     sleep 5
-    helm install --devel testreceiver -n testreceiver
-    helm install --devel a1mediator -n a1
-    helm install --devel dbaas-service -n dbaas
+# helm v3 is helm install [name] [chart]
+    helm install --devel testreceiver testreceiver
+    helm install --devel a1 a1mediator
+    helm install --devel dbaas dbaas-service
 # wait for helm charts
     sleep 30
     ./portforward.sh
@@ -44,9 +45,10 @@ commands=
     echo "linting"
     helm lint a1mediator
     helm lint testreceiver
+    helm lint dbaas-service
     echo "running tavern"
 # run tavern
-    pytest --tavern-beta-new-traceback
+    pytest --tavern-beta-new-traceback test_a1.tavern.yaml
     echo "running ab"
 # run apache bench
     ab -n 100 -c 10 -v 4 http://localhost:10000/a1-p/healthcheck
@@ -55,10 +57,7 @@ commands_post=
     integration_tests/getlogs.sh
     echo "teardown"
     helm delete testreceiver
-    helm del --purge testreceiver
     helm delete a1
-    helm del --purge a1
-    helm del dbaas
-    helm del --purge dbaas
+    helm delete dbaas
     pkill -9 kubectl
     sleep 10
-- 
2.16.6
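
For readers of the Resiliency section added in docs/overview.rst above: the "next second" job queue it describes boils down to a small producer/consumer pattern around Python's thread-safe queue. The sketch below is illustrative only (names such as JobQueueLoop and _do_work are invented here, not taken from a1rmr.py), but it shows why this state is volatile: anything still sitting in the in-memory queue when the process dies is simply gone.

import queue
import threading
import time


class JobQueueLoop:
    """Illustrative sketch: polls an in-memory, thread-safe queue roughly once per second."""

    def __init__(self):
        # volatile state: jobs queued here are lost if the process dies before the next poll
        self.jobs = queue.Queue()
        self.keep_going = True
        self.last_ran = time.time()

    def enqueue(self, operation, payload):
        # best effort: callers (e.g. the PUT/DELETE handlers) do not wait for the work to happen
        self.jobs.put((operation, payload))

    def loop(self):
        while self.keep_going:
            while not self.jobs.empty():
                operation, payload = self.jobs.get()
                self._do_work(operation, payload)
            self.last_ran = time.time()  # a healthcheck can verify the loop ran recently
            time.sleep(1)

    def _do_work(self, operation, payload):
        # stand-in for the actual rmr send
        print(f"performing {operation}: {payload}")


worker = JobQueueLoop()
threading.Thread(target=worker.loop, daemon=True).start()
worker.enqueue("CREATE", {"policy_instance_id": "demo"})
time.sleep(2)  # give the polling thread a chance to drain the queue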
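The same section proposes moving that queue into SDL so it survives restarts. SDL does not expose Redis list operations today (that is exactly the extension being requested), so the following sketch goes straight to the redis-py client to show the LPUSH/RPOP pattern the text refers to; the key name "a1.jobs" and the helper names are made up for illustration and are not part of any SDL or A1 API.

import json

import redis

r = redis.Redis(host="localhost", port=6379)


def enqueue_job(operation, payload):
    # LPUSH pushes onto the head of the list; Redis persists the entry across an A1 restart
    r.lpush("a1.jobs", json.dumps({"op": operation, "payload": payload}))


def drain_jobs():
    # RPOP takes from the tail, giving FIFO order relative to LPUSH
    while (raw := r.rpop("a1.jobs")) is not None:
        job = json.loads(raw)
        print("performing", job["op"], job["payload"])


enqueue_job("DELETE", {"policy_type_id": 20000, "policy_instance_id": "demo"})
drain_jobs()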