From 0719e9e18b09451bee9a6d81c4faccf69d425740 Mon Sep 17 00:00:00 2001
From: Tommy Carpenter
Date: Thu, 16 Jan 2020 11:23:02 -0500
Subject: [PATCH] Cleanups only (no code changes)

Documentation/comment/whitespace changes
Also updates the helm commands in the int tests to helm v3

Change-Id: I54bfd44a8e2a4bd1f79e300c32b18b8a0abfc635
Signed-off-by: Tommy Carpenter
---
 a1/a1rmr.py         |  8 ++++----
 a1/controller.py    |  5 +----
 a1/data.py          |  9 +--------
 docs/overview.rst   | 20 ++++++++++++++++++++
 tox-integration.ini | 15 +++++++--------
 5 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/a1/a1rmr.py b/a1/a1rmr.py
index 2e5dace..58ec1c0 100644
--- a/a1/a1rmr.py
+++ b/a1/a1rmr.py
@@ -2,8 +2,8 @@
 a1s rmr functionality
 """
 # ==================================================================================
-# Copyright (c) 2019 Nokia
-# Copyright (c) 2018-2019 AT&T Intellectual Property.
+# Copyright (c) 2019-2020 Nokia
+# Copyright (c) 2018-2020 AT&T Intellectual Property.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,8 +31,6 @@ mdc_logger = Logger(name=__name__)

 RETRY_TIMES = int(os.environ.get("A1_RMR_RETRY_TIMES", 4))

-
-
 A1_POLICY_REQUEST = 20010
 A1_POLICY_RESPONSE = 20011
 A1_POLICY_QUERY = 20012
@@ -53,6 +51,8 @@ class _RmrLoop:
         self.keep_going = True
         self.rcv_func = None
         self.last_ran = time.time()
+
+        # see docs/overview#resiliency for a discussion of this
         self.instance_send_queue = queue.Queue()  # thread safe queue https://docs.python.org/3/library/queue.html

         # intialize rmr context
diff --git a/a1/controller.py b/a1/controller.py
index 2cb9283..b4d85df 100644
--- a/a1/controller.py
+++ b/a1/controller.py
@@ -61,6 +61,7 @@ def get_healthcheck():
     1. whether the a1 webserver is up (if it isn't, this won't even be called, so even entering this function confirms it is)
     2. checks whether the rmr thread is running and has completed a loop recently
     TODO: make "seconds" to pass in a configurable parameter?
+    TODO: I've requested that SDL provide an "I'm connected to the backend" healthcheck that can be integrated here
     """
     if a1rmr.healthcheck_rmr_thread():
         return "", 200
@@ -172,10 +173,6 @@ def delete_policy_instance(policy_type_id, policy_instance_id):
     """

     def delete_instance_handler():
-        """
-        here we send out the DELETEs but we don't delete the instance until a GET is called where we check the statuses
-        We also set the status as deleted which would be reflected in a GET to ../status (before the DELETE completes)
-        """
         data.delete_policy_instance(policy_type_id, policy_instance_id)

         # queue rmr send (best effort)
diff --git a/a1/data.py b/a1/data.py
index d737680..300f19d 100644
--- a/a1/data.py
+++ b/a1/data.py
@@ -1,10 +1,5 @@
 """
 Represents A1s database and database access functions.
-In the future, this may change to use a different backend, possibly dramatically.
-Hopefully, the access functions are a good api so nothing else has to change when this happens
-
-For now, the database is in memory.
-We use dict data structures (KV) with the expectation of having to move this into Redis
 """
 # ==================================================================================
 # Copyright (c) 2019-2020 Nokia
@@ -27,9 +22,7 @@ import time
 from threading import Thread
 import msgpack
 from mdclogpy import Logger
-
 from ricsdl.syncstorage import SyncStorage
-
 from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound, PolicyTypeAlreadyExists, CantDeleteNonEmptyType

 mdc_logger = Logger(name=__name__)
@@ -272,7 +265,7 @@ def get_instance_list(policy_type_id):

 def delete_policy_instance(policy_type_id, policy_instance_id):
     """
-    initially sets has_been_deleted
+    initially sets has_been_deleted in the status
     then launches a thread that waits until the relevent timer expires, and finally deletes the instance
     """
     _instance_is_valid(policy_type_id, policy_instance_id)
diff --git a/docs/overview.rst b/docs/overview.rst
index 5ba3e31..d686fd9 100644
--- a/docs/overview.rst
+++ b/docs/overview.rst
@@ -73,3 +73,23 @@ In some cases, the spec is deficient and we are "ahead", in other cases this doe
 7. [Spec is ahead] The spec defines that a query of all policy instances should return the full bodies, however right now the RIC A1m returns a list of IDs (assuming subsequent queries can fetch the bodies).

 8. [?] The spec document details some very specific "types", but the RIC A1m allows these to be loaded in (see #1). For example, spec section 4.2.6.2. We believe this should be removed from the spec and rather defined as a type. Xapps can be created that define new types, so the spec will quickly become "stale" if "types" are defined in the spec.
+
+
+Resiliency
+----------
+
+A1 is resilient to the majority of failures, but not yet to all of them (though a solution is known).
+
+A1 uses the RIC SDL library to persist all policy state information: this includes the policy types, policy instances, and policy statuses.
+If state is built up in A1, and A1 fails (where Kubernetes will then restart it), none of this state is lost.
+
+The tiny bit of state that *is currently* volatile in A1 is its "next second" job queue.
+Specifically, when policy instances are created or deleted, A1 creates jobs in an in-memory job queue.
+An rmr thread polls that queue every second, dequeues the jobs, and performs them.
+
+If A1 were killed at *exactly* the wrong time, jobs could be lost, meaning the PUT or DELETE of an instance wouldn't actually take effect.
+This isn't drastic, as the operations are idempotent and could always be re-performed.
+
+In order for A1 to be considered completely resilient, this job queue would need to be moved into SDL.
+SDL uses Redis as a backend, and Redis natively supports queues via its LIST type (LPUSH, RPOP).
+I've asked the SDL team to consider an extension to SDL to support these Redis operations.
diff --git a/tox-integration.ini b/tox-integration.ini
index 49d8f65..62670ef 100644
--- a/tox-integration.ini
+++ b/tox-integration.ini
@@ -33,9 +33,10 @@ changedir=integration_tests
 commands_pre=
     echo "WARNING: make sure you're running with latest docker builds!"
     sleep 5
-    helm install --devel testreceiver -n testreceiver
-    helm install --devel a1mediator -n a1
-    helm install --devel dbaas-service -n dbaas
+# helm v3 is helm install [name] [chart]
+    helm install --devel testreceiver testreceiver
+    helm install --devel a1 a1mediator
+    helm install --devel dbaas dbaas-service
 # wait for helm charts
     sleep 30
     ./portforward.sh
@@ -44,9 +45,10 @@ commands=
     echo "linting"
     helm lint a1mediator
     helm lint testreceiver
+    helm lint dbaas-service
     echo "running tavern"
 # run tavern
-    pytest --tavern-beta-new-traceback
+    pytest --tavern-beta-new-traceback test_a1.tavern.yaml
     echo "running ab"
 # run apache bench
     ab -n 100 -c 10 -v 4 http://localhost:10000/a1-p/healthcheck
@@ -55,10 +57,7 @@ commands_post=
     integration_tests/getlogs.sh
     echo "teardown"
     helm delete testreceiver
-    helm del --purge testreceiver
     helm delete a1
-    helm del --purge a1
-    helm del dbaas
-    helm del --purge dbaas
+    helm delete dbaas
     pkill -9 kubectl
     sleep 10
-- 
2.16.6
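
For readers of the Resiliency section added in docs/overview.rst above: the "next second" job queue it describes boils down to a small producer/consumer pattern around Python's thread-safe queue. The sketch below is illustrative only (names such as JobQueueLoop and _do_work are invented here, not taken from a1rmr.py), but it shows why this state is volatile: anything still sitting in the in-memory queue when the process dies is simply gone.

import queue
import threading
import time


class JobQueueLoop:
    """Illustrative sketch: polls an in-memory, thread-safe queue roughly once per second."""

    def __init__(self):
        # volatile state: jobs queued here are lost if the process dies before the next poll
        self.jobs = queue.Queue()
        self.keep_going = True
        self.last_ran = time.time()

    def enqueue(self, operation, payload):
        # best effort: callers (e.g. the PUT/DELETE handlers) do not wait for the work to happen
        self.jobs.put((operation, payload))

    def loop(self):
        while self.keep_going:
            while not self.jobs.empty():
                operation, payload = self.jobs.get()
                self._do_work(operation, payload)
            self.last_ran = time.time()  # a healthcheck can verify the loop ran recently
            time.sleep(1)

    def _do_work(self, operation, payload):
        # stand-in for the actual rmr send
        print(f"performing {operation}: {payload}")


worker = JobQueueLoop()
threading.Thread(target=worker.loop, daemon=True).start()
worker.enqueue("CREATE", {"policy_instance_id": "demo"})
time.sleep(2)  # give the polling thread a chance to drain the queue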
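The same section proposes moving that queue into SDL so it survives restarts. SDL does not expose Redis list operations today (that is exactly the extension being requested), so the following sketch goes straight to the redis-py client to show the LPUSH/RPOP pattern the text refers to; the key name "a1.jobs" and the helper names are made up for illustration and are not part of any SDL or A1 API.

import json

import redis

r = redis.Redis(host="localhost", port=6379)


def enqueue_job(operation, payload):
    # LPUSH pushes onto the head of the list; Redis persists the entry across an A1 restart
    r.lpush("a1.jobs", json.dumps({"op": operation, "payload": payload}))


def drain_jobs():
    # RPOP takes from the tail, giving FIFO order relative to LPUSH
    while (raw := r.rpop("a1.jobs")) is not None:
        job = json.loads(raw)
        print("performing", job["op"], job["payload"])


enqueue_job("DELETE", {"policy_type_id": 20000, "policy_instance_id": "demo"})
drain_jobs()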