From 0b42dfc507b22b49669f360883a1cecaa50cda7b Mon Sep 17 00:00:00 2001
From: Tommy Carpenter <tc677g@att.com>
Date: Thu, 24 Oct 2019 10:13:54 -0400
Subject: [PATCH] Threading pt3:

* Only external change here is to healthcheck the rmr thread as part of a1s healthcheck. k8s will now respin a1 if that is failing.
* Refactors (simplifies) how we wait for rmr initialization; it is now called as part of __init__
* Refactors (simplifies) how the thread is actually launched; it is now internal to the object and also a part of __init__
* Cleans up unit testing; a1rmr now exposes a replace_rcv_func; useful for unit testing, harmless if not called otherwise
* Upgrades to rmr-python 1.0.0 for simpler message allocation

Change-Id: I7106f8f35e94ba1c638d782609974bf4d4ba7c22
Signed-off-by: Tommy Carpenter <tc677g@att.com>
---
 a1/a1rmr.py                             | 199 +++++++++++++++-----------------
 a1/controller.py                        |  11 +-
 container-tag.yaml                      |   2 +-
 docs/release-notes.rst                  |  10 ++
 integration_tests/Dockerfile            |   2 +-
 integration_tests/a1mediator/Chart.yaml |   2 +-
 integration_tests/receiver.py           |   2 +-
 setup.py                                |   4 +-
 tests/test_controller.py                |  18 ++-
 9 files changed, 121 insertions(+), 129 deletions(-)

diff --git a/a1/a1rmr.py b/a1/a1rmr.py
index 359d2e0..fd0d87f 100644
--- a/a1/a1rmr.py
+++ b/a1/a1rmr.py
@@ -1,3 +1,6 @@
+"""
+a1s rmr functionality
+"""
 # ==================================================================================
 #       Copyright (c) 2019 Nokia
 #       Copyright (c) 2018-2019 AT&T Intellectual Property.
@@ -29,124 +32,73 @@ logger = get_module_logger(__name__)
 
 RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4))
 
-_SEND_QUEUE = queue.Queue()  # thread safe queue https://docs.python.org/3/library/queue.html
+# Note; yes, globals are bad, but this is a private (to this module) global
+# No other module can import/access this (well, python doesn't enforce this, but all linters will complain)
+__RMR_LOOP__ = None
 
 
-def _init_rmr():
+class _RmrLoop:
     """
-    init an rmr context
-    This gets monkeypatched out for unit testing
+    class represents an rmr loop that constantly reads from rmr and performs operations based on waiting messages
+    this launches a thread, it should probably only be called once; the public facing method to access these ensures this
     """
-    # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an
-    # internal ring of messages, and receive calls read from that
-    # currently the size is 2048 messages, so this is fine for the foreseeable future
-    logger.debug("Waiting for rmr to initialize..")
-    mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL)
-    while rmr.rmr_ready(mrc) == 0:
-        time.sleep(0.5)
 
-    return mrc
-
-
-def _send(mrc, payload, message_type=0):
-    """
-    Sends a message up to RETRY_TIMES
-    If the message is sent successfully, it returns the transactionid
-    Does nothing otherwise
-    """
-    # TODO: investigate moving this below and allocating the space based on the payload size
-    sbuf = rmr.rmr_alloc_msg(mrc, 4096)
-    payload = payload if isinstance(payload, bytes) else payload.encode("utf-8")
-
-    # retry RETRY_TIMES to send the message
-    for _ in range(0, RETRY_TIMES):
-        # setup the send message
-        rmr.set_payload_and_length(payload, sbuf)
-        rmr.generate_and_set_transaction_id(sbuf)
-        sbuf.contents.state = 0
-        sbuf.contents.mtype = message_type
-        pre_send_summary = rmr.message_summary(sbuf)
-        logger.debug("Pre message send summary: %s", pre_send_summary)
-        transaction_id = pre_send_summary["transaction id"]  # save the transactionid because we need it later
-
-        # send
-        sbuf = rmr.rmr_send_msg(mrc, sbuf)
-        post_send_summary = rmr.message_summary(sbuf)
-        logger.debug("Post message send summary: %s", rmr.message_summary(sbuf))
-
-        # check success or failure
-        if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK":
-            # we are good
-            logger.debug("Message sent successfully!")
-            rmr.rmr_free_msg(sbuf)
-            return transaction_id
-
-    # we failed all RETRY_TIMES
-    logger.debug("Send failed all %s times, stopping", RETRY_TIMES)
-    rmr.rmr_free_msg(sbuf)
-    return None
-
-
-# Public
-
-
-def queue_work(item):
-    """
-    push an item into the work queue
-    currently the only type of work is to send out messages
-    """
-    _SEND_QUEUE.put(item)
-
-
-class RmrLoop:
-    """
-    class represents an rmr loop meant to be called as a longstanding separate thread
-    """
-
-    def __init__(self, _init_func_override=None, rcv_func_override=None):
-        self._rmr_is_ready = False
-        self._keep_going = True
-        self._init_func_override = _init_func_override  # useful for unit testing
-        self._rcv_func_override = rcv_func_override  # useful for unit testing to mock certain recieve scenarios
-        self._rcv_func = None
-
-    def rmr_is_ready(self):
-        """returns whether rmr has been initialized"""
-        return self._rmr_is_ready
-
-    def stop(self):
-        """sets a flag for the loop to end"""
-        self._keep_going = False
+    def __init__(self, init_func_override=None, rcv_func_override=None):
+        self.keep_going = True
+        self.rcv_func = None
+        self.last_ran = time.time()
+        self.work_queue = queue.Queue()  # thread safe queue https://docs.python.org/3/library/queue.html
+
+        # intialize rmr context
+        if init_func_override:
+            self.mrc = init_func_override()
+        else:
+            logger.debug("Waiting for rmr to initialize..")
+            # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an
+            # internal ring of messages, and receive calls read from that
+            # currently the size is 2048 messages, so this is fine for the foreseeable future
+            self.mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL)
+            while rmr.rmr_ready(self.mrc) == 0:
+                time.sleep(0.5)
+
+        # set the receive function
+        self.rcv_func = rcv_func_override if rcv_func_override else lambda: helpers.rmr_rcvall_msgs(self.mrc, [21024])
+
+        # start the work loop
+        self.thread = Thread(target=self.loop)
+        self.thread.start()
 
     def loop(self):
         """
-        This loop runs in an a1 thread forever, and has 3 jobs:
+        This loop runs forever, and has 3 jobs:
         - send out any messages that have to go out (create instance, delete instance)
         - read a1s mailbox and update the status of all instances based on acks from downstream policy handlers
         - clean up the database (eg delete the instance) under certain conditions based on those statuses (NOT DONE YET)
         """
-
-        # get a context
-        mrc = self._init_func_override() if self._init_func_override else _init_rmr()
-        self._rmr_is_ready = True
-        logger.debug("Rmr is ready")
-
-        # set the receive function called below
-        self._rcv_func = (
-            self._rcv_func_override if self._rcv_func_override else lambda: helpers.rmr_rcvall_msgs(mrc, [21024])
-        )
-
         # loop forever
         logger.debug("Work loop starting")
-        while self._keep_going:
+        while self.keep_going:
+
             # send out all messages waiting for us
-            while not _SEND_QUEUE.empty():
-                work_item = _SEND_QUEUE.get(block=False, timeout=None)
-                _send(mrc, payload=work_item["payload"], message_type=work_item["msg type"])
+            while not self.work_queue.empty():
+                work_item = self.work_queue.get(block=False, timeout=None)
+
+                pay = work_item["payload"].encode("utf-8")
+                for _ in range(0, RETRY_TIMES):
+                    # Waiting on an rmr bugfix regarding the over-allocation: https://rancodev.atlassian.net/browse/RICPLT-2490
+                    sbuf = rmr.rmr_alloc_msg(self.mrc, 4096, pay, True, work_item["msg type"])
+                    pre_send_summary = rmr.message_summary(sbuf)
+                    sbuf = rmr.rmr_send_msg(self.mrc, sbuf)  # send
+                    post_send_summary = rmr.message_summary(sbuf)
+                    logger.debug("Pre-send summary: %s, Post-send summary: %s", pre_send_summary, post_send_summary)
+                    rmr.rmr_free_msg(sbuf)  # free
+                    if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK":
+                        logger.debug("Message sent successfully!")
+                        break
 
             # read our mailbox and update statuses
             updated_instances = set()
-            for msg in self._rcv_func():
+            for msg in self.rcv_func():
                 try:
                     pay = json.loads(msg["payload"])
                     pti = pay["policy_type_id"]
@@ -162,18 +114,47 @@ class RmrLoop:
             for ut in updated_instances:
                 data.clean_up_instance(ut[0], ut[1])
 
-        # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component
-        time.sleep(1)
+            # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component
+            self.last_ran = time.time()
+            time.sleep(1)
+
+
+# Public
 
 
 def start_rmr_thread(init_func_override=None, rcv_func_override=None):
     """
     Start a1s rmr thread
-    Also called during unit testing
     """
-    rmr_loop = RmrLoop(init_func_override, rcv_func_override)
-    thread = Thread(target=rmr_loop.loop)
-    thread.start()
-    while not rmr_loop.rmr_is_ready():
-        time.sleep(0.5)
-    return rmr_loop  # return the handle; useful during unit testing
+    global __RMR_LOOP__
+    if __RMR_LOOP__ is None:
+        __RMR_LOOP__ = _RmrLoop(init_func_override, rcv_func_override)
+
+
+def stop_rmr_thread():
+    """
+    stops the rmr thread
+    """
+    __RMR_LOOP__.keep_going = False
+
+
+def queue_work(item):
+    """
+    push an item into the work queue
+    currently the only type of work is to send out messages
+    """
+    __RMR_LOOP__.work_queue.put(item)
+
+
+def healthcheck_rmr_thread(seconds=30):
+    """
+    returns a boolean representing whether the rmr loop is healthy, by checking two attributes:
+    1. is it running?,
+    2. is it stuck in a long (> seconds) loop?
+    """
+    return __RMR_LOOP__.thread.is_alive() and ((time.time() - __RMR_LOOP__.last_ran) < seconds)
+
+
+def replace_rcv_func(rcv_func):
+    """purely for the ease of unit testing to test different rcv scenarios"""
+    __RMR_LOOP__.rcv_func = rcv_func
diff --git a/a1/controller.py b/a1/controller.py
index b3ce88e..75d8456 100644
--- a/a1/controller.py
+++ b/a1/controller.py
@@ -63,9 +63,14 @@ def _gen_body_to_handler(operation, policy_type_id, policy_instance_id, payload=
 def get_healthcheck():
     """
     Handles healthcheck GET
-    Currently, this basically checks the server is alive
-    """
-    return "", 200
+    Currently, this checks:
+    1. whether the a1 webserver is up (if it isn't, this won't even be called, so even entering this function confirms it is)
+    2. checks whether the rmr thread is running and has completed a loop recently
+    TODO: make "seconds" to pass in a configurable parameter?
+    """
+    if a1rmr.healthcheck_rmr_thread():
+        return "", 200
+    return "rmr thread is unhealthy", 500
 
 
 # Policy types
diff --git a/container-tag.yaml b/container-tag.yaml
index 554b483..2bb40a9 100644
--- a/container-tag.yaml
+++ b/container-tag.yaml
@@ -1,4 +1,4 @@
 # The Jenkins job uses this string for the tag in the image name
 # for example nexus3.o-ran-sc.org:10004/my-image-name:my-tag
 ---
-tag: 1.0.3
+tag: 1.0.4
diff --git a/docs/release-notes.rst b/docs/release-notes.rst
index 59676d0..5173a25 100644
--- a/docs/release-notes.rst
+++ b/docs/release-notes.rst
@@ -29,6 +29,16 @@ and this project adheres to `Semantic Versioning <http://semver.org/>`__.
 
     * Represents a resillent version of 1.0.0 that uses Redis for persistence
 
+[1.0.4]
+
+::
+
+    * Only external change here is to healthcheck the rmr thread as part of a1s healthcheck. k8s will now respin a1 if that is failing.
+    * Refactors (simplifies) how we wait for rmr initialization; it is now called as part of __init__
+    * Refactors (simplifies) how the thread is actually launched; it is now internal to the object and also a part of __init__
+    * Cleans up unit testing; a1rmr now exposes a replace_rcv_func; useful for unit testing, harmless if not called otherwise
+    * Upgrades to rmr-python 1.0.0 for simpler message allocation
+
 [1.0.3] - 10/22/2019
 
 ::
diff --git a/integration_tests/Dockerfile b/integration_tests/Dockerfile
index e7bfe80..87a8dce 100644
--- a/integration_tests/Dockerfile
+++ b/integration_tests/Dockerfile
@@ -34,7 +34,7 @@ COPY receiver.py /
 
 # Install RMr python bindings
 RUN pip install --upgrade pip
-RUN pip install rmr==0.13.5
+RUN pip install rmr==1.0.0
 
 # rmr setups
 RUN mkdir -p /opt/route/
diff --git a/integration_tests/a1mediator/Chart.yaml b/integration_tests/a1mediator/Chart.yaml
index d0c73be..e59dc91 100644
--- a/integration_tests/a1mediator/Chart.yaml
+++ b/integration_tests/a1mediator/Chart.yaml
@@ -1,4 +1,4 @@
 apiVersion: v1
 description: A1 Helm chart for Kubernetes
 name: a1mediator
-version: 1.0.3
+version: 1.0.4
diff --git a/integration_tests/receiver.py b/integration_tests/receiver.py
index d73dcd9..fedc690 100644
--- a/integration_tests/receiver.py
+++ b/integration_tests/receiver.py
@@ -35,7 +35,7 @@ while rmr.rmr_ready(mrc) == 0:
 
 print("listening ON {}".format(PORT))
 while True:
-    sbuf = rmr.rmr_alloc_msg(mrc, 4096)
+    sbuf = rmr.rmr_alloc_msg(mrc, 10)
     sbuf = rmr.rmr_torcv_msg(mrc, sbuf, 1000)
     summary = rmr.message_summary(sbuf)
     if summary["message state"] == 12 and summary["message status"] == "RMR_ERR_TIMEOUT":
diff --git a/setup.py b/setup.py
index f8a2923..2263cc4 100644
--- a/setup.py
+++ b/setup.py
@@ -18,13 +18,13 @@ from setuptools import setup, find_packages
 
 setup(
     name="a1",
-    version="1.0.3",
+    version="1.0.4",
     packages=find_packages(exclude=["tests.*", "tests"]),
     author="Tommy Carpenter",
     description="RIC A1 Mediator for policy/intent changes",
     url="https://gerrit.o-ran-sc.org/r/admin/repos/ric-plt/a1",
     entry_points={"console_scripts": ["run.py=a1.run:main"]},
     # we require jsonschema, should be in that list, but connexion already requires a specific version of it
-    install_requires=["requests", "Flask", "connexion[swagger-ui]", "gevent", "msgpack", "rmr>=0.13.5"],
+    install_requires=["requests", "Flask", "connexion[swagger-ui]", "gevent", "msgpack", "rmr>=1.0.0"],
     package_data={"a1": ["openapi.yaml"]},
 )
diff --git a/tests/test_controller.py b/tests/test_controller.py
index 63831b0..756a0d0 100644
--- a/tests/test_controller.py
+++ b/tests/test_controller.py
@@ -77,7 +77,7 @@ def _test_put_patch(monkeypatch):
     monkeypatch.setattr("rmr.rmr.rmr_free_msg", noop)
 
     # we need to repatch alloc (already patched in patch_rmr) to fix the transactionid, alloc is called in send and recieve
-    def fake_alloc(_unused, _alsounused):
+    def fake_alloc(_unused1, _unused2, _unused3, _unused4, _unused5):
         sbuf = rmr_mocks.Rmr_mbuf_t()
         sbuf.contents.xaction = b"d49b53e478b711e9a1130242ac110002"
         return sbuf
@@ -94,18 +94,14 @@ def _test_put_patch(monkeypatch):
 # Module level Hack
 
 
-RMR_THREAD = None
-
-
 def setup_module():
     """module level setup"""
-    global RMR_THREAD
 
     def noop():
         pass
 
     # launch the thread with a fake init func and a patched rcv func; we will "repatch" later
-    RMR_THREAD = a1rmr.start_rmr_thread(init_func_override=noop, rcv_func_override=_fake_dequeue_none)
+    a1rmr.start_rmr_thread(init_func_override=noop, rcv_func_override=_fake_dequeue_none)
 
 
 # Actual Tests
@@ -188,7 +184,7 @@ def test_workflow(client, monkeypatch, adm_type_good, adm_instance_good):
     get_instance_good("NOT IN EFFECT")
 
     # now pretend we did get a good ACK
-    RMR_THREAD._rcv_func = _fake_dequeue
+    a1rmr.replace_rcv_func(_fake_dequeue)
     time.sleep(1)  # wait for the rmr thread
     get_instance_good("IN EFFECT")
 
@@ -207,7 +203,7 @@ def test_workflow(client, monkeypatch, adm_type_good, adm_instance_good):
     get_instance_good("IN EFFECT")
 
     # now pretend we deleted successfully
-    RMR_THREAD._rcv_func = _fake_dequeue_deleted
+    a1rmr.replace_rcv_func(_fake_dequeue_deleted)
     time.sleep(1)  # wait for the rmr thread
     # list still 200 but no instance
     res = client.get(ADM_CTRL_POLICIES)
@@ -251,7 +247,7 @@ def test_bad_instances(client, monkeypatch, adm_type_good):
     assert res.status_code == 404
 
     # get a non existent instance
-    RMR_THREAD._rcv_func = _fake_dequeue
+    a1rmr.replace_rcv_func(_fake_dequeue)
     time.sleep(1)
     res = client.get(ADM_CTRL_INSTANCE + "DARKNESS")
     assert res.status_code == 404
@@ -261,7 +257,7 @@ def test_bad_instances(client, monkeypatch, adm_type_good):
     assert res.status_code == 204
 
 
-def test_illegal_types(client, monkeypatch, adm_type_good):
+def test_illegal_types(client, adm_type_good):
     """
     Test illegal types
     """
@@ -281,4 +277,4 @@ def test_healthcheck(client):
 
 def teardown_module():
     """module teardown"""
-    RMR_THREAD.stop()
+    a1rmr.stop_rmr_thread()
-- 
2.16.6