From 0b42dfc507b22b49669f360883a1cecaa50cda7b Mon Sep 17 00:00:00 2001 From: Tommy Carpenter Date: Thu, 24 Oct 2019 10:13:54 -0400 Subject: [PATCH] Threading pt3: * Only external change here is to healthcheck the rmr thread as part of a1s healthcheck. k8s will now respin a1 if that is failing. * Refactors (simplifies) how we wait for rmr initialization; it is now called as part of __init__ * Refactors (simplifies) how the thread is actually launched; it is now internal to the object and also a part of __init__ * Cleans up unit testing; a1rmr now exposes a replace_rcv_func; useful for unit testing, harmless if not called otherwise * Upgrades to rmr-python 1.0.0 for simpler message allocation Change-Id: I7106f8f35e94ba1c638d782609974bf4d4ba7c22 Signed-off-by: Tommy Carpenter --- a1/a1rmr.py | 199 +++++++++++++++----------------- a1/controller.py | 11 +- container-tag.yaml | 2 +- docs/release-notes.rst | 10 ++ integration_tests/Dockerfile | 2 +- integration_tests/a1mediator/Chart.yaml | 2 +- integration_tests/receiver.py | 2 +- setup.py | 4 +- tests/test_controller.py | 18 ++- 9 files changed, 121 insertions(+), 129 deletions(-) diff --git a/a1/a1rmr.py b/a1/a1rmr.py index 359d2e0..fd0d87f 100644 --- a/a1/a1rmr.py +++ b/a1/a1rmr.py @@ -1,3 +1,6 @@ +""" +a1s rmr functionality +""" # ================================================================================== # Copyright (c) 2019 Nokia # Copyright (c) 2018-2019 AT&T Intellectual Property. @@ -29,124 +32,73 @@ logger = get_module_logger(__name__) RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4)) -_SEND_QUEUE = queue.Queue() # thread safe queue https://docs.python.org/3/library/queue.html +# Note; yes, globals are bad, but this is a private (to this module) global +# No other module can import/access this (well, python doesn't enforce this, but all linters will complain) +__RMR_LOOP__ = None -def _init_rmr(): +class _RmrLoop: """ - init an rmr context - This gets monkeypatched out for unit testing + class represents an rmr loop that constantly reads from rmr and performs operations based on waiting messages + this launches a thread, it should probably only be called once; the public facing method to access these ensures this """ - # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an - # internal ring of messages, and receive calls read from that - # currently the size is 2048 messages, so this is fine for the foreseeable future - logger.debug("Waiting for rmr to initialize..") - mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL) - while rmr.rmr_ready(mrc) == 0: - time.sleep(0.5) - return mrc - - -def _send(mrc, payload, message_type=0): - """ - Sends a message up to RETRY_TIMES - If the message is sent successfully, it returns the transactionid - Does nothing otherwise - """ - # TODO: investigate moving this below and allocating the space based on the payload size - sbuf = rmr.rmr_alloc_msg(mrc, 4096) - payload = payload if isinstance(payload, bytes) else payload.encode("utf-8") - - # retry RETRY_TIMES to send the message - for _ in range(0, RETRY_TIMES): - # setup the send message - rmr.set_payload_and_length(payload, sbuf) - rmr.generate_and_set_transaction_id(sbuf) - sbuf.contents.state = 0 - sbuf.contents.mtype = message_type - pre_send_summary = rmr.message_summary(sbuf) - logger.debug("Pre message send summary: %s", pre_send_summary) - transaction_id = pre_send_summary["transaction id"] # save the transactionid because we need it later - - # send - sbuf = rmr.rmr_send_msg(mrc, sbuf) - post_send_summary = rmr.message_summary(sbuf) - logger.debug("Post message send summary: %s", rmr.message_summary(sbuf)) - - # check success or failure - if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK": - # we are good - logger.debug("Message sent successfully!") - rmr.rmr_free_msg(sbuf) - return transaction_id - - # we failed all RETRY_TIMES - logger.debug("Send failed all %s times, stopping", RETRY_TIMES) - rmr.rmr_free_msg(sbuf) - return None - - -# Public - - -def queue_work(item): - """ - push an item into the work queue - currently the only type of work is to send out messages - """ - _SEND_QUEUE.put(item) - - -class RmrLoop: - """ - class represents an rmr loop meant to be called as a longstanding separate thread - """ - - def __init__(self, _init_func_override=None, rcv_func_override=None): - self._rmr_is_ready = False - self._keep_going = True - self._init_func_override = _init_func_override # useful for unit testing - self._rcv_func_override = rcv_func_override # useful for unit testing to mock certain recieve scenarios - self._rcv_func = None - - def rmr_is_ready(self): - """returns whether rmr has been initialized""" - return self._rmr_is_ready - - def stop(self): - """sets a flag for the loop to end""" - self._keep_going = False + def __init__(self, init_func_override=None, rcv_func_override=None): + self.keep_going = True + self.rcv_func = None + self.last_ran = time.time() + self.work_queue = queue.Queue() # thread safe queue https://docs.python.org/3/library/queue.html + + # intialize rmr context + if init_func_override: + self.mrc = init_func_override() + else: + logger.debug("Waiting for rmr to initialize..") + # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an + # internal ring of messages, and receive calls read from that + # currently the size is 2048 messages, so this is fine for the foreseeable future + self.mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL) + while rmr.rmr_ready(self.mrc) == 0: + time.sleep(0.5) + + # set the receive function + self.rcv_func = rcv_func_override if rcv_func_override else lambda: helpers.rmr_rcvall_msgs(self.mrc, [21024]) + + # start the work loop + self.thread = Thread(target=self.loop) + self.thread.start() def loop(self): """ - This loop runs in an a1 thread forever, and has 3 jobs: + This loop runs forever, and has 3 jobs: - send out any messages that have to go out (create instance, delete instance) - read a1s mailbox and update the status of all instances based on acks from downstream policy handlers - clean up the database (eg delete the instance) under certain conditions based on those statuses (NOT DONE YET) """ - - # get a context - mrc = self._init_func_override() if self._init_func_override else _init_rmr() - self._rmr_is_ready = True - logger.debug("Rmr is ready") - - # set the receive function called below - self._rcv_func = ( - self._rcv_func_override if self._rcv_func_override else lambda: helpers.rmr_rcvall_msgs(mrc, [21024]) - ) - # loop forever logger.debug("Work loop starting") - while self._keep_going: + while self.keep_going: + # send out all messages waiting for us - while not _SEND_QUEUE.empty(): - work_item = _SEND_QUEUE.get(block=False, timeout=None) - _send(mrc, payload=work_item["payload"], message_type=work_item["msg type"]) + while not self.work_queue.empty(): + work_item = self.work_queue.get(block=False, timeout=None) + + pay = work_item["payload"].encode("utf-8") + for _ in range(0, RETRY_TIMES): + # Waiting on an rmr bugfix regarding the over-allocation: https://rancodev.atlassian.net/browse/RICPLT-2490 + sbuf = rmr.rmr_alloc_msg(self.mrc, 4096, pay, True, work_item["msg type"]) + pre_send_summary = rmr.message_summary(sbuf) + sbuf = rmr.rmr_send_msg(self.mrc, sbuf) # send + post_send_summary = rmr.message_summary(sbuf) + logger.debug("Pre-send summary: %s, Post-send summary: %s", pre_send_summary, post_send_summary) + rmr.rmr_free_msg(sbuf) # free + if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK": + logger.debug("Message sent successfully!") + break # read our mailbox and update statuses updated_instances = set() - for msg in self._rcv_func(): + for msg in self.rcv_func(): try: pay = json.loads(msg["payload"]) pti = pay["policy_type_id"] @@ -162,18 +114,47 @@ class RmrLoop: for ut in updated_instances: data.clean_up_instance(ut[0], ut[1]) - # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component - time.sleep(1) + # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component + self.last_ran = time.time() + time.sleep(1) + + +# Public def start_rmr_thread(init_func_override=None, rcv_func_override=None): """ Start a1s rmr thread - Also called during unit testing """ - rmr_loop = RmrLoop(init_func_override, rcv_func_override) - thread = Thread(target=rmr_loop.loop) - thread.start() - while not rmr_loop.rmr_is_ready(): - time.sleep(0.5) - return rmr_loop # return the handle; useful during unit testing + global __RMR_LOOP__ + if __RMR_LOOP__ is None: + __RMR_LOOP__ = _RmrLoop(init_func_override, rcv_func_override) + + +def stop_rmr_thread(): + """ + stops the rmr thread + """ + __RMR_LOOP__.keep_going = False + + +def queue_work(item): + """ + push an item into the work queue + currently the only type of work is to send out messages + """ + __RMR_LOOP__.work_queue.put(item) + + +def healthcheck_rmr_thread(seconds=30): + """ + returns a boolean representing whether the rmr loop is healthy, by checking two attributes: + 1. is it running?, + 2. is it stuck in a long (> seconds) loop? + """ + return __RMR_LOOP__.thread.is_alive() and ((time.time() - __RMR_LOOP__.last_ran) < seconds) + + +def replace_rcv_func(rcv_func): + """purely for the ease of unit testing to test different rcv scenarios""" + __RMR_LOOP__.rcv_func = rcv_func diff --git a/a1/controller.py b/a1/controller.py index b3ce88e..75d8456 100644 --- a/a1/controller.py +++ b/a1/controller.py @@ -63,9 +63,14 @@ def _gen_body_to_handler(operation, policy_type_id, policy_instance_id, payload= def get_healthcheck(): """ Handles healthcheck GET - Currently, this basically checks the server is alive - """ - return "", 200 + Currently, this checks: + 1. whether the a1 webserver is up (if it isn't, this won't even be called, so even entering this function confirms it is) + 2. checks whether the rmr thread is running and has completed a loop recently + TODO: make "seconds" to pass in a configurable parameter? + """ + if a1rmr.healthcheck_rmr_thread(): + return "", 200 + return "rmr thread is unhealthy", 500 # Policy types diff --git a/container-tag.yaml b/container-tag.yaml index 554b483..2bb40a9 100644 --- a/container-tag.yaml +++ b/container-tag.yaml @@ -1,4 +1,4 @@ # The Jenkins job uses this string for the tag in the image name # for example nexus3.o-ran-sc.org:10004/my-image-name:my-tag --- -tag: 1.0.3 +tag: 1.0.4 diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 59676d0..5173a25 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -29,6 +29,16 @@ and this project adheres to `Semantic Versioning `__. * Represents a resillent version of 1.0.0 that uses Redis for persistence +[1.0.4] + +:: + + * Only external change here is to healthcheck the rmr thread as part of a1s healthcheck. k8s will now respin a1 if that is failing. + * Refactors (simplifies) how we wait for rmr initialization; it is now called as part of __init__ + * Refactors (simplifies) how the thread is actually launched; it is now internal to the object and also a part of __init__ + * Cleans up unit testing; a1rmr now exposes a replace_rcv_func; useful for unit testing, harmless if not called otherwise + * Upgrades to rmr-python 1.0.0 for simpler message allocation + [1.0.3] - 10/22/2019 :: diff --git a/integration_tests/Dockerfile b/integration_tests/Dockerfile index e7bfe80..87a8dce 100644 --- a/integration_tests/Dockerfile +++ b/integration_tests/Dockerfile @@ -34,7 +34,7 @@ COPY receiver.py / # Install RMr python bindings RUN pip install --upgrade pip -RUN pip install rmr==0.13.5 +RUN pip install rmr==1.0.0 # rmr setups RUN mkdir -p /opt/route/ diff --git a/integration_tests/a1mediator/Chart.yaml b/integration_tests/a1mediator/Chart.yaml index d0c73be..e59dc91 100644 --- a/integration_tests/a1mediator/Chart.yaml +++ b/integration_tests/a1mediator/Chart.yaml @@ -1,4 +1,4 @@ apiVersion: v1 description: A1 Helm chart for Kubernetes name: a1mediator -version: 1.0.3 +version: 1.0.4 diff --git a/integration_tests/receiver.py b/integration_tests/receiver.py index d73dcd9..fedc690 100644 --- a/integration_tests/receiver.py +++ b/integration_tests/receiver.py @@ -35,7 +35,7 @@ while rmr.rmr_ready(mrc) == 0: print("listening ON {}".format(PORT)) while True: - sbuf = rmr.rmr_alloc_msg(mrc, 4096) + sbuf = rmr.rmr_alloc_msg(mrc, 10) sbuf = rmr.rmr_torcv_msg(mrc, sbuf, 1000) summary = rmr.message_summary(sbuf) if summary["message state"] == 12 and summary["message status"] == "RMR_ERR_TIMEOUT": diff --git a/setup.py b/setup.py index f8a2923..2263cc4 100644 --- a/setup.py +++ b/setup.py @@ -18,13 +18,13 @@ from setuptools import setup, find_packages setup( name="a1", - version="1.0.3", + version="1.0.4", packages=find_packages(exclude=["tests.*", "tests"]), author="Tommy Carpenter", description="RIC A1 Mediator for policy/intent changes", url="https://gerrit.o-ran-sc.org/r/admin/repos/ric-plt/a1", entry_points={"console_scripts": ["run.py=a1.run:main"]}, # we require jsonschema, should be in that list, but connexion already requires a specific version of it - install_requires=["requests", "Flask", "connexion[swagger-ui]", "gevent", "msgpack", "rmr>=0.13.5"], + install_requires=["requests", "Flask", "connexion[swagger-ui]", "gevent", "msgpack", "rmr>=1.0.0"], package_data={"a1": ["openapi.yaml"]}, ) diff --git a/tests/test_controller.py b/tests/test_controller.py index 63831b0..756a0d0 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -77,7 +77,7 @@ def _test_put_patch(monkeypatch): monkeypatch.setattr("rmr.rmr.rmr_free_msg", noop) # we need to repatch alloc (already patched in patch_rmr) to fix the transactionid, alloc is called in send and recieve - def fake_alloc(_unused, _alsounused): + def fake_alloc(_unused1, _unused2, _unused3, _unused4, _unused5): sbuf = rmr_mocks.Rmr_mbuf_t() sbuf.contents.xaction = b"d49b53e478b711e9a1130242ac110002" return sbuf @@ -94,18 +94,14 @@ def _test_put_patch(monkeypatch): # Module level Hack -RMR_THREAD = None - - def setup_module(): """module level setup""" - global RMR_THREAD def noop(): pass # launch the thread with a fake init func and a patched rcv func; we will "repatch" later - RMR_THREAD = a1rmr.start_rmr_thread(init_func_override=noop, rcv_func_override=_fake_dequeue_none) + a1rmr.start_rmr_thread(init_func_override=noop, rcv_func_override=_fake_dequeue_none) # Actual Tests @@ -188,7 +184,7 @@ def test_workflow(client, monkeypatch, adm_type_good, adm_instance_good): get_instance_good("NOT IN EFFECT") # now pretend we did get a good ACK - RMR_THREAD._rcv_func = _fake_dequeue + a1rmr.replace_rcv_func(_fake_dequeue) time.sleep(1) # wait for the rmr thread get_instance_good("IN EFFECT") @@ -207,7 +203,7 @@ def test_workflow(client, monkeypatch, adm_type_good, adm_instance_good): get_instance_good("IN EFFECT") # now pretend we deleted successfully - RMR_THREAD._rcv_func = _fake_dequeue_deleted + a1rmr.replace_rcv_func(_fake_dequeue_deleted) time.sleep(1) # wait for the rmr thread # list still 200 but no instance res = client.get(ADM_CTRL_POLICIES) @@ -251,7 +247,7 @@ def test_bad_instances(client, monkeypatch, adm_type_good): assert res.status_code == 404 # get a non existent instance - RMR_THREAD._rcv_func = _fake_dequeue + a1rmr.replace_rcv_func(_fake_dequeue) time.sleep(1) res = client.get(ADM_CTRL_INSTANCE + "DARKNESS") assert res.status_code == 404 @@ -261,7 +257,7 @@ def test_bad_instances(client, monkeypatch, adm_type_good): assert res.status_code == 204 -def test_illegal_types(client, monkeypatch, adm_type_good): +def test_illegal_types(client, adm_type_good): """ Test illegal types """ @@ -281,4 +277,4 @@ def test_healthcheck(client): def teardown_module(): """module teardown""" - RMR_THREAD.stop() + a1rmr.stop_rmr_thread() -- 2.16.6