Cleanups only (no code changes)

[ric-plt/a1.git] / a1 / a1rmr.py
diff --git a/a1/a1rmr.py b/a1/a1rmr.py

index 589affd..58ec1c0 100644 (file)
--- a/a1/a1rmr.py
+++ b/a1/a1rmr.py
@@ -1,6 +1,9 @@
+"""
+a1s rmr functionality
+"""
  # ==================================================================================
-#       Copyright (c) 2019 Nokia
-#       Copyright (c) 2018-2019 AT&T Intellectual Property.
+#       Copyright (c) 2019-2020 Nokia
+#       Copyright (c) 2018-2020 AT&T Intellectual Property.
  #
  #   Licensed under the Apache License, Version 2.0 (the "License");
  #   you may not use this file except in compliance with the License.
@@ -15,149 +18,200 @@
  #   limitations under the License.
  # ==================================================================================
  import os
-import gevent
-from rmr import rmr
-from a1 import get_module_logger
-from a1.exceptions import MessageSendFailure, ExpectedAckNotReceived
+import queue
+import time
+import json
+from threading import Thread
+from rmr import rmr, helpers
+from mdclogpy import Logger
+from a1 import data, messages
+from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound
  
-logger = get_module_logger(__name__)
+mdc_logger = Logger(name=__name__)
  
  
-RMR_RCV_RETRY_INTERVAL = int(os.environ.get("RMR_RCV_RETRY_INTERVAL", 1000))
-RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4))
-MRC = None
+RETRY_TIMES = int(os.environ.get("A1_RMR_RETRY_TIMES", 4))
+A1_POLICY_REQUEST = 20010
+A1_POLICY_RESPONSE = 20011
+A1_POLICY_QUERY = 20012
  
  
-RECEIVED_MESSAGES = []  # used to store messages we need but havent been procedded yet
-WAITING_TRANSIDS = {}  # used to store transactionids we are waiting for, so we can filter other stuff out
+# Note; yes, globals are bad, but this is a private (to this module) global
+# No other module can import/access this (well, python doesn't enforce this, but all linters will complain)
+__RMR_LOOP__ = None
  
  
-def _dequeue_all_waiting_messages():
+class _RmrLoop:
      """
-    dequeue all waiting rmr messages from rmr, put them into RECEIVED_MESSAGES
+    class represents an rmr loop that constantly reads from rmr and performs operations based on waiting messages
+    this launches a thread, it should probably only be called once; the public facing method to access these ensures this
      """
-    new_messages = []
-    sbuf = rmr.rmr_alloc_msg(MRC, 4096)
-    while True:
-        sbuf = rmr.rmr_torcv_msg(MRC, sbuf, 0)  # set the timeout to 0 so this doesn't block!!
-        summary = rmr.message_summary(sbuf)
-        if summary["message state"] == 12 and summary["message status"] == "RMR_ERR_TIMEOUT":
-            break
-        elif summary["transaction id"] in WAITING_TRANSIDS:  # message is relevent
-            new_messages.append(summary)
-        else:
-            logger.debug("A message was received by a1, but a1 was not expecting it! It's being dropped: %s", summary)
-            # do nothing with message, effectively dropped
-    return new_messages
  
+    def __init__(self, init_func_override=None, rcv_func_override=None):
+        self.keep_going = True
+        self.rcv_func = None
+        self.last_ran = time.time()
  
-def _check_if_ack_received(target_transid, target_type):
-    """
-    Try to recieve the latest messages, then search the current queue for the target ACK
-    TODO: probably a slightly more efficient data structure than list. Maybe a dict by message type
-        However, in the near term, where there are not many xapps under A1, this is fine. Revisit later.
-    TODO: do we need to deal with duplicate ACKs for the same transaction id?
-        Is it possible if the downstream xapp uses rmr_rts? Might be harmless to sit in queue.. might slow things
+        # see docs/overview#resiliency for a discussion of this
+        self.instance_send_queue = queue.Queue()  # thread safe queue https://docs.python.org/3/library/queue.html
  
+        # intialize rmr context
+        if init_func_override:
+            self.mrc = init_func_override()
+        else:
+            mdc_logger.debug("Waiting for rmr to initialize..")
+            # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an
+            # internal ring of messages, and receive calls read from that
+            # currently the size is 2048 messages, so this is fine for the foreseeable future
+            self.mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL)
+            while rmr.rmr_ready(self.mrc) == 0:
+                time.sleep(0.5)
+
+        # set the receive function
+        self.rcv_func = (
+            rcv_func_override
+            if rcv_func_override
+            else lambda: helpers.rmr_rcvall_msgs_raw(self.mrc, [A1_POLICY_RESPONSE, A1_POLICY_QUERY])
+        )
+
+        # start the work loop
+        self.thread = Thread(target=self.loop)
+        self.thread.start()
+
+    def _assert_good_send(self, sbuf, pre_send_summary):
+        """
+        common helper function for _send_msg and _rts_msg
+        """
+        post_send_summary = rmr.message_summary(sbuf)
+        if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK":
+            return True
+        mdc_logger.debug("Message NOT sent!")
+        mdc_logger.debug("Pre-send summary: {0}, Post-send summary: {1}".format(pre_send_summary, post_send_summary))
+        return False
+
+    def _send_msg(self, pay, mtype, subid):
+        """
+        sends a msg
+        """
+        for _ in range(0, RETRY_TIMES):
+            sbuf = rmr.rmr_alloc_msg(self.mrc, len(pay), payload=pay, gen_transaction_id=True, mtype=mtype, sub_id=subid)
+            sbuf.contents.sub_id = subid
+            pre_send_summary = rmr.message_summary(sbuf)
+            sbuf = rmr.rmr_send_msg(self.mrc, sbuf)  # send
+            if self._assert_good_send(sbuf, pre_send_summary):
+                rmr.rmr_free_msg(sbuf)  # free
+                break
+
+    def _rts_msg(self, pay, sbuf_rts, mtype):
+        """
+        sends a message using rts
+        we do not call free here because we may rts many times; it is called after the rts loop
+        """
+        for _ in range(0, RETRY_TIMES):
+            pre_send_summary = rmr.message_summary(sbuf_rts)
+            sbuf_rts = rmr.rmr_rts_msg(self.mrc, sbuf_rts, payload=pay, mtype=mtype)
+            if self._assert_good_send(sbuf_rts, pre_send_summary):
+                break
+        return sbuf_rts  # in some cases rts may return a new sbuf
+
+    def loop(self):
+        """
+        This loop runs forever, and has 3 jobs:
+        - send out any messages that have to go out (create instance, delete instance)
+        - read a1s mailbox and update the status of all instances based on acks from downstream policy handlers
+        - clean up the database (eg delete the instance) under certain conditions based on those statuses (NOT DONE YET)
+        """
+        # loop forever
+        mdc_logger.debug("Work loop starting")
+        while self.keep_going:
+
+            # send out all messages waiting for us
+            while not self.instance_send_queue.empty():
+                work_item = self.instance_send_queue.get(block=False, timeout=None)
+                payload = json.dumps(messages.a1_to_handler(*work_item)).encode("utf-8")
+                self._send_msg(payload, A1_POLICY_REQUEST, work_item[1])
+
+            # read our mailbox
+            for (msg, sbuf) in self.rcv_func():
+                # TODO: in the future we may also have to catch SDL errors
+                try:
+                    mtype = msg["message type"]
+                except (KeyError, TypeError, json.decoder.JSONDecodeError):
+                    mdc_logger.debug("Dropping malformed policy ack/query message: {0}".format(msg))
+
+                if mtype == A1_POLICY_RESPONSE:
+                    try:
+                        # got a policy response, update status
+                        pay = json.loads(msg["payload"])
+                        data.set_policy_instance_status(
+                            pay["policy_type_id"], pay["policy_instance_id"], pay["handler_id"], pay["status"]
+                        )
+                        mdc_logger.debug("Successfully received status update: {0}".format(pay))
+                    except (PolicyTypeNotFound, PolicyInstanceNotFound):
+                        mdc_logger.debug("Received a response  for a non-existent instance")
+                    except (KeyError, TypeError, json.decoder.JSONDecodeError):
+                        mdc_logger.debug("Dropping malformed policy ack message: {0}".format(msg))
+
+                elif mtype == A1_POLICY_QUERY:
+                    try:
+                        # got a query, do a lookup and send out all instances
+                        pti = json.loads(msg["payload"])["policy_type_id"]
+                        mdc_logger.debug("Received query for: {0}".format(pti))
+                        for pii in data.get_instance_list(pti):
+                            instance = data.get_policy_instance(pti, pii)
+                            payload = json.dumps(messages.a1_to_handler("CREATE", pti, pii, instance)).encode("utf-8")
+                            sbuf = self._rts_msg(payload, sbuf, A1_POLICY_REQUEST)
+                    except (PolicyTypeNotFound, PolicyInstanceNotFound):
+                        mdc_logger.debug("Received a query for a non-existent type: {0}".format(msg))
+                    except (KeyError, TypeError, json.decoder.JSONDecodeError):
+                        mdc_logger.debug("Dropping malformed policy query message: {0}".format(msg))
+
+                else:
+                    mdc_logger.debug("Received message type {0} but A1 does not handle this".format(mtype))
+
+                # we must free each sbuf
+                rmr.rmr_free_msg(sbuf)
+
+            self.last_ran = time.time()
+            time.sleep(1)
+
+
+# Public
+
+
+def start_rmr_thread(init_func_override=None, rcv_func_override=None):
      """
-    new_messages = _dequeue_all_waiting_messages()  # dequeue all waiting messages
-    global RECEIVED_MESSAGES  # this is ugly, but fine.. we just need an in memory list across the async calls
-    RECEIVED_MESSAGES += new_messages
-    for index, summary in enumerate(RECEIVED_MESSAGES):  # Search the queue for the target message
-        if (
-            summary["message state"] == 0
-            and summary["message status"] == "RMR_OK"
-            and summary["message type"] == target_type
-            and summary["transaction id"] == target_transid
-        ):  # Found; delete it from queue
-            del RECEIVED_MESSAGES[index]
-            return summary
-    return None
-
-
-def init_rmr():
-    """
-    called from run; not called for unit tests
+    Start a1s rmr thread
      """
-    global MRC
-    MRC = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, 0x00)
-
-    while rmr.rmr_ready(MRC) == 0:
-        gevent.sleep(1)
-        logger.debug("not yet ready")
+    global __RMR_LOOP__
+    if __RMR_LOOP__ is None:
+        __RMR_LOOP__ = _RmrLoop(init_func_override, rcv_func_override)
  
  
-def send(payload, message_type=0):
+def stop_rmr_thread():
      """
-    sends a message up to RETRY_TIMES
-    If the message is sent successfully, it returns the transactionid
-    Raises an exception (MessageSendFailure) otherwise
+    stops the rmr thread
      """
-    # we may be called many times in asyncronous loops, so for now, it is safer not to share buffers. We can investifgate later whether this is really a problem.
-    sbuf = rmr.rmr_alloc_msg(MRC, 4096)
-    payload = payload if isinstance(payload, bytes) else payload.encode("utf-8")
-
-    # retry RETRY_TIMES to send the message
-    tried = 0
-    while True:
-        # setup the send message
-        rmr.set_payload_and_length(payload, sbuf)
-        rmr.generate_and_set_transaction_id(sbuf)
-        sbuf.contents.state = 0
-        sbuf.contents.mtype = message_type
-        pre_send_summary = rmr.message_summary(sbuf)
-        logger.debug("Pre message send summary: %s", pre_send_summary)
-        transaction_id = pre_send_summary["transaction id"]  # save the transactionid because we need it later
-
-        # send
-        sbuf = rmr.rmr_send_msg(MRC, sbuf)
-        post_send_summary = rmr.message_summary(sbuf)
-        logger.debug("Post message send summary: %s", rmr.message_summary(sbuf))
-
-        # check success or failure
-        if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK":
-            return transaction_id  # we are good
-        if post_send_summary["message state"] == 10 and post_send_summary["message status"] == "RMR_ERR_RETRY":
-            # in this state, we should retry
-            if tried == RETRY_TIMES:
-                # we have tried RETRY_TIMES and we are still not getting a good state, raise an exception and let the caller deal with it
-                raise MessageSendFailure(str(post_send_summary))
-            else:
-                tried += 1
-        else:
-            # we hit a state where we should not even retry
-            raise MessageSendFailure(str(post_send_summary))
+    __RMR_LOOP__.keep_going = False
  
  
-def send_ack_retry(payload, expected_ack_message_type, message_type=0):
+def queue_instance_send(item):
      """
-    send a message and check for an ACK.
-    If no ACK is recieved, defer execution for RMR_RCV_RETRY_INTERVAL ms, then check again.
-    If no ack is received before the timeout (set by _rmr_init), send again and try again up to RETRY_TIMES
+    push an item into the work queue
+    currently the only type of work is to send out messages
+    """
+    __RMR_LOOP__.instance_send_queue.put(item)
  
-    It is critical here to set the RMR_TIMEOUT to 0 in the rmr_rcv_to function, which causes that function NOT to block.
-    Instead, if the message isn't there, we give up execution for the interval, which allows the gevent server to process other requests in the meantime.
  
-    Amazing props to https://sdiehl.github.io/gevent-tutorial/
-    (which also runs this whole server)
+def healthcheck_rmr_thread(seconds=30):
+    """
+    returns a boolean representing whether the rmr loop is healthy, by checking two attributes:
+    1. is it running?,
+    2. is it stuck in a long (> seconds) loop?
      """
+    return __RMR_LOOP__.thread.is_alive() and ((time.time() - __RMR_LOOP__.last_ran) < seconds)
  
-    # try to send the msg to the downstream policy handler
-    expected_transaction_id = send(payload, message_type)
-    WAITING_TRANSIDS[expected_transaction_id] = 1
-
-    gevent.sleep(0.01)  # wait 10ms before we try the first recieve
-    for _ in range(0, RETRY_TIMES):
-        logger.debug("Seeing if return message is fufilled")
-        summary = _check_if_ack_received(expected_transaction_id, expected_ack_message_type)
-        if summary:
-            logger.debug("Target ack Message received!: %s", summary)
-            logger.debug("current queue size is %d", len(RECEIVED_MESSAGES))
-            del WAITING_TRANSIDS[expected_transaction_id]
-            return summary["payload"]
-        else:
-            logger.debug("Deffering execution for %s seconds", str(RMR_RCV_RETRY_INTERVAL / 1000))
-            gevent.sleep(RMR_RCV_RETRY_INTERVAL / 1000)
  
-    # we still didn't get the ACK we want
-    raise ExpectedAckNotReceived()
+def replace_rcv_func(rcv_func):
+    """purely for the ease of unit testing to test different rcv scenarios"""
+    __RMR_LOOP__.rcv_func = rcv_func