X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=blobdiff_plain;f=a1%2Fa1rmr.py;h=359d2e0659a68ee871cc3612f90326bcb221fc46;hb=8bcc51a6d44d40a1a338fb6a721b5ee8f992f323;hp=589affdeceb4dae5ea31faebde49cfd2fb89464f;hpb=5ad8f03e1fc7683bb59da31f59edc2f6c0b2372b;p=ric-plt%2Fa1.git diff --git a/a1/a1rmr.py b/a1/a1rmr.py index 589affd..359d2e0 100644 --- a/a1/a1rmr.py +++ b/a1/a1rmr.py @@ -15,91 +15,51 @@ # limitations under the License. # ================================================================================== import os -import gevent -from rmr import rmr +import queue +import time +import json +from threading import Thread +from rmr import rmr, helpers from a1 import get_module_logger -from a1.exceptions import MessageSendFailure, ExpectedAckNotReceived +from a1 import data +from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound logger = get_module_logger(__name__) -RMR_RCV_RETRY_INTERVAL = int(os.environ.get("RMR_RCV_RETRY_INTERVAL", 1000)) RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4)) -MRC = None +_SEND_QUEUE = queue.Queue() # thread safe queue https://docs.python.org/3/library/queue.html -RECEIVED_MESSAGES = [] # used to store messages we need but havent been procedded yet -WAITING_TRANSIDS = {} # used to store transactionids we are waiting for, so we can filter other stuff out - -def _dequeue_all_waiting_messages(): - """ - dequeue all waiting rmr messages from rmr, put them into RECEIVED_MESSAGES +def _init_rmr(): """ - new_messages = [] - sbuf = rmr.rmr_alloc_msg(MRC, 4096) - while True: - sbuf = rmr.rmr_torcv_msg(MRC, sbuf, 0) # set the timeout to 0 so this doesn't block!! - summary = rmr.message_summary(sbuf) - if summary["message state"] == 12 and summary["message status"] == "RMR_ERR_TIMEOUT": - break - elif summary["transaction id"] in WAITING_TRANSIDS: # message is relevent - new_messages.append(summary) - else: - logger.debug("A message was received by a1, but a1 was not expecting it! It's being dropped: %s", summary) - # do nothing with message, effectively dropped - return new_messages - - -def _check_if_ack_received(target_transid, target_type): + init an rmr context + This gets monkeypatched out for unit testing """ - Try to recieve the latest messages, then search the current queue for the target ACK - TODO: probably a slightly more efficient data structure than list. Maybe a dict by message type - However, in the near term, where there are not many xapps under A1, this is fine. Revisit later. - TODO: do we need to deal with duplicate ACKs for the same transaction id? - Is it possible if the downstream xapp uses rmr_rts? Might be harmless to sit in queue.. might slow things + # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread populates an + # internal ring of messages, and receive calls read from that + # currently the size is 2048 messages, so this is fine for the foreseeable future + logger.debug("Waiting for rmr to initialize..") + mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL) + while rmr.rmr_ready(mrc) == 0: + time.sleep(0.5) - """ - new_messages = _dequeue_all_waiting_messages() # dequeue all waiting messages - global RECEIVED_MESSAGES # this is ugly, but fine.. we just need an in memory list across the async calls - RECEIVED_MESSAGES += new_messages - for index, summary in enumerate(RECEIVED_MESSAGES): # Search the queue for the target message - if ( - summary["message state"] == 0 - and summary["message status"] == "RMR_OK" - and summary["message type"] == target_type - and summary["transaction id"] == target_transid - ): # Found; delete it from queue - del RECEIVED_MESSAGES[index] - return summary - return None + return mrc -def init_rmr(): - """ - called from run; not called for unit tests +def _send(mrc, payload, message_type=0): """ - global MRC - MRC = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, 0x00) - - while rmr.rmr_ready(MRC) == 0: - gevent.sleep(1) - logger.debug("not yet ready") - - -def send(payload, message_type=0): - """ - sends a message up to RETRY_TIMES + Sends a message up to RETRY_TIMES If the message is sent successfully, it returns the transactionid - Raises an exception (MessageSendFailure) otherwise + Does nothing otherwise """ - # we may be called many times in asyncronous loops, so for now, it is safer not to share buffers. We can investifgate later whether this is really a problem. - sbuf = rmr.rmr_alloc_msg(MRC, 4096) + # TODO: investigate moving this below and allocating the space based on the payload size + sbuf = rmr.rmr_alloc_msg(mrc, 4096) payload = payload if isinstance(payload, bytes) else payload.encode("utf-8") # retry RETRY_TIMES to send the message - tried = 0 - while True: + for _ in range(0, RETRY_TIMES): # setup the send message rmr.set_payload_and_length(payload, sbuf) rmr.generate_and_set_transaction_id(sbuf) @@ -110,54 +70,110 @@ def send(payload, message_type=0): transaction_id = pre_send_summary["transaction id"] # save the transactionid because we need it later # send - sbuf = rmr.rmr_send_msg(MRC, sbuf) + sbuf = rmr.rmr_send_msg(mrc, sbuf) post_send_summary = rmr.message_summary(sbuf) logger.debug("Post message send summary: %s", rmr.message_summary(sbuf)) # check success or failure if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK": - return transaction_id # we are good - if post_send_summary["message state"] == 10 and post_send_summary["message status"] == "RMR_ERR_RETRY": - # in this state, we should retry - if tried == RETRY_TIMES: - # we have tried RETRY_TIMES and we are still not getting a good state, raise an exception and let the caller deal with it - raise MessageSendFailure(str(post_send_summary)) - else: - tried += 1 - else: - # we hit a state where we should not even retry - raise MessageSendFailure(str(post_send_summary)) - - -def send_ack_retry(payload, expected_ack_message_type, message_type=0): - """ - send a message and check for an ACK. - If no ACK is recieved, defer execution for RMR_RCV_RETRY_INTERVAL ms, then check again. - If no ack is received before the timeout (set by _rmr_init), send again and try again up to RETRY_TIMES + # we are good + logger.debug("Message sent successfully!") + rmr.rmr_free_msg(sbuf) + return transaction_id + + # we failed all RETRY_TIMES + logger.debug("Send failed all %s times, stopping", RETRY_TIMES) + rmr.rmr_free_msg(sbuf) + return None + + +# Public - It is critical here to set the RMR_TIMEOUT to 0 in the rmr_rcv_to function, which causes that function NOT to block. - Instead, if the message isn't there, we give up execution for the interval, which allows the gevent server to process other requests in the meantime. - Amazing props to https://sdiehl.github.io/gevent-tutorial/ - (which also runs this whole server) +def queue_work(item): """ + push an item into the work queue + currently the only type of work is to send out messages + """ + _SEND_QUEUE.put(item) - # try to send the msg to the downstream policy handler - expected_transaction_id = send(payload, message_type) - WAITING_TRANSIDS[expected_transaction_id] = 1 - gevent.sleep(0.01) # wait 10ms before we try the first recieve - for _ in range(0, RETRY_TIMES): - logger.debug("Seeing if return message is fufilled") - summary = _check_if_ack_received(expected_transaction_id, expected_ack_message_type) - if summary: - logger.debug("Target ack Message received!: %s", summary) - logger.debug("current queue size is %d", len(RECEIVED_MESSAGES)) - del WAITING_TRANSIDS[expected_transaction_id] - return summary["payload"] - else: - logger.debug("Deffering execution for %s seconds", str(RMR_RCV_RETRY_INTERVAL / 1000)) - gevent.sleep(RMR_RCV_RETRY_INTERVAL / 1000) - - # we still didn't get the ACK we want - raise ExpectedAckNotReceived() +class RmrLoop: + """ + class represents an rmr loop meant to be called as a longstanding separate thread + """ + + def __init__(self, _init_func_override=None, rcv_func_override=None): + self._rmr_is_ready = False + self._keep_going = True + self._init_func_override = _init_func_override # useful for unit testing + self._rcv_func_override = rcv_func_override # useful for unit testing to mock certain recieve scenarios + self._rcv_func = None + + def rmr_is_ready(self): + """returns whether rmr has been initialized""" + return self._rmr_is_ready + + def stop(self): + """sets a flag for the loop to end""" + self._keep_going = False + + def loop(self): + """ + This loop runs in an a1 thread forever, and has 3 jobs: + - send out any messages that have to go out (create instance, delete instance) + - read a1s mailbox and update the status of all instances based on acks from downstream policy handlers + - clean up the database (eg delete the instance) under certain conditions based on those statuses (NOT DONE YET) + """ + + # get a context + mrc = self._init_func_override() if self._init_func_override else _init_rmr() + self._rmr_is_ready = True + logger.debug("Rmr is ready") + + # set the receive function called below + self._rcv_func = ( + self._rcv_func_override if self._rcv_func_override else lambda: helpers.rmr_rcvall_msgs(mrc, [21024]) + ) + + # loop forever + logger.debug("Work loop starting") + while self._keep_going: + # send out all messages waiting for us + while not _SEND_QUEUE.empty(): + work_item = _SEND_QUEUE.get(block=False, timeout=None) + _send(mrc, payload=work_item["payload"], message_type=work_item["msg type"]) + + # read our mailbox and update statuses + updated_instances = set() + for msg in self._rcv_func(): + try: + pay = json.loads(msg["payload"]) + pti = pay["policy_type_id"] + pii = pay["policy_instance_id"] + data.set_status(pti, pii, pay["handler_id"], pay["status"]) + updated_instances.add((pti, pii)) + except (PolicyTypeNotFound, PolicyInstanceNotFound, KeyError, json.decoder.JSONDecodeError): + # TODO: in the future we may also have to catch SDL errors + logger.debug(("Dropping malformed or non applicable message", msg)) + + # for all updated instances, see if we can trigger a delete + # should be no catch needed here, since the status update would have failed if it was a bad pair + for ut in updated_instances: + data.clean_up_instance(ut[0], ut[1]) + + # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component + time.sleep(1) + + +def start_rmr_thread(init_func_override=None, rcv_func_override=None): + """ + Start a1s rmr thread + Also called during unit testing + """ + rmr_loop = RmrLoop(init_func_override, rcv_func_override) + thread = Thread(target=rmr_loop.loop) + thread.start() + while not rmr_loop.rmr_is_ready(): + time.sleep(0.5) + return rmr_loop # return the handle; useful during unit testing