# limitations under the License.
# ==================================================================================
import json
import os
import queue
import time
from threading import Thread

from rmr import rmr, helpers

from a1 import get_module_logger
from a1 import data
from a1.exceptions import PolicyTypeNotFound, PolicyInstanceNotFound
logger = get_module_logger(__name__)

# number of send attempts before _send gives up on a message
RETRY_TIMES = int(os.environ.get("RMR_RETRY_TIMES", 4))

# thread safe queue https://docs.python.org/3/library/queue.html
# producers call queue_work(); the single consumer is RmrLoop.loop
_SEND_QUEUE = queue.Queue()
def _init_rmr():
    """
    Initialize an rmr context and block until the rmr library reports ready.

    Returns:
        the initialized rmr context (mrc)

    This gets monkeypatched out for unit testing.
    """
    # rmr.RMRFL_MTCALL puts RMR into a multithreaded mode, where a receiving thread
    # populates an internal ring of messages, and receive calls read from that ring.
    # Currently the ring size is 2048 messages, so this is fine for the foreseeable future.
    logger.debug("Waiting for rmr to initialize..")
    mrc = rmr.rmr_init(b"4562", rmr.RMR_MAX_RCV_BYTES, rmr.RMRFL_MTCALL)
    while rmr.rmr_ready(mrc) == 0:
        # rmr_ready returns 0 until the route table has been received; poll until then
        time.sleep(0.5)
    return mrc
def _send(mrc, payload, message_type=0):
    """
    Send a message, retrying up to RETRY_TIMES on failure.

    Args:
        mrc: the rmr context to send with
        payload: bytes or str; str is utf-8 encoded before sending
        message_type: the rmr message type to stamp on the outgoing message

    Returns:
        the transaction id (bytes) if the message was sent successfully,
        None if all attempts failed.
    """
    # TODO: investigate moving this below and allocating the space based on the payload size
    sbuf = rmr.rmr_alloc_msg(mrc, 4096)
    payload = payload if isinstance(payload, bytes) else payload.encode("utf-8")
    # retry RETRY_TIMES to send the message
    for _ in range(0, RETRY_TIMES):
        # setup the send message; reset state and (re)apply the message type on every attempt
        rmr.set_payload_and_length(payload, sbuf)
        rmr.generate_and_set_transaction_id(sbuf)
        sbuf.contents.state = 0
        sbuf.contents.mtype = message_type
        pre_send_summary = rmr.message_summary(sbuf)
        transaction_id = pre_send_summary["transaction id"]  # save the transaction id because we need it later
        # send
        sbuf = rmr.rmr_send_msg(mrc, sbuf)
        post_send_summary = rmr.message_summary(sbuf)
        logger.debug("Post message send summary: %s", post_send_summary)
        # check success or failure
        if post_send_summary["message state"] == 0 and post_send_summary["message status"] == "RMR_OK":
            # we are good
            logger.debug("Message sent successfully!")
            rmr.rmr_free_msg(sbuf)
            return transaction_id

    # we failed all RETRY_TIMES
    logger.debug("Send failed all %s times, stopping", RETRY_TIMES)
    rmr.rmr_free_msg(sbuf)
    return None
+
+
# Public


def queue_work(item):
    """
    Push an item into the work queue.

    Currently the only type of work is to send out messages; an item is a dict
    with keys "payload" and "msg type" (consumed by RmrLoop.loop).
    """
    _SEND_QUEUE.put(item)
class RmrLoop:
    """
    Represents an rmr loop meant to be run as a long-standing separate thread.
    """

    def __init__(self, _init_func_override=None, rcv_func_override=None):
        self._rmr_is_ready = False  # flipped to True by loop() once the rmr context is ready
        self._keep_going = True  # cleared by stop() to end the loop
        self._init_func_override = _init_func_override  # useful for unit testing
        self._rcv_func_override = rcv_func_override  # useful for unit testing to mock certain receive scenarios
        self._rcv_func = None  # set in loop() once the context exists

    def rmr_is_ready(self):
        """Return whether rmr has been initialized."""
        return self._rmr_is_ready

    def stop(self):
        """Set a flag for the loop to end."""
        self._keep_going = False

    def loop(self):
        """
        Run in an a1 thread forever; has 3 jobs:
        - send out any messages that have to go out (create instance, delete instance)
        - read a1's mailbox and update the status of all instances based on acks from downstream policy handlers
        - clean up the database (e.g. delete the instance) under certain conditions based on those statuses (NOT DONE YET)
        """

        # get a context
        mrc = self._init_func_override() if self._init_func_override else _init_rmr()
        self._rmr_is_ready = True
        logger.debug("Rmr is ready")

        # set the receive function called below; 21024 is the message type we listen for
        self._rcv_func = (
            self._rcv_func_override if self._rcv_func_override else lambda: helpers.rmr_rcvall_msgs(mrc, [21024])
        )

        # loop forever
        logger.debug("Work loop starting")
        while self._keep_going:
            # send out all messages waiting for us
            # (this thread is the only consumer, so empty() followed by get() is safe here)
            while not _SEND_QUEUE.empty():
                work_item = _SEND_QUEUE.get(block=False, timeout=None)
                _send(mrc, payload=work_item["payload"], message_type=work_item["msg type"])

            # read our mailbox and update statuses
            updated_instances = set()
            for msg in self._rcv_func():
                try:
                    pay = json.loads(msg["payload"])
                    pti = pay["policy_type_id"]
                    pii = pay["policy_instance_id"]
                    data.set_status(pti, pii, pay["handler_id"], pay["status"])
                    updated_instances.add((pti, pii))
                except (PolicyTypeNotFound, PolicyInstanceNotFound, KeyError, json.decoder.JSONDecodeError):
                    # TODO: in the future we may also have to catch SDL errors
                    logger.debug(("Dropping malformed or non applicable message", msg))

            # for all updated instances, see if we can trigger a delete
            # should be no catch needed here, since the status update would have failed if it was a bad pair
            for ut in updated_instances:
                data.clean_up_instance(ut[0], ut[1])

            # TODO: what's a reasonable sleep time? we don't want to hammer redis too much, and a1 isn't a real time component
            time.sleep(1)
+
def start_rmr_thread(init_func_override=None, rcv_func_override=None):
    """
    Start a1's rmr thread and block until rmr reports ready.

    Also called during unit testing.

    Returns:
        the RmrLoop handle, so callers (and tests) can stop() it later.
    """
    rmr_loop = RmrLoop(init_func_override, rcv_func_override)
    # NOTE(review): this is a non-daemon thread, so the process will not exit
    # until the loop is stop()ped — presumably intentional for a server; confirm.
    thread = Thread(target=rmr_loop.loop)
    thread.start()
    while not rmr_loop.rmr_is_ready():
        time.sleep(0.5)
    return rmr_loop  # return the handle; useful during unit testing