Add sdl healthcheck to a1's healthcheck

[ric-plt/a1.git] / a1 / controller.py
diff --git a/a1/controller.py b/a1/controller.py

index 1022320..4210266 100644 (file)
--- a/a1/controller.py
+++ b/a1/controller.py
@@ -2,8 +2,8 @@
  Main a1 controller
  """
  # ==================================================================================
-#       Copyright (c) 2019 Nokia
-#       Copyright (c) 2018-2019 AT&T Intellectual Property.
+#       Copyright (c) 2019-2020 Nokia
+#       Copyright (c) 2018-2020 AT&T Intellectual Property.
  #
  #   Licensed under the Apache License, Version 2.0 (the "License");
  #   you may not use this file except in compliance with the License.
@@ -17,12 +17,11 @@ Main a1 controller
  #   See the License for the specific language governing permissions and
  #   limitations under the License.
  # ==================================================================================
-import json
-from flask import Response
  from jsonschema import validate
  from jsonschema.exceptions import ValidationError
  import connexion
  from mdclogpy import Logger
+from ricsdl.exceptions import RejectedByBackend, NotConnected, BackendError
  from a1 import a1rmr, exceptions, data
  
  
@@ -39,22 +38,17 @@ def _try_func_return(func):
          return "", 400
      except (exceptions.PolicyTypeNotFound, exceptions.PolicyInstanceNotFound):
          return "", 404
-    except BaseException as exc:
-        # catch all, should never happen...
-        mdc_logger.exception(exc)
-        return Response(status=500)
+    except (RejectedByBackend, NotConnected, BackendError):
+        """
+        These are SDL errors. At the time of development here, we do not have a good understanding of which of these errors are "try again later, it may work"
+        and which are "never going to work". There is some discussion that RejectedByBackend is in the latter category, suggesting it should map to 400,
+        but until we understand the root cause of these errors, it's confusing to clients to give them a 400 (a "your fault" code) because they won't know how to fix it.
+        For now, we return a 503 (logging the exception is pending RIC-39, see below) and investigate the logs later to improve the handling/reporting.
+        """
+        # mdc_logger.exception(exc)  # waiting for https://jira.o-ran-sc.org/browse/RIC-39
+        return "", 503
  
-
-def _gen_body_to_handler(operation, policy_type_id, policy_instance_id, payload=None):
-    """
-    used to create the payloads that get sent to downstream policy handlers
-    """
-    return {
-        "operation": operation,
-        "policy_type_id": policy_type_id,
-        "policy_instance_id": policy_instance_id,
-        "payload": payload,
-    }
+    # let other types of unexpected exceptions blow up and log
  
  
  # Healthcheck
@@ -66,11 +60,13 @@ def get_healthcheck():
      Currently, this checks:
      1. whether the a1 webserver is up (if it isn't, this won't even be called, so even entering this function confirms it is)
      2. checks whether the rmr thread is running and has completed a loop recently
-    TODO: make "seconds" to pass in a configurable parameter?
+    3. checks that our SDL connection is healthy
      """
-    if a1rmr.healthcheck_rmr_thread():
-        return "", 200
-    return "rmr thread is unhealthy", 500
+    if not a1rmr.healthcheck_rmr_thread():
+        return "rmr thread is unhealthy", 500
+    if not data.SDL.healthcheck():
+        return "sdl connection is unhealthy", 500
+    return "", 200
  
  
  # Policy types
@@ -164,9 +160,8 @@ def create_or_replace_policy_instance(policy_type_id, policy_instance_id):
          # store the instance
          data.store_policy_instance(policy_type_id, policy_instance_id, instance)
  
-        # send rmr (best effort)
-        body = _gen_body_to_handler("CREATE", policy_type_id, policy_instance_id, payload=instance)
-        a1rmr.queue_work({"payload": json.dumps(body), "ptid": policy_type_id})
+        # queue rmr send (best effort)
+        a1rmr.queue_instance_send(("CREATE", policy_type_id, policy_instance_id, instance))
  
          return "", 202
  
@@ -179,15 +174,10 @@ def delete_policy_instance(policy_type_id, policy_instance_id):
      """
  
      def delete_instance_handler():
-        """
-        here we send out the DELETEs but we don't delete the instance until a GET is called where we check the statuses
-        We also set the status as deleted which would be reflected in a GET to ../status (before the DELETE completes)
-        """
          data.delete_policy_instance(policy_type_id, policy_instance_id)
  
-        # send rmr (best effort)
-        body = _gen_body_to_handler("DELETE", policy_type_id, policy_instance_id)
-        a1rmr.queue_work({"payload": json.dumps(body), "ptid": policy_type_id})
+        # queue rmr send (best effort)
+        a1rmr.queue_instance_send(("DELETE", policy_type_id, policy_instance_id, ""))
  
          return "", 202