files/ceph-init-wrapper.sh

   1 #!/bin/bash
   2 #
   3 # Copyright (c) 2019 Wind River Systems, Inc.
   4 #
   5 # SPDX-License-Identifier: Apache-2.0
   6 #
   7 # This script is a helper wrapper for pmon monitoring of ceph
   8 # processes. The "/etc/init.d/ceph" script does not know if ceph is
   9 # running on the node. For example when the node is locked, ceph
  10 # processes are not running. In that case we do not want pmond to
  11 # monitor these processes.
  12 #
  13 # The script "/etc/services.d/<node>/ceph.sh" will create the file
  14 # "/var/run/.ceph_started" when ceph is running and remove it when
  15 # it is not.
  16 #
  17 # The script also extracts one or more ceph process names that are
  18 # reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
  19 # and writes the names to a text file: /tmp/ceph_status_failure.txt for
  20 # pmond to access. The pmond adds the text to logs and alarms. Example of text
  21 # samples written to file by this script are:
  22 #   'osd.1'
  23 #   'osd.1, osd.2'
  24 #   'mon.storage-0'
  25 #   'mon.storage-0, osd.2'
  26 #
  27 # Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
  28 # the script will try to increase their logging to 'debug' for a configurable interval.
  29 # With logging increased it will output a few stack traces; then, at the end of this
  30 # interval, it triggers a core dump of the process and kills it.
  31 #
  32 # Return values:
  33 # zero -   /etc/init.d/ceph returned success or ceph is not running on the node
  34 # non-zero /etc/init.d/ceph returned a failure or invalid syntax
  35 #
  36
  37 source /usr/bin/tsconfig
  38 source /etc/platform/platform.conf
  39
  40 CEPH_SCRIPT="/etc/init.d/ceph"
  41 CEPH_FILE="$VOLATILE_PATH/.ceph_started"
  42 CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status"
  43 CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status"
  44 CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"
  45
  46 BINDIR=/usr/bin
  47 SBINDIR=/usr/sbin
  48 LIBDIR=/usr/lib64/ceph
  49 ETCDIR=/etc/ceph
  50 source $LIBDIR/ceph_common.sh
  51
  52 LOG_PATH=/var/log/ceph
  53 LOG_FILE=$LOG_PATH/ceph-process-states.log
  54 LOG_LEVEL=NORMAL  # DEBUG
  55 verbose=0
  56
  57 DATA_PATH=$VOLATILE_PATH/ceph_hang    # folder where we keep state information
  58 mkdir -p $DATA_PATH                   # make sure folder exists
  59
  60 MONITORING_INTERVAL=15
  61 TRACE_LOOP_INTERVAL=5
  62 CEPH_STATUS_TIMEOUT=20
  63
  64 LOCK_CEPH_MON_SERVICE_FILE="$VOLATILE_PATH/.ceph_mon_status"
  65 LOCK_CEPH_OSD_SERVICE_FILE="$VOLATILE_PATH/.ceph_osd_status"
  66 LOCK_CEPH_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_mon_service"
  67 LOCK_CEPH_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_osd_service"
  68
  69 # Seconds to wait for ceph status to finish before
  70 # continuing to execute a service action
  71 MONITOR_STATUS_TIMEOUT=30
  72 MAX_STATUS_TIMEOUT=120
  73
  74 RC=0
  75
  76 # SM can only pass arguments through environment variable
  77 # when ARGS is not empty use it to extend command line arguments
  78 args=("$@")
  79 if [ ! -z $ARGS ]; then
  80     IFS=";" read -r -a new_args <<< "$ARGS"
  81     args+=("${new_args[@]}")
  82 fi
  83
  84 with_service_lock ()
  85 {
  86     local target="$1"; shift
  87     [ -z "${target}" ] && target="mon osd"
  88
  89     # Run in sub-shell so we don't leak file descriptors
  90     # used for locking service actions
  91     (
  92         # Grab service locks
  93         wlog "-" INFO "Grab service locks"
  94         [[ "${target}" == *"mon"* ]] && flock ${LOCK_CEPH_MON_SERVICE_FD}
  95         [[ "${target}" == *"osd"* ]] && flock ${LOCK_CEPH_OSD_SERVICE_FD}
  96
  97         # Try to lock status with a timeout in case status is stuck
  98         wlog "-" INFO "Lock service status"
  99         deadline=$((SECONDS + MAX_STATUS_TIMEOUT + 1))
 100         if [[ "${target}" == *"mon"* ]]; then
 101             flock --exclusive --timeout ${MONITOR_STATUS_TIMEOUT} ${LOCK_CEPH_MON_STATUS_FD}
 102         fi
 103         if [[ "${target}" == *"osd"* ]]; then
 104             timeout=$((deadline - SECONDS))
 105             if [[ $timeout -gt 0 ]]; then
 106                 flock --exclusive --timeout ${timeout} ${LOCK_CEPH_OSD_STATUS_FD}
 107             fi
 108         fi
 109
 110         # Close lock file descriptors so they are
 111         # not inherited by the spawned process then
 112         # run service action
 113         wlog "-" INFO "Run service action: $@"
 114         "$@" {LOCK_CEPH_MON_SERVICE_FD}>&- \
 115              {LOCK_CEPH_MON_STATUS_FD}>&- \
 116              {LOCK_CEPH_OSD_SERVICE_FD}>&- \
 117              {LOCK_CEPH_OSD_STATUS_FD}>&-
 118
 119     ) {LOCK_CEPH_MON_SERVICE_FD}>${LOCK_CEPH_MON_SERVICE_FILE} \
 120       {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE} \
 121       {LOCK_CEPH_OSD_SERVICE_FD}>${LOCK_CEPH_OSD_SERVICE_FILE} \
 122       {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE}
 123     RC=$?
 124 }
 125
 126 start ()
 127 {
 128     if [ ! -f ${CEPH_FILE} ]; then
 129         # Ceph is not running on this node, return success
 130         exit 0
 131     fi
 132     wlog "-" INFO "Ceph START $1 command received"
 133     with_service_lock "$1" ${CEPH_SCRIPT} start $1
 134     wlog "-" INFO "Ceph START $1 command finished."
 135 }
 136
 137 stop ()
 138 {
 139     wlog "-" INFO "Ceph STOP $1 command received."
 140     with_service_lock "$1" ${CEPH_SCRIPT} stop $1
 141     wlog "-" INFO "Ceph STOP $1 command finished."
 142 }
 143
 144 restart ()
 145 {
 146     if [ ! -f ${CEPH_FILE} ]; then
 147         # Ceph is not running on this node, return success
 148         exit 0
 149     fi
 150     wlog "-" INFO "Ceph RESTART $1 command received."
 151     with_service_lock "$1" ${CEPH_SCRIPT} restart $1
 152     wlog "-" INFO "Ceph RESTART $1 command finished."
 153 }
 154
 155 log_and_restart_blocked_osds ()
 156 {
 157     # Log info about the blocked osd daemons and then restart it
 158     local names=$1
 159     local message=$2
 160     for name in $names; do
 161         wlog $name "INFO" "$message"
 162         ${CEPH_SCRIPT} restart $name
 163     done
 164 }
 165
 166 log_and_kill_hung_procs ()
 167 {
 168     # Log info about the hung processes and then kill them; later on pmon will restart them
 169     local names=$1
 170     for name in $names; do
 171         type=`echo $name | cut -c 1-3`   # e.g. 'mon', if $item is 'mon1'
 172         id=`echo $name | cut -c 4- | sed 's/^\\.//'`
 173         get_conf run_dir "/var/run/ceph" "run dir"
 174         get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
 175         pid=$(cat $pid_file)
 176         wlog $name "INFO" "Dealing with hung process (pid:$pid)"
 177
 178         # monitoring interval
 179         wlog $name "INFO" "Increasing log level"
 180         execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
 181         monitoring=$MONITORING_INTERVAL
 182         while [ $monitoring -gt 0 ]; do
 183             if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
 184                 date=$(date "+%Y-%m-%d_%H-%M-%S")
 185                 log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
 186                 wlog $name "INFO" "Dumping stack trace to: $log_file"
 187                 $(pstack $pid >$log_file) &
 188             fi
 189             let monitoring-=1
 190             sleep 1
 191         done
 192         wlog $name "INFO" "Trigger core dump"
 193         kill -ABRT $pid &>/dev/null
 194         rm -f $pid_file # process is dead, core dump is archiving, preparing for restart
 195         # Wait for pending systemd core dumps
 196         sleep 2 # hope systemd_coredump has started meanwhile
 197         deadline=$(( $(date '+%s') + 300 ))
 198         while [[ $(date '+%s') -lt "${deadline}" ]]; do
 199             systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
 200             [[ -z "${systemd_coredump_pid}" ]] && break
 201             wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
 202             sleep 2
 203         done
 204         kill -KILL $pid &>/dev/null
 205     done
 206 }
 207
 208 status ()
 209 {
 210     local target="$1"  # no shift here
 211     [ -z "${target}" ] && target="mon osd"
 212
 213     if [ ! -f ${CEPH_FILE} ]; then
 214         # Ceph is not running on this node, return success
 215         exit 0
 216     fi
 217
 218     if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
 219         timeout $CEPH_STATUS_TIMEOUT ceph -s
 220         if [ "$?" -ne 0 ]; then
 221             # Ceph cluster is not accessible. Don't panic, controller swact
 222             # may be in progress.
 223             wlog "-" INFO "Ceph is down, ignoring OSD status."
 224             exit 0
 225         fi
 226     fi
 227
 228     # Report success while ceph mon is running a service action
 229     # otherwise mark ceph mon status is in progress
 230     exec {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE}
 231     if [[ "${target}" == *"mon"* ]]; then
 232         flock --shared --nonblock ${LOCK_CEPH_MON_SERVICE_FILE} true
 233         if [[ $? -ne 0 ]]; then
 234             exit 0
 235         fi
 236         # Lock will be released when script exits
 237         flock --shared ${LOCK_CEPH_MON_STATUS_FD}
 238     fi
 239     # Report success while ceph mon is running a service action
 240     # otherwise mark ceph osd status is in progress
 241     exec {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE}
 242     if [[ "${target}" == *"osd"* ]]; then
 243         flock --shared --nonblock ${LOCK_CEPH_OSD_SERVICE_FILE} true
 244         if [[ $? -ne 0 ]]; then
 245             exit 0
 246         fi
 247         # Lock will be released when script exits
 248         flock --shared ${LOCK_CEPH_OSD_STATUS_FD}
 249     fi
 250
 251     result=`${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
 252     RC=$?
 253     if [ "$RC" -ne 0 ]; then
 254         erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
 255         hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
 256         blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
 257         stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
 258         invalid=0
 259         host=`hostname`
 260         if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
 261             # On 2 node configuration we have a floating monitor
 262             host="controller"
 263         fi
 264         for i in $(echo $erred_procs $hung_procs); do
 265             if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
 266                 continue
 267             else
 268                 invalid=1
 269             fi
 270         done
 271
 272         log_and_restart_blocked_osds "$blocked_ops_procs"\
 273             "Restarting OSD with blocked operations"
 274         log_and_restart_blocked_osds "$stuck_peering_procs"\
 275             "Restarting OSD stuck peering"
 276         log_and_kill_hung_procs $hung_procs
 277
 278         rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
 279         if [ $invalid -eq 0 ]; then
 280             text=""
 281             for i in $erred_procs; do
 282                 text+="$i, "
 283             done
 284             for i in $hung_procs; do
 285                 text+="$i (process hang), "
 286             done
 287             echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
 288         else
 289             echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
 290             echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
 291         fi
 292     fi
 293
 294     if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
 295         # SM needs exit code != 0 from 'status mon' argument of the init script on
 296         # standby controller otherwise it thinks that the monitor is running and
 297         # tries to stop it.
 298         # '/etc/init.d/ceph status mon' checks the status of monitors configured in
 299         # /etc/ceph/ceph.conf and if it should be running on current host.
 300         # If it should not be running it just exits with code 0. This is what
 301         # happens on the standby controller.
 302         # When floating monitor is running on active controller /var/lib/ceph/mon of
 303         # standby is not mounted (Ceph monitor partition is DRBD synced).
 304         test -e "/var/lib/ceph/mon/ceph-controller"
 305         if [ "$?" -ne 0 ]; then
 306             exit 3
 307         fi
 308     fi
 309 }
 310
 311
 312 case "${args[0]}" in
 313     start)
 314         start ${args[1]}
 315         ;;
 316     stop)
 317         stop ${args[1]}
 318         ;;
 319     restart)
 320         restart ${args[1]}
 321         ;;
 322     status)
 323         status ${args[1]}
 324         ;;
 325     *)
 326         echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"
 327         exit 1
 328         ;;
 329 esac
 330
 331 exit $RC