3 # Copyright (c) 2019 Wind River Systems, Inc.
5 # SPDX-License-Identifier: Apache-2.0
7 # This script is a helper wrapper for pmon monitoring of ceph
8 # processes. The "/etc/init.d/ceph" script does not know if ceph is
9 # running on the node. For example when the node is locked, ceph
10 # processes are not running. In that case we do not want pmond to
11 # monitor these processes.
13 # The script "/etc/services.d/<node>/ceph.sh" will create the file
14 # "/var/run/.ceph_started" when ceph is running and remove it when
17 # The script also extracts one or more ceph process names that are
18 # reported as 'not running' or 'dead' or 'failed' by '/etc/init.d/ceph status'
19 # and writes the names to a text file: /tmp/ceph_status_failure.txt for
20 # pmond to access. The pmond adds the text to logs and alarms. Examples of the
21 # text written to the file by this script are:
25 # 'mon.storage-0, osd.2'
27 # Moreover, for processes that are reported as 'hung' by '/etc/init.d/ceph status'
28 # the script will try to increase their logging to 'debug' for a configurable interval.
29 # With logging increased it outputs a few stack traces; then, at the end of this
30 # interval, it triggers a core dump of the hung process and kills it.
33 # zero - /etc/init.d/ceph returned success or ceph is not running on the node
34 # non-zero /etc/init.d/ceph returned a failure or invalid syntax
# Platform-wide settings. VOLATILE_PATH, system_type and system_mode are used
# further down but never assigned in the visible part of this script —
# presumably they are provided by these two files (TODO confirm).
37 source /usr/bin/tsconfig
38 source /etc/platform/platform.conf
# Init script that performs the real start/stop/restart/status work.
40 CEPH_SCRIPT="/etc/init.d/ceph"
# Marker file: when it does not exist the actions below treat ceph as not
# running on this node.
41 CEPH_FILE="$VOLATILE_PATH/.ceph_started"
# NOTE(review): the two CEPH_GET_*_STATUS_FILE paths are not referenced by any
# line visible in this section — confirm they are still needed.
42 CEPH_GET_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_mon_status"
43 CEPH_GET_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_getting_osd_status"
# Text file through which the names of failed processes are handed to pmond.
44 CEPH_STATUS_FAILURE_TEXT_FILE="/tmp/ceph_status_failure.txt"
# Shared ceph shell helpers. get_conf (used by the hung-process handling) is
# presumably defined there — TODO confirm.
48 LIBDIR=/usr/lib64/ceph
50 source $LIBDIR/ceph_common.sh
# Logging settings. Hang stack traces are also written under LOG_PATH.
# LOG_FILE and LOG_LEVEL are presumably consumed by wlog — TODO confirm.
52 LOG_PATH=/var/log/ceph
53 LOG_FILE=$LOG_PATH/ceph-process-states.log
54 LOG_LEVEL=NORMAL # DEBUG
57 DATA_PATH=$VOLATILE_PATH/ceph_hang # folder where we keep state information
58 mkdir -p $DATA_PATH # make sure folder exists
# Initial value of the countdown used while a hung process is being traced.
60 MONITORING_INTERVAL=15
# Seconds given to 'ceph -s' (via timeout) before the cluster is treated as
# unreachable by the status action.
62 CEPH_STATUS_TIMEOUT=20
# Lock files used to serialise service actions against status checks.
# NOTE(review): variable names and file names are crossed — the *_SERVICE_FILE
# variables point at ".ceph_*_status" and the *_STATUS_FILE variables at
# ".ceph_*_service". The visible code uses the variables consistently, so this
# is harmless here, but confirm no other script depends on the paths before
# renaming anything.
64 LOCK_CEPH_MON_SERVICE_FILE="$VOLATILE_PATH/.ceph_mon_status"
65 LOCK_CEPH_OSD_SERVICE_FILE="$VOLATILE_PATH/.ceph_osd_status"
66 LOCK_CEPH_MON_STATUS_FILE="$VOLATILE_PATH/.ceph_mon_service"
67 LOCK_CEPH_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_osd_service"
69 # Seconds to wait for ceph status to finish before
70 # continuing to execute a service action
71 MONITOR_STATUS_TIMEOUT=30
# Overall budget (seconds) for acquiring the status locks; the osd status lock
# only gets whatever is left of it after the mon status lock attempt.
72 MAX_STATUS_TIMEOUT=120
76 # SM can only pass arguments through environment variable
77 # when ARGS is not empty use it to extend command line arguments
# NOTE(review): $ARGS is unquoted in the test below, so a value containing
# whitespace is word-split before '[' evaluates it; quoting it ("$ARGS") would
# make the check robust.
# NOTE(review): the initialisation of the 'args' array is not visible in this
# section.
79 if [ ! -z $ARGS ]; then
# ARGS is a ';'-separated list: split it into new_args and append to args.
80 IFS=";" read -r -a new_args <<< "$ARGS"
81 args+=("${new_args[@]}")
# Body of the service-lock wrapper (called as 'with_service_lock' by the
# start/stop/restart code). $1 selects the daemon types to lock; the remaining
# arguments are the command that is run while the locks are held.
86 local target="$1"; shift
# No target given: lock both monitor and OSD.
87 [ -z "${target}" ] && target="mon osd"
89 # Run in sub-shell so we don't leak file descriptors
90 # used for locking service actions
93 wlog "-" INFO "Grab service locks"
# No --timeout/--nonblock here: these calls wait until the service lock of
# each selected daemon type is free.
94 [[ "${target}" == *"mon"* ]] && flock ${LOCK_CEPH_MON_SERVICE_FD}
95 [[ "${target}" == *"osd"* ]] && flock ${LOCK_CEPH_OSD_SERVICE_FD}
97 # Try to lock status with a timeout in case status is stuck
98 wlog "-" INFO "Lock service status"
# SECONDS is bash's elapsed-time counter; deadline bounds the total time spent
# on both status locks.
99 deadline=$((SECONDS + MAX_STATUS_TIMEOUT + 1))
100 if [[ "${target}" == *"mon"* ]]; then
101 flock --exclusive --timeout ${MONITOR_STATUS_TIMEOUT} ${LOCK_CEPH_MON_STATUS_FD}
103 if [[ "${target}" == *"osd"* ]]; then
# Only the time remaining before the deadline is offered to the osd status
# lock; when nothing is left the lock attempt is skipped.
104 timeout=$((deadline - SECONDS))
105 if [[ $timeout -gt 0 ]]; then
106 flock --exclusive --timeout ${timeout} ${LOCK_CEPH_OSD_STATUS_FD}
110 # Close lock file descriptors so they are
111 # not inherited by the spawned process then
113 wlog "-" INFO "Run service action: $@"
114 "$@" {LOCK_CEPH_MON_SERVICE_FD}>&- \
115 {LOCK_CEPH_MON_STATUS_FD}>&- \
116 {LOCK_CEPH_OSD_SERVICE_FD}>&- \
117 {LOCK_CEPH_OSD_STATUS_FD}>&-
# Each {NAME}>file redirection below opens the lock file on a descriptor chosen
# by the shell and stores its number in NAME, which the flock calls above use.
119 ) {LOCK_CEPH_MON_SERVICE_FD}>${LOCK_CEPH_MON_SERVICE_FILE} \
120 {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE} \
121 {LOCK_CEPH_OSD_SERVICE_FD}>${LOCK_CEPH_OSD_SERVICE_FILE} \
122 {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE}
# Start action (fragment): $1 is the optional daemon target. The marker-file
# check guards the case where ceph is not running on this node; otherwise the
# init script's 'start' is run under the service locks and logged before/after.
128 if [ ! -f ${CEPH_FILE} ]; then
129 # Ceph is not running on this node, return success
132 wlog "-" INFO "Ceph START $1 command received"
133 with_service_lock "$1" ${CEPH_SCRIPT} start $1
134 wlog "-" INFO "Ceph START $1 command finished."
# Stop action (fragment): runs the init script's 'stop' for target $1 under the
# service locks. No marker-file check is visible for this action.
139 wlog "-" INFO "Ceph STOP $1 command received."
140 with_service_lock "$1" ${CEPH_SCRIPT} stop $1
141 wlog "-" INFO "Ceph STOP $1 command finished."
# Restart action (fragment): $1 is the optional daemon target. Guarded by the
# same marker-file check as start; otherwise the init script's 'restart' is run
# under the service locks and logged before/after.
146 if [ ! -f ${CEPH_FILE} ]; then
147 # Ceph is not running on this node, return success
150 wlog "-" INFO "Ceph RESTART $1 command received."
151 with_service_lock "$1" ${CEPH_SCRIPT} restart $1
152 wlog "-" INFO "Ceph RESTART $1 command finished."
# Called by the status action with a space-separated list of daemon names and
# a log message. The assignments of $names and $message are not visible in
# this section — presumably taken from $1 and $2 (TODO confirm).
155 log_and_restart_blocked_osds ()
157 # Log info about the blocked osd daemons and then restart them
# $names is deliberately unquoted so it word-splits into one name per pass.
160 for name in $names; do
161 wlog $name "INFO" "$message"
# Restart goes straight to the init script, not through with_service_lock.
162 ${CEPH_SCRIPT} restart $name
# Called by the status action with the names of hung daemons as arguments.
# For each one: raise its debug level, take periodic stack traces, force a core
# dump with SIGABRT, wait for systemd-coredump to finish, then SIGKILL it.
# The assignment of $names is not visible in this section (TODO confirm).
166 log_and_kill_hung_procs ()
168 # Log info about the hung processes and then kill them; later on pmon will restart them
170 for name in $names; do
# Split the daemon name into its type (first three characters) and its id
# (everything after, with one leading '.' removed).
171 type=`echo $name | cut -c 1-3` # e.g. 'mon', if $name is 'mon1'
172 id=`echo $name | cut -c 4- | sed 's/^\\.//'`
# Look up the run directory and pid file; the second argument looks like the
# fallback value — presumably get_conf stores the result in the variable named
# by its first argument (TODO confirm against ceph_common.sh).
173 get_conf run_dir "/var/run/ceph" "run dir"
174 get_conf pid_file "$run_dir/$type.$id.pid" "pid file"
# NOTE(review): $pid is not assigned in the visible lines — presumably read
# from $pid_file.
176 wlog $name "INFO" "Dealing with hung process (pid:$pid)"
178 # monitoring interval
179 wlog $name "INFO" "Increasing log level"
180 execute_ceph_cmd ret $name "ceph daemon $name config set debug_$type 20/20"
181 monitoring=$MONITORING_INTERVAL
182 while [ $monitoring -gt 0 ]; do
# Take a stack trace whenever the countdown is a multiple of
# TRACE_LOOP_INTERVAL.
# NOTE(review): TRACE_LOOP_INTERVAL is not defined in the visible lines, and
# neither is the countdown decrement.
183 if [ $(($monitoring % $TRACE_LOOP_INTERVAL)) -eq 0 ]; then
184 date=$(date "+%Y-%m-%d_%H-%M-%S")
185 log_file="$LOG_PATH/hang_trace_${name}_${pid}_${date}.log"
186 wlog $name "INFO" "Dumping stack trace to: $log_file"
# NOTE(review): the $( ... ) wrapper is unnecessary — pstack's stdout already
# goes to the file, so the substitution expands to nothing. A plain
# 'pstack $pid >$log_file &' would express the same intent.
187 $(pstack $pid >$log_file) &
192 wlog $name "INFO" "Trigger core dump"
# SIGABRT asks the process to abort so that a core dump is produced.
193 kill -ABRT $pid &>/dev/null
194 rm -f $pid_file # process is dead, core dump is archiving, preparing for restart
195 # Wait for pending systemd core dumps
196 sleep 2 # hope systemd_coredump has started meanwhile
# Poll for at most 300 seconds while a systemd-coredump process matching this
# pid and daemon type is still running.
197 deadline=$(( $(date '+%s') + 300 ))
198 while [[ $(date '+%s') -lt "${deadline}" ]]; do
199 systemd_coredump_pid=$(pgrep -f "systemd-coredump.*${pid}.*ceph-${type}")
200 [[ -z "${systemd_coredump_pid}" ]] && break
201 wlog $name "INFO" "systemd-coredump ceph-${type} in progress: pid ${systemd_coredump_pid}"
# Final SIGKILL in case the process is still around after the abort.
204 kill -KILL $pid &>/dev/null
# Status action (fragment): $1 is the optional daemon target. It runs the init
# script's 'status', classifies the failing daemons it reports, restarts or
# kills the ones that are blocked or hung, and writes a summary for pmond.
210 local target="$1" # no shift here
211 [ -z "${target}" ] && target="mon osd"
213 if [ ! -f ${CEPH_FILE} ]; then
214 # Ceph is not running on this node, return success
# On a duplex All-in-one system an OSD status request first checks that the
# cluster answers 'ceph -s' within CEPH_STATUS_TIMEOUT seconds.
218 if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$1" == "osd" ]]; then
219 timeout $CEPH_STATUS_TIMEOUT ceph -s
220 if [ "$?" -ne 0 ]; then
221 # Ceph cluster is not accessible. Don't panic, controller swact
222 # may be in progress.
223 wlog "-" INFO "Ceph is down, ignoring OSD status."
228 # Report success while ceph mon is running a service action
229 # otherwise mark ceph mon status is in progress
# Open the mon status lock file on a new descriptor of the script itself.
230 exec {LOCK_CEPH_MON_STATUS_FD}>${LOCK_CEPH_MON_STATUS_FILE}
231 if [[ "${target}" == *"mon"* ]]; then
# Probe only: try a non-blocking shared lock on the service lock file around
# the no-op 'true'. It fails while a service action holds that lock.
232 flock --shared --nonblock ${LOCK_CEPH_MON_SERVICE_FILE} true
233 if [[ $? -ne 0 ]]; then
236 # Lock will be released when script exits
# A shared status lock makes the exclusive status-lock request in the service
# wrapper wait (up to its timeout) until this status run has finished.
237 flock --shared ${LOCK_CEPH_MON_STATUS_FD}
239 # Report success while ceph osd is running a service action
240 # otherwise mark ceph osd status is in progress
241 exec {LOCK_CEPH_OSD_STATUS_FD}>${LOCK_CEPH_OSD_STATUS_FILE}
242 if [[ "${target}" == *"osd"* ]]; then
243 flock --shared --nonblock ${LOCK_CEPH_OSD_SERVICE_FILE} true
244 if [[ $? -ne 0 ]]; then
247 # Lock will be released when script exits
248 flock --shared ${LOCK_CEPH_OSD_STATUS_FD}
# Run the real status check with both lock descriptors closed for the child.
# NOTE(review): the assignment of RC is not visible in this section —
# presumably the exit status of this command (TODO confirm).
251 result=`${CEPH_SCRIPT} status $1 {LOCK_CEPH_MON_STATUS_FD}>&- {LOCK_CEPH_OSD_STATUS_FD}>&-`
253 if [ "$RC" -ne 0 ]; then
# Each list below holds the first field of the de-duplicated status lines that
# contain the given keyword, with ':' removed, separated by spaces.
# NOTE(review): the trailing sed 's/, $//g' never matches, because awk emits
# "name " (space-separated, no comma); each list keeps its trailing space.
254 erred_procs=`echo "$result" | sort | uniq | awk ' /not running|dead|failed/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
255 hung_procs=`echo "$result" | sort | uniq | awk ' /hung/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
256 blocked_ops_procs=`echo "$result" | sort | uniq | awk ' /blocked ops/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
257 stuck_peering_procs=`echo "$result" | sort | uniq | awk ' /stuck peering/ {printf "%s ", $1}' | sed 's/://g' | sed 's/, $//g'`
260 if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
261 # On 2 node configuration we have a floating monitor
# Check every failed or hung name against the expected osd/mon name pattern.
# NOTE(review): the regex is unanchored and its '.' matches any character, so
# it accepts any string merely containing such a substring. $host and $invalid
# are not assigned in the visible lines.
264 for i in $(echo $erred_procs $hung_procs); do
265 if [[ "$i" =~ osd.?[0-9]?[0-9]|mon.$host ]]; then
# Recovery actions: restart OSDs with blocked ops or stuck peering, then
# core-dump and kill the hung daemons.
272 log_and_restart_blocked_osds "$blocked_ops_procs"\
273 "Restarting OSD with blocked operations"
274 log_and_restart_blocked_osds "$stuck_peering_procs"\
275 "Restarting OSD stuck peering"
276 log_and_kill_hung_procs $hung_procs
# Rebuild the failure summary file from scratch on every failing status run.
278 rm -f $CEPH_STATUS_FAILURE_TEXT_FILE
279 if [ $invalid -eq 0 ]; then
281 for i in $erred_procs; do
284 for i in $hung_procs; do
285 text+="$i (process hang), "
# Written as a single line: any newline in the text is stripped.
287 echo "$text" | tr -d '\n' > $CEPH_STATUS_FAILURE_TEXT_FILE
289 echo "$host: '${CEPH_SCRIPT} status $1' result contains invalid process names: $erred_procs"
290 echo "Undetermined osd or monitor id" > $CEPH_STATUS_FAILURE_TEXT_FILE
294 if [[ $RC == 0 ]] && [[ "$1" == "mon" ]] && [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]]; then
295 # SM needs exit code != 0 from 'status mon' argument of the init script on
296 # standby controller otherwise it thinks that the monitor is running and
298 # '/etc/init.d/ceph status mon' checks the status of monitors configured in
299 # /etc/ceph/ceph.conf and if it should be running on current host.
300 # If it should not be running it just exits with code 0. This is what
301 # happens on the standby controller.
302 # When floating monitor is running on active controller /var/lib/ceph/mon of
303 # standby is not mounted (Ceph monitor partition is DRBD synced).
# The existence of the monitor data directory is the test for whether the
# floating monitor's partition is mounted on this host.
304 test -e "/var/lib/ceph/mon/ceph-controller"
305 if [ "$?" -ne 0 ]; then
# Usage message: lists the four supported actions and the optional daemon
# target (a daemon type, or one specific osd/mon instance).
326 echo "Usage: $0 {start|stop|restart|status} [{mon|osd|osd.<number>|mon.<hostname>}]"