From: E. Scott Daniels Date: Tue, 23 Mar 2021 16:40:55 +0000 (-0400) Subject: Correct bug in listener not detecting eagain/eintr correctly X-Git-Tag: 1.10.0^0 X-Git-Url: https://gerrit.o-ran-sc.org/r/gitweb?a=commitdiff_plain;ds=inline;h=71b42c4cc711c1f917c4fa3d180aaa217a1a7196;p=ric-app%2Fmc.git Correct bug in listener not detecting eagain/eintr correctly It was possible for the listener to write a partial payload into the pipe and completely bugger the reader with an out of synch state. This change adds the necessary code for the listener to properly handle an interrupted write operation while still detecting a 'no reader' sate on a pipe. Issue-ID: RIC-6768 Signed-off-by: E. Scott Daniels Change-Id: Ifc108981cb654c78e91f04e9055e8b4b6cb01052 Signed-off-by: E. Scott Daniels --- diff --git a/sidecars/listener/Dockerfile b/sidecars/listener/Dockerfile index b01a0aa..5968648 100644 --- a/sidecars/listener/Dockerfile +++ b/sidecars/listener/Dockerfile @@ -50,7 +50,7 @@ RUN apt-get update && apt-get install -y cmake gcc make git g++ wget WORKDIR /playpen # Install RMr (runtime and dev) from debian package cached on packagecloud.io -ARG RMR_VER=4.5.2 +ARG RMR_VER=4.6.0 ARG RMR_PC_REPO=release #ARG RMR_PC_REPO=staging diff --git a/sidecars/listener/container-tag.yaml b/sidecars/listener/container-tag.yaml index 1f2c599..6f465f5 100644 --- a/sidecars/listener/container-tag.yaml +++ b/sidecars/listener/container-tag.yaml @@ -1,4 +1,4 @@ --- -tag: '1.9.0' +tag: '1.10.0' # this is used by CI jobs to apply a tag when it builds the image diff --git a/sidecars/listener/src/mcl.c b/sidecars/listener/src/mcl.c index 0077e96..60346ba 100644 --- a/sidecars/listener/src/mcl.c +++ b/sidecars/listener/src/mcl.c @@ -100,7 +100,7 @@ static void* setup_rdc() { char* done = NULL; if( (ep = getenv( "MCL_RDC_ENABLE" )) != NULL && atoi( ep ) == 0 ) { // exists and is 0 - logit( LOG_INFO, "(mcl) raw data capture disabled by environment var setting (MCL_RDCENABLE=0)\n" ); + logit( LOG_INFO, "(mcl) raw data capture disabled by environment var setting (MCL_RDCENABLE=0)" ); return NULL; } @@ -148,7 +148,7 @@ static void* setup_rdc() { Field lengths (bytes) are: - delim 4 + delim 4 len 8 (7 digits + 0) timestamp 16 (15 digits + 0) @@ -275,6 +275,10 @@ static int suss_fifo( mcl_ctx_t* ctx, int mtype, int io_dir, fifo_t** fref ) { fifo = NULL; } } + } else { + if( fifo->fd < 0 ) { // it existed, but was closed; reopen + fifo->fd = open_fifo( ctx, mtype, io_dir ); + } } if( fref != NULL ) { @@ -284,6 +288,32 @@ static int suss_fifo( mcl_ctx_t* ctx, int mtype, int io_dir, fifo_t** fref ) { return fifo == NULL ? -1 : fifo->fd; } +/* + Should we need to close a FIFO we do so and leave the block in the hash + with a bad FD so that we'll attempt to reopen on next use. +*/ +static void close_fifo( mcl_ctx_t* ctx, int mtype, int io_dir ) { + fifo_t* fifo; + void* hash; + + if( ctx == NULL ) { + return; + } + + if( io_dir == READER ) { // with an integer key, we need two hash tables + hash = ctx->rd_hash; + } else { + hash = ctx->wr_hash; + } + + if( (fifo = (fifo_t *) rmr_sym_pull( hash, mtype )) != NULL ) { + if( fifo->fd >= 0 ) { + close( fifo->fd ); + fifo->fd = -1; + } + } +} + /* Make marking counts easier in code */ @@ -334,6 +364,69 @@ static void wr_stats( void* st, void* entry, char const* name, void* thing, void } } +/* + Writes the indicated bytes (n2write) from buf onto the fd. Returns only after + full buffer is written, or there is a hard error (not eagain or eintr). + Returns the number written; if less than n2write the caller may assume + that there was a hard error and errno should reflect. +*/ +static inline int write_all( int fd, char const* buf, int n2write ) { + ssize_t remain = 0; // number of bytes remaining to write + ssize_t wrote = 0; // number of bytes written thus far + ssize_t state = 0; + + if( fd < 0 ) { + errno = EBADFD; + return 0; + } + + errno = 0; + remain = n2write; + do { + if( (state = write( fd, buf + wrote, remain )) > 0 ) { + wrote += state; + remain = n2write - wrote; + } + } while( remain > 0 && (errno == EINTR || errno == EAGAIN) ) ; + + return wrote; +} + +/* + Similar to write_all, this will write all bytes in the buffer, but + will return failure if the first write attempt fails with 0 written + (assuming that the pipe has no reader). We use this when writing the + header bytes; we want to drop the message if we can't even write one + byte, but if we write one, we must loop until all are written. + + Returns the number written. If that value is less than n2write, then + the caller may assume a hard error occurred and errno should reflect. + If 0 is returned it can be assumed that the FIFO would block/has no + reader. +*/ +static inline int write_all_nb( int fd, char const* buf, int n2write ) { + ssize_t remain = 0; // number of bytes remaining to write + ssize_t wrote = 0; // number of bytes written + + if( fd < 0 ) { + errno = EBADFD; + return 0; + } + + errno = 0; + remain = n2write; + wrote = write( fd, buf, remain ); + if( wrote < 0 ) { // report error with exception for broken pipe + return errno == EPIPE ? 0 : -1; // broken pipe we assume no reader and return 0 since nothing written + } + + if( wrote < n2write && wrote > 0 ) { // if we wrote anything, we must tough it out and write all if it was short + wrote += write_all( fd, buf + wrote, n2write - wrote ); + } + + return wrote; +} + // ---------- public ------------------------------------------------------ /* Sets a signal handler for sigpipe so we don't crash if a reader closes the @@ -601,6 +694,7 @@ extern void mcl_fifo_fanout( void* vctx, int report, int long_hdr ) { time_t next_report = 0; // we'll report every 2 seconds if report is true time_t now; size_t hwlen; // write len for header + size_t wrote; // number of bytes actually written void* rdc_ctx = NULL; // raw data capture context void* rdc_buf = NULL; // capture buffer @@ -640,25 +734,32 @@ extern void mcl_fifo_fanout( void* vctx, int report, int long_hdr ) { hwlen = MCL_LEN_SIZE; } - fd = suss_fifo( ctx, mbuf->mtype, WRITER, &fifo ); // map the message type to an open fd + fd = suss_fifo( ctx, mbuf->mtype, WRITER, &fifo ); // map the message type to an open fd if( fd >= 0 ) { - if( write( fd, header, hwlen ) != hwlen ) { // write exactly MCL_LEN_SIZE bytes from the buffer + if( (wrote = write_all_nb( fd, header, hwlen )) == 0 ) { // write header; 0 indicates no reader, drop silently drops++; total_drops++; chalk_error( fifo ); } else { - if( write( fd, mbuf->payload, mbuf->len ) < mbuf->len ) { // followed by the payload + if( wrote != hwlen ) { + logit( LOG_ERR, "(mcl): error writing header to fifo; mt=%d wrote=%d tried=%d: %s", mbuf->mtype, wrote, hwlen, strerror( errno ) ); errors++; chalk_error( fifo ); + close_fifo( ctx, mbuf->mtype, WRITER ); } else { - chalk_ok( fifo ); - count++; - total++; + if( write_all( fd, mbuf->payload, mbuf->len ) != mbuf->len ) { // we wrote a header, so we must write all; no drop at this point + logit( LOG_ERR, "(mcl): error writing payload to fifo; mt=%d: %s\n", mbuf->mtype, strerror( errno ) ); + close_fifo( ctx, mbuf->mtype, WRITER ); + } else { + chalk_ok( fifo ); + count++; + total++; + } } } } - if( rdc_ctx != NULL ) { + if( rdc_ctx != NULL ) { // always put the message to the rdc files if collecting; eve if pipe write failed rdc_buf = rdc_init_buf( mbuf->mtype, header, hwlen, rdc_buf ); // set up for write rdc_write( rdc_ctx, rdc_buf, mbuf->payload, mbuf->len ); // write the raw data } diff --git a/sidecars/listener/src/pipe_reader.c b/sidecars/listener/src/pipe_reader.c index ae0a1da..88ad08f 100644 --- a/sidecars/listener/src/pipe_reader.c +++ b/sidecars/listener/src/pipe_reader.c @@ -49,6 +49,7 @@ static void usage( char* argv0 ) { fprintf( stderr, " -d dir (default is /tmp/mcl/fifos)\n" ); fprintf( stderr, " -e disable extended headers expectation in FIFO data\n" ); fprintf( stderr, " -m msg-type (default is 0)\n" ); + fprintf( stderr, " -M max msg to read (default is all)\n" ); fprintf( stderr, " -s stats only mode\n" ); } @@ -76,6 +77,7 @@ int main( int argc, char** argv ) { char timestamp[MCL_TSTAMP_SIZE]; // we'll get the timestamp from this long count = 0; int blabber = 0; + int max = 0; // we'll force one reader down early to simulate MC going away signal( SIGINT, sigh ); signal( SIGTERM, sigh ); @@ -110,6 +112,16 @@ int main( int argc, char** argv ) { } break; + case 'M': + if( pidx+1 < argc ) { + max = atoi( argv[pidx+1] ); + pidx++; + } else { + bad_arg( argv[pidx] ); + error = 1; + } + break; + case 's': stats_only = 1; break; @@ -139,7 +151,8 @@ int main( int argc, char** argv ) { exit( 1 ); } - while( 1 ) { + fprintf( stderr, "[INFO] max = %d\n", max ); + while( max == 0 || count < max ) { len = mcl_fifo_tsread1( ctx, mtype, buf, sizeof( buf ) -1, long_hdrs, timestamp ); if( len > 0 ) { if( stats_only ) { @@ -156,7 +169,11 @@ int main( int argc, char** argv ) { } count++; + } else { + sleep( 1 ); } } + + fprintf( stderr, "[INFO] max reached: %d\n", max ); } diff --git a/sidecars/listener/src/sender.c b/sidecars/listener/src/sender.c index 5f654df..ea22155 100644 --- a/sidecars/listener/src/sender.c +++ b/sidecars/listener/src/sender.c @@ -129,11 +129,11 @@ int main( int argc, char** argv ) { sbuf = rmr_send_msg( mrc, sbuf ); // retry send until it's good (simple test; real programmes should do better) } count++; - fprintf( stderr, " sent message\n" ); + fprintf( stderr, " sent message type=%d\n", mtype ); usleep( delay ); mtype++; - if( mtype > 6 ) { + if( mtype > 8 ) { // ensure we send a mt that doesn't have a fifo reader to ensure we don't block mtype = 1; } } diff --git a/sidecars/listener/src/verify.sh b/sidecars/listener/src/verify.sh index d98a16f..a50bcce 100755 --- a/sidecars/listener/src/verify.sh +++ b/sidecars/listener/src/verify.sh @@ -79,11 +79,12 @@ function run_listener { # run a pipe reader for one message type function run_pr { - echo "starting pipe reader $1" - $bin_dir/pipe_reader $ext_hdr -m $1 -d $fifo_dir >/tmp/pr.$1.log 2>&1 & - #$bin_dir/pipe_reader -m $1 -d $fifo_dir & # >/tmp/pr.$1.log 2>&1 + echo "starting pipe reader $1 $max_flag" + set -x + $bin_dir/pipe_reader $ext_hdr $max_flag -m $1 -d $fifo_dir >/tmp/pr.$1.log 2>&1 & + set +x typeset prpid=$! - + sleep $reader_wait echo "stopping pipe reader $ppid" kill -15 $prpid @@ -93,13 +94,16 @@ function run_pr { function gen_rt { cat </tmp/local.rt newrt|start - mse | 0 | -1 | localhost:4560 - mse | 1 | -1 | localhost:4560 - mse | 2 | -1 | localhost:4560 - mse | 3 | -1 | localhost:4560 - mse | 4 | -1 | localhost:4560 - mse | 5 | -1 | localhost:4560 - mse | 6 | -1 | localhost:4560 + mse | 0 | -1 | localhost:4560 + mse | 1 | -1 | localhost:4560 + mse | 2 | -1 | localhost:4560 + mse | 3 | -1 | localhost:4560 + mse | 4 | -1 | localhost:4560 + mse | 5 | -1 | localhost:4560 + mse | 6 | -1 | localhost:4560 + mse | 7 | -1 | localhost:4560 + mse | 8 | -1 | localhost:4560 + mse | 9 | -1 | localhost:4560 newrt|end endKat } @@ -110,12 +114,14 @@ si="" # if -s given then we add this to sender/listener to run SI95 version ext_hdr="" # run with extended header enabled (-e turns extended off) long_test=0 raw_capture=1 +verbose=0 while [[ $1 == -* ]] do - case $1 in + case $1 in -l) long_test=1;; -n) raw_capture=0;; -s) si="si_";; + -v) verbose=1;; *) echo "$1 is not a recognised option" exit 1 ;; @@ -160,16 +166,30 @@ gen_rt # generate a dummy route table run_listener & sleep 4 -for p in 0 1 2 3 4 5 6 +# the sender will send types 0-8 inclusive; we only start 7 readers to +# endure listener doesn't hang on pipes without a reader +# +for p in 0 1 2 3 4 5 do - run_pr $p & + run_pr $p & # all but last have no max read done -sleep 1 +max_flag="-M 10" +run_pr 6 & + +sleep 1 # let the readers settle run_sender & sleep $main_wait # long enough for all functions to finish w/o having to risk a wait hanging echo "all functions stopped; looking at logs" +if (( verbose )) +then + echo "[INFO] ---- mc_lisener log follwos --------------------------" + cat /tmp/listen.log + echo "[INFO] ------------------------------------------------------" +fi + + # ---------- validation ------------------------------------------------- errors=0 @@ -197,7 +217,7 @@ do fi done -if (( ! errors )) +if (( ! errors )) then echo "[OK] All logs seem good" fi @@ -228,7 +248,7 @@ then then echo "[OK] Found final direcory ($final_dir)" ls -al $final_dir - + if (( long_test )) # look for files in final dir to ensure roll then found=$( ls $final_dir/MC* | wc -l ) diff --git a/sidecars/listener/src/verify_replay.sh b/sidecars/listener/src/verify_replay.sh index 8ef769e..5acefc6 100755 --- a/sidecars/listener/src/verify_replay.sh +++ b/sidecars/listener/src/verify_replay.sh @@ -43,7 +43,7 @@ function run_replay { chmod 644 $file set -x - $bin_dir/rdc_replay -f $file -d $fifo_dir >/tmp/replay.log 2>&1 + $bin_dir/rdc_replay -f $file -d $fifo_dir >$log_dir/replay.log 2>&1 lpid=$! set +x echo "replay finished" @@ -52,8 +52,7 @@ function run_replay { # run a pipe reader for one message type function run_pr { echo "starting pipe reader $1" - $bin_dir/pipe_reader $ext_hdr -m $1 -d $fifo_dir >/tmp/pr.$1.log 2>&1 & - #$bin_dir/pipe_reader -m $1 -d $fifo_dir & # >/tmp/pr.$1.log 2>&1 + $bin_dir/pipe_reader $ext_hdr -m $1 -d $fifo_dir >$log_dir/pr.$1.log 2>&1 & typeset prpid=$! sleep $reader_wait @@ -63,6 +62,9 @@ function run_pr { # ---- run everything --------------------------------------------------- +log_dir=/tmp/mcl_verify +mkdir -p $log_dir + ext_hdr="" # run with extended header enabled (-e turns extended off) run_listener=0 # -a turns on to run all while [[ $1 == -* ]] @@ -138,7 +140,7 @@ errors=0 # logs should be > 0 in size echo "----- logs ---------" -ls -al /tmp/*.log +ls -al $log_dir/*.log # pipe reader log files 1-6 should have 'stand up and cheer' messages # pipe reader log for MT 0 will likley be empty as sender sends only @@ -146,14 +148,14 @@ ls -al /tmp/*.log # for l in 1 2 3 4 5 6 do - if [[ ! -s /tmp/pr.$l.log ]] + if [[ ! -s $log_dir/pr.$l.log ]] then echo "[FAIL] log $l was empty" (( errors++ )) else - if ! grep -q -i "stand up and cheer" /tmp/pr.$l.log + if ! grep -q -i "stand up and cheer" $log_dir/pr.$l.log then - echo "[FAIL] pipe reader log did not have any valid messages: /tmp/pr.$l.log" + echo "[FAIL] pipe reader log did not have any valid messages: $log_dir/pr.$l.log" (( errors++ )) fi fi diff --git a/sidecars/listener/test/README b/sidecars/listener/test/README index b8b3b9a..6d2c85d 100644 --- a/sidecars/listener/test/README +++ b/sidecars/listener/test/README @@ -11,3 +11,31 @@ The run_app_test script drives the listener application and tools, as well as the test sender, to generate coverage. There is little vetting done here; any problems should be discovered by the verify scripts in ../src at the time the code is pushed. This is simply a coverage test for sonar. + + +Verification +The unit and app test scripts pretty much verify sucessful runs. If it is +necessary to see things like whether or not the "failure" of a pipe +reader affects the listener (blocks it), then running the app test with +-v and looking at the listener output will help. Lines like the following +give clues about this: + +1616516620 [STAT] (mcl) mtype=0 total writes=1 total drops=0; during last 1s writes=0 drops=0 +1616516620 [STAT] (mcl) mtype=1 total writes=88 total drops=0; during last 1s writes=24 drops=0 +1616516620 [STAT] (mcl) mtype=2 total writes=88 total drops=0; during last 1s writes=24 drops=0 +1616516620 [STAT] (mcl) mtype=3 total writes=88 total drops=0; during last 1s writes=24 drops=0 +1616516620 [STAT] (mcl) mtype=4 total writes=88 total drops=0; during last 1s writes=24 drops=0 +1616516620 [STAT] (mcl) mtype=5 total writes=88 total drops=0; during last 1s writes=24 drops=0 +1616516620 [STAT] (mcl) mtype=6 total writes=10 total drops=78; during last 1s writes=0 drops=25 +1616516620 [STAT] (mcl) mtype=7 total writes=0 total drops=88; during last 1s writes=0 drops=25 +1616516620 [STAT] (mcl) mtype=8 total writes=0 total drops=88; during last 1s writes=0 drops=25 + +The sender sends message types 0-8 inclusive, but there are only 7 listeners +started (0-6) so we should never see successful wirte counts for types 7 and 8. +Further, pipe reader for mt 6 stops early, and so we should see some drops after +it has stopped; It stops after 10 successful reads, so the number of writes +should be 10 or 11 depending on timing of the processes. There shouldn't be +any drops on the other FIFOs. + +It is difficult, if not impossible, to test the logic that detects an +interrupted write on the pipe to ensure that all data is written. diff --git a/sidecars/listener/test/run_app_tests.ksh b/sidecars/listener/test/run_app_tests.ksh index f2fd220..a8994a6 100755 --- a/sidecars/listener/test/run_app_tests.ksh +++ b/sidecars/listener/test/run_app_tests.ksh @@ -35,11 +35,11 @@ # This is a hack! There seems not to be an easy way to have the LF -# environment add RMR (or other needed packages) for testing. If we don't +# environment adds RMR (or other needed packages) for testing. If we don't # find RMR in the /usr/local part of the filesystem, we'll force it into # /tmp which doesn't require root. We'll be smart and get the desired -# rmr version from the repo root juas as we _expected_ the CI environmnt -# woudl do (but seems not to). +# rmr version from the repo root just as we _expected_ the CI environmnt +# would do (but seems not to). # function ensure_pkgs { if (( no_rmr_load )) @@ -112,11 +112,13 @@ then fi +verbose=0 while [[ $1 == -* ]] do case $1 in -f) force_rmr_load=1;; -N) no_rmr_load=1;; # for local testing + -v) verbose=1; vflag="-v";; *) echo "unrecognised option: $1" exit 1 @@ -134,6 +136,11 @@ cd ../src export TEST_COV_OPTS="-ftest-coverage -fprofile-arcs" # picked up by make so we get coverage on tools for sonar make clean # ensure coverage files removed make -B # force build under the eyes of sonar build wrapper +if (( $? != 0 )) +then + echo "[FAIL] build failed" + exit +fi rm -fr *.gcov *.gcda # ditch any previously generated coverage info @@ -157,7 +164,7 @@ do ./rdc_replay -$x >/dev/null 2>&1 # drive each exception (missing value) or 'help' done -./verify.sh # verify MUST be first (replay relies on its output) +./verify.sh $vflag # verify MUST be first (replay relies on its output) ./verify_replay.sh # generate and copy coverage files to parent which is where the CI jobs are looking for them diff --git a/sidecars/listener/test/run_unit_test.ksh b/sidecars/listener/test/run_unit_test.ksh index 2c7d7fc..2a90412 100755 --- a/sidecars/listener/test/run_unit_test.ksh +++ b/sidecars/listener/test/run_unit_test.ksh @@ -142,6 +142,7 @@ fi running=/tmp/PID$$.running force_rmr_load=0 ci_mode=1 # -c turns off; see the flower box above +verbose=0 while [[ $1 == -* ]] do @@ -149,6 +150,7 @@ do -c) ci_mode=0;; -f) force_rmr_load=1;; -N) no_rmr_load=1;; # for local testing + -v) verbose=1;; *) echo "unrecognised option: $1" exit 1 @@ -168,7 +170,7 @@ ensure_pkgs # ensure that we have RMR; some CI environments are lacking if (( ci_mode )) # in CI mode we must force a build in the src so build wrapper captures trigger data then - echo "building in src for sonar build wrapper capture" + echo "unit_test: building in src for sonar build wrapper capture" ( cd ../src export TEST_COV_OPTS="-ftest-coverage -fprofile-arcs" # picked up by make so we get coverage on tools for sonar @@ -176,23 +178,26 @@ then ) fi -echo "building unit test(s)" +echo "[INFO] unit_test: building unit test(s)" if ! make -B unit_test # ensure that it's fresh then echo "[FAIL] cannot make unit_test" exit 1 fi -echo "unit test(s) successfully built" +echo "[INFO] unit_test: unit test(s) successfully built" setup_dirs +echo "[INFO] unit_test: directories set up" if [[ $1 == "set"* ]] # setup only then + echo "[INFO] unit_test: setup only; exiting good" exit fi rm -fr *.gcov *.gcda # ditch any previously generated coverage info +echo "[INFO] unit_test: running tests -- may take as long as 120 seconds" ./unit_test >/tmp/PID$$.utlog 2>&1 & pid=$! abort_after 60 $pid & diff --git a/sidecars/listener/test/unit_test.c b/sidecars/listener/test/unit_test.c index e9f689a..76aa72c 100644 --- a/sidecars/listener/test/unit_test.c +++ b/sidecars/listener/test/unit_test.c @@ -124,6 +124,9 @@ int main( int argc, char** argv ) { errors++; } + close_fifo( ctx, TEST_MTYPE, WRITER ); // close one we know is there + close_fifo( ctx, 84306, WRITER ); // coverage on error case + mcl_start_listening( ctx, port, 0 ); // start the listener // under test, the FOREVER = 0 keeps fanout from blocking; drive several times to cover all cases