Fixes some stability issues seen during route table update. 44/11744/1 4.9.2
authorJuha Hyttinen <juha.hyttinen@nokia.com>
Wed, 4 Oct 2023 18:46:01 +0000 (21:46 +0300)
committerJuha Hyttinen <juha.hyttinen@nokia.com>
Thu, 5 Oct 2023 12:48:45 +0000 (15:48 +0300)
Dont allow clean old_rtable until it's reference count is really 0.
Added more faster recognition for connect failure, so route table
update will not stuck in preparing new rtable while waiting, that
old_rtable reference count reaches 0.
"rtgate" lock is not needed to init and destroy continouosly.

Issue-ID: RIC-1020

Signed-off-by: Juha Hyttinen <juha.hyttinen@nokia.com>
Change-Id: Ia5a228679ef9e71634217405334a4b9dc2da01b7

12 files changed:
CHANGES_CORE.txt
CMakeLists.txt
docs/rel-notes.rst
src/rmr/common/include/rmr_agnostic.h
src/rmr/common/src/rt_generic_static.c
src/rmr/si/src/rmr_si.c
src/rmr/si/src/si95/siconnect.c
src/rmr/si/src/si95/siestablish.c
src/rmr/si/src/si95/sinewses.c
src/rmr/si/src/si95/sitransport.h
src/rmr/si/src/si95/socket_if.h
test/test_transport_em.c

index 461f47b..40abaa6 100644 (file)
@@ -5,6 +5,13 @@
 # API and build change  and fix summaries. Doc corrections
 # and/or changes are not mentioned here; see the commit messages.
 
+2023 Oct 4; version 4.9.2
+    Fixes some stability issues seen during route table update.
+    Dont allow clean old_rtable until it's reference count is really 0.
+    Added more faster recognition for connect failure so route table
+    update will not stuck so long in preparing new rtable.
+       rtgate lock is not needed to init and destroy continouosly.
+
 2023 May 28; version 4.9.1
         Fixes IPv6 support for binding to IPv6 interfaces (RIC-985)
 
index f854ace..a1cc9e8 100644 (file)
@@ -42,7 +42,7 @@ cmake_minimum_required( VERSION 3.5 )
 
 set( major_version "4" )               # should be automatically populated from git tag later, but until CI process sets a tag we use this
 set( minor_version "9" )
-set( patch_level "1" )
+set( patch_level "2" )
 
 set( install_root "${CMAKE_INSTALL_PREFIX}" )
 set( install_inc "include/rmr" )
index 46d904b..c33b44f 100644 (file)
@@ -21,6 +21,15 @@ the RMR core library was assigned odd major numbers (e.g.
 the need to leap frog versions ceased, and beginning with
 version 4.0.0, the RMR versions should no longer skip.
 
+2023 Oct 4; version 4.9.2
+------------------------------
+
+Fixes some stability issues seen during route table update.
+Dont allow clean old_rtable until it's reference count is really 0.
+Added more faster recognition for connect failure so route table
+update will not stuck so long in preparing new rtable.
+rtgate lock is not needed to init and destroy continouosly.
+
 
 2023 May 28; version 4.9.1
 ------------------------------
index 0858fde..4410dfd 100644 (file)
@@ -247,7 +247,6 @@ typedef struct {
        int             updates;                // counter of update records received
        int             mupdates;               // counter of meid update records received
        int             ref_count;              // num threads currently using
-       pthread_mutex_t*        gate;   // lock allowing update to ref counter
 } route_table_t;
 
 /*
index 0de69c8..5baf79a 100644 (file)
@@ -456,25 +456,19 @@ static char* ensure_nlterm( char* buf ) {
        is no active table (first load), so we have to account for that (no locking).
 */
 static void roll_tables( uta_ctx_t* ctx ) {
-
+       pthread_mutex_lock( ctx->rtgate );                              // must hold lock to move to active
        if( ctx->new_rtable == NULL || ctx->new_rtable->error ) {
                rmr_vlog( RMR_VL_WARN, "new route table NOT rolled in: nil pointer or error indicated\n" );
                ctx->old_rtable = ctx->new_rtable;
-               ctx->new_rtable = NULL;
-               return;
-       }
-
-       if( ctx->rtable != NULL ) {                                                     // initially there isn't one, so must check!
-               pthread_mutex_lock( ctx->rtgate );                              // must hold lock to move to active
+       }else if( ctx->rtable != NULL ) {                                                       // initially there isn't one, so must check!
                ctx->old_rtable = ctx->rtable;                                  // currently active becomes old and allowed to 'drain'
                ctx->rtable = ctx->new_rtable;                                  // one we've been adding to becomes active
-               pthread_mutex_unlock( ctx->rtgate );
        } else {
                ctx->old_rtable = NULL;                                         // ensure there isn't an old reference
                ctx->rtable = ctx->new_rtable;                          // make new the active one
        }
-
        ctx->new_rtable = NULL;
+       pthread_mutex_unlock( ctx->rtgate );
 }
 
 /*
@@ -1369,10 +1363,7 @@ static route_table_t* uta_rt_init( uta_ctx_t* ctx ) {
                return NULL;
        }
 
-       rt->gate = ctx->rtgate;                                         // single mutex needed for all route tables
        rt->ephash = ctx->ephash;                                       // all route tables share a common endpoint hash
-       pthread_mutex_init( rt->gate, NULL );
-
        return rt;
 }
 
@@ -1509,32 +1500,27 @@ static route_table_t* uta_rt_clone( uta_ctx_t* ctx, route_table_t* srt, route_ta
        do not need to run that portion of the table to deref like we do for the RTEs.
 */
 static route_table_t* prep_new_rt( uta_ctx_t* ctx, int all ) {
-       int counter = 0;
-       int ref_count;
+       //int counter = 0;
        route_table_t*  rt;
 
        if( ctx == NULL ) {
                return NULL;
        }
 
+       pthread_mutex_lock( ctx->rtgate );
        if( (rt = ctx->old_rtable) != NULL ) {
                ctx->old_rtable = NULL;
 
-               pthread_mutex_lock( ctx->rtgate );
-               ref_count = rt->ref_count;
-               pthread_mutex_unlock( ctx->rtgate );
-
-               while( ref_count > 0 ) {                                // wait for all who are using to stop
-                       if( counter++ > 1000 ) {
-                               rmr_vlog( RMR_VL_WARN, "rt_prep_newrt:  internal mishap, ref count on table seems wedged" );
-                               break;
-                       }
+               while(  rt->ref_count > 0 ) {                           // wait for all who are using to stop
+                       //if( counter++ > 1000 ) {
+                       //      rmr_vlog( RMR_VL_WARN, "rt_prep_newrt:  internal mishap, ref count on table seems wedged" );
+                       //      break;
+                       //}
 
+                       pthread_mutex_unlock( ctx->rtgate );
                        usleep( 1000 );                                         // small sleep to yield the processer if that is needed
-
                        pthread_mutex_lock( ctx->rtgate );
-                       ref_count = rt->ref_count;
-                       pthread_mutex_unlock( ctx->rtgate );
+
                }
 
                if( rt->hash != NULL ) {
@@ -1546,8 +1532,8 @@ static route_table_t* prep_new_rt( uta_ctx_t* ctx, int all ) {
        } else {
                rt = NULL;
        }
+       pthread_mutex_unlock( ctx->rtgate );
 
-       pthread_mutex_destroy(ctx->rtgate);
        rt = uta_rt_clone( ctx, ctx->rtable, rt, all );         // also sets the ephash pointer
        if( rt != NULL ) {                                                                      // very small chance for nil, but not zero, so test
                rt->ref_count = 0;                                                              // take no chances; ensure it's 0!
index e50a8df..66724e6 100644 (file)
@@ -703,6 +703,7 @@ static void* init( char* uproto_port, int def_msg_size, int flags ) {
        if( ctx->si_ctx == NULL ) {
                return init_err( "unable to initialise SI95 interface\n", ctx, proto_port, 0 );
        }
+       SIset_tflags(ctx->si_ctx,SI_TF_QUICK);
 
        if( (port = strchr( proto_port, ':' )) != NULL ) {
                if( port == proto_port ) {              // ":1234" supplied; leave proto to default and point port correctly
@@ -814,7 +815,6 @@ static void* init( char* uproto_port, int def_msg_size, int flags ) {
                return init_err( "unable to allocate ep hash\n", ctx, proto_port, ENOMEM );
        }
 
-       pthread_mutex_destroy(ctx->rtgate);
        ctx->rtable = rt_clone_space( ctx, NULL, NULL, 0 );     // create an empty route table so that wormhole/rts calls can be used
        if( flags & RMRFL_NOTHREAD ) {                                          // no thread prevents the collector start for very special cases
                ctx->rtable_ready = 1;                                                  // route based sends will always fail, but rmr is ready for the non thread case
index ad5135d..5649fab 100644 (file)
@@ -33,6 +33,8 @@
 *                              17 Apr 2020 - Add safe connect capabilities
 ******************************************************************************
 */
+#include <netinet/tcp.h>
+
 #include "sisetup.h"
 #include "sitransport.h"
 
@@ -114,7 +116,9 @@ extern int SIconnect( struct ginfo_blk *gptr, char *abuf ) {
        struct sockaddr *taddr;         // convenience pointer to addr of target
        int alen = 0;                                   //  len of address struct
        int fd = SI_ERROR;              //  file descriptor to return to caller
-
+       int optvalrlen;
+       int optvalr;
+       int optvalw;
        if( PARANOID_CHECKS ) {
                if( gptr == NULL ) {
                        return SI_ERROR;
@@ -129,6 +133,14 @@ extern int SIconnect( struct ginfo_blk *gptr, char *abuf ) {
        if( tpptr != NULL ) {
                taddr = tpptr->paddr;
                errno = 0;
+
+               if( gptr->tcp_flags & SI_TF_QUICK ) {
+                       optvalrlen = sizeof(optvalr);
+                       GETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_SYNCNT, (void *)&optvalr, &optvalrlen) ;
+                       optvalw=2;
+                       SETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_SYNCNT, (void *)&optvalw, sizeof( optvalw) ) ;
+               }
+
                if( tpptr->flags & TPF_SAFEC ) {
                        if( safe_connect( tpptr->fd, taddr, tpptr->palen ) != 0 ) {             // fd closed on failure
                                tpptr->fd = -1;
@@ -141,6 +153,10 @@ extern int SIconnect( struct ginfo_blk *gptr, char *abuf ) {
                }
 
                if( tpptr->fd >= 0 ) {                                                          // connect ok
+                       if( gptr->tcp_flags & SI_TF_QUICK ) {
+                               SETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_SYNCNT, (void *)&optvalr, sizeof( optvalr) ) ;
+                       }
+
                        tpptr->flags |= TPF_SESSION;                    //  indicate we have a session here
                        tpptr->next = gptr->tplist;                     //  add block to the list
                        if( tpptr->next != NULL ) {
index 2fa7e04..f2bb752 100644 (file)
@@ -214,6 +214,17 @@ extern struct tp_blk *SIconn_prep( struct ginfo_blk *gptr, int type, char *abuf,
                        }
                        SETSOCKOPT( tptr->fd, SOL_TCP, TCP_QUICKACK, (void *)&optval, sizeof( optval) ) ;
 
+                       if( gptr->tcp_flags & SI_TF_QUICK ) {
+                               optval = 1;
+                               SETSOCKOPT( tptr->fd, SOL_SOCKET, SO_KEEPALIVE, (void *)&optval, sizeof( optval) ) ;
+                               optval = 1;
+                               SETSOCKOPT( tptr->fd, IPPROTO_TCP, TCP_KEEPIDLE, (void *)&optval, sizeof( optval) ) ;
+                               optval = 1;
+                               SETSOCKOPT( tptr->fd, IPPROTO_TCP, TCP_KEEPINTVL, (void *)&optval, sizeof( optval) ) ;
+                               optval = 5;
+                               SETSOCKOPT( tptr->fd, IPPROTO_TCP, TCP_KEEPCNT, (void *)&optval, sizeof( optval) ) ;
+                       }
+
                        tptr->paddr = addr;                             // tuck the remote peer address away
                        if( need_smartc( abuf ) ) {
                                tptr->flags |= TPF_SAFEC;
index 8665892..1549217 100644 (file)
@@ -89,6 +89,17 @@ extern int SInewsession( struct ginfo_blk *gptr, struct tp_blk *tpptr ) {
        }
        SETSOCKOPT( tpptr->fd, SOL_TCP, TCP_QUICKACK, (void *)&optval, sizeof( optval) ) ;
 
+       if( gptr->tcp_flags & SI_TF_QUICK ) {
+               optval = 1;
+               SETSOCKOPT( tpptr->fd, SOL_SOCKET, SO_KEEPALIVE, (void *)&optval, sizeof( optval) ) ;
+               optval = 1;
+               SETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_KEEPIDLE, (void *)&optval, sizeof( optval) ) ;
+               optval = 1;
+               SETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_KEEPINTVL, (void *)&optval, sizeof( optval) ) ;
+               optval = 5;
+               SETSOCKOPT( tpptr->fd, IPPROTO_TCP, TCP_KEEPCNT, (void *)&optval, sizeof( optval) ) ;
+       }
+
        SIaddress( addr, (void **) &buf, AC_TODOT );                                                    // get addr of remote side; buf must be freed
        if( (cbptr = gptr->cbtab[SI_CB_SECURITY].cbrtn) != NULL ) {                             //   invoke the security callback function if there
                status = (*cbptr)( gptr->cbtab[SI_CB_SECURITY].cbdata, buf );
index b51b0d6..f93528b 100644 (file)
@@ -71,7 +71,7 @@
 #define ACCEPT         accept
 #define CLOSE          close
 #define SHUTDOWN       shutdown
-#define        GETSOCKOPT      getscokopt 
+#define        GETSOCKOPT      getsockopt 
 #define SETSOCKOPT     setsockopt
 #define READ           read
 #define WRITE          write
index eb4e69b..ccbff79 100644 (file)
 #define SI_TF_NONE             0               // tcp flags in the global info applied to each session
 #define SI_TF_NODELAY  0x01    // set nagle's off for each connection
 #define SI_TF_FASTACK  0x02    // set fast ack on for each connection
+#define SI_TF_QUICK     0x04   // set keepalive for each connection and reduce syn amount while doing connect
 
 #ifndef _SI_ERRNO
 extern int SIerrno;               //  error number set by public routines
index 2ca3f08..4957745 100644 (file)
@@ -198,7 +198,7 @@ static int tpem_send( int fd, void* buf, int count, int flags ) {
 */
 #define CLOSE          close
 #define SHUTDOWN       shutdown
-#define        GETSOCKOPT      getscokopt
+#define        GETSOCKOPT      getsockopt
 #define SETSOCKOPT     setsockopt
 #define READ           read
 #define WRITE          write