Richard W.M. Jones
2019-Nov-04 15:42 UTC
[Libguestfs] [PATCH nbdkit 0/3] server: Fix crash on close.
This fixes the long-standing crash on close when nbdkit exits.

I did first try to fix the threads so that we use a proper thread
pool, but that's difficult to implement.  So this does the minimal
change needed to fix the crash instead.

There are still two crashes that happen while running the test suite.
One is caused deliberately (tests/test-captive.sh).  The other appears
to be an assertion failure bug in the new finalize code:

Thread 2 (Thread 0x7f12b820fa40 (LWP 231456)):
#0  0x00007f12b8783a1f in __GI___poll (fds=0x55771f18c550, nfds=nfds@entry=2, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/poll.c:29
#1  0x00005577156b9646 in poll (__timeout=-1, __nfds=2, __fds=<optimized out>) at /usr/include/bits/poll2.h:46
#2  check_sockets_and_quit_fd (nr_socks=1, socks=0x55771f18c2e0) at sockets.c:466
#3  accept_incoming_connections (socks=0x55771f18c2e0, nr_socks=1) at sockets.c:494
#4  0x00005577156ac6ac in start_serving () at main.c:914
#5  main (argc=<optimized out>, argv=0x7ffd0bcb15d8) at main.c:685

Thread 1 (Thread 0x7f12b820e700 (LWP 231469)):
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007f12b86b28d9 in __GI_abort () at abort.c:79
#2  0x00007f12b86b27a9 in __assert_fail_base (fmt=0x7f12b881db18 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x5577156bb09d "h->state & HANDLE_CONNECTED", file=0x5577156bb064 "backend.c", line=240, function=<optimized out>) at assert.c:92
#3  0x00007f12b86c1a66 in __GI___assert_fail (assertion=assertion@entry=0x5577156bb09d "h->state & HANDLE_CONNECTED", file=file@entry=0x5577156bb064 "backend.c", line=line@entry=240, function=function@entry=0x5577156bb880 <__PRETTY_FUNCTION__.6528> "backend_finalize") at assert.c:101
#4  0x00005577156ad04e in backend_finalize (b=<optimized out>, conn=conn@entry=0x55771f1ac710) at backend.c:240
#5  0x00005577156b6ea8 in negotiate_handshake_newstyle_options (conn=<optimized out>) at protocol-handshake-newstyle.c:484
#6  protocol_handshake_newstyle (conn=0x55771f1ac710) at protocol-handshake-newstyle.c:762
#7  0x00005577156b5705 in protocol_handshake (conn=conn@entry=0x55771f1ac710) at protocol-handshake.c:55
#8  0x00005577156af73a in handle_single_connection (sockin=<optimized out>, sockout=15) at connections.c:167
#9  0x00005577156b8a48 in start_thread (datav=0x55771f18c620) at sockets.c:356
#10 0x00007f12b885f4e2 in start_thread (arg=<optimized out>) at pthread_create.c:479
#11 0x00007f12b878e643 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

Rich.
Richard W.M. Jones
2019-Nov-04 15:42 UTC
[Libguestfs] [PATCH nbdkit 1/3] server: Don't return connection status from handle_single_connection.
This was only used in one place: when using the -s option on the
command line.  In this case nbdkit -s would exit with EXIT_FAILURE if
there was some kind of I/O error or early client close.  This is not
particularly useful because:

(1) It prevents the plugin being unloaded properly.

(2) Client close is a normal shutdown strategy for some clients, so
not really an error.

(3) It's undocumented and inconsistent with other server modes.

(4) It's awkward when doing fuzzing.

Note that connection_get_status (added in commit 0e1f158a1a
"connections: Add thread-safe status indicator") is still required to
coordinate connection shutdown across threads.
---
 server/connections.c | 21 +++++----------------
 server/internal.h    |  2 +-
 server/main.c        |  3 +--
 3 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/server/connections.c b/server/connections.c
index 95d8296..3adba7b 100644
--- a/server/connections.c
+++ b/server/connections.c
@@ -132,15 +132,17 @@ connection_worker (void *data)
   return NULL;
 }
 
-static int
-_handle_single_connection (int sockin, int sockout)
+void
+handle_single_connection (int sockin, int sockout)
 {
   const char *plugin_name;
-  int ret = -1, r;
+  int r;
   struct connection *conn;
   int nworkers = threads ? threads : DEFAULT_PARALLEL_REQUESTS;
   pthread_t *workers = NULL;
 
+  lock_connection ();
+
   if (backend->thread_model (backend) < NBDKIT_THREAD_MODEL_PARALLEL ||
       nworkers == 1)
     nworkers = 0;
@@ -222,22 +224,9 @@ _handle_single_connection (int sockin, int sockout)
   if (r == -1)
     goto done;
 
-  ret = connection_get_status (conn);
  done:
   free_connection (conn);
-  return ret;
-}
-
-int
-handle_single_connection (int sockin, int sockout)
-{
-  int r;
-
-  lock_connection ();
-  r = _handle_single_connection (sockin, sockout);
   unlock_connection ();
-
-  return r;
 }
 
 static struct connection *
diff --git a/server/internal.h b/server/internal.h
index bb667ce..b28b1c8 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -234,7 +234,7 @@ struct connection {
   connection_close_function close;
 };
 
-extern int handle_single_connection (int sockin, int sockout);
+extern void handle_single_connection (int sockin, int sockout);
 extern int connection_get_status (struct connection *conn)
   __attribute__((__nonnull__ (1)));
 extern int connection_set_status (struct connection *conn, int value)
diff --git a/server/main.c b/server/main.c
index 3e2ac2f..dcfdbde 100644
--- a/server/main.c
+++ b/server/main.c
@@ -894,8 +894,7 @@ start_serving (void)
     change_user ();
     write_pidfile ();
     threadlocal_new_server_thread ();
-    if (handle_single_connection (0, 1) == -1)
-      exit (EXIT_FAILURE);
+    handle_single_connection (0, 1);
     return;
   }
 
--
2.23.0
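For reference, here is a minimal standalone sketch of the
status-coordination pattern that connection_get_status /
connection_set_status provide.  The struct and function names below
(struct conn, conn_get_status, conn_set_status, worker) are invented
for illustration; this is not nbdkit's actual code.

/* Illustration only: a shared per-connection status protected by a
 * mutex, used so that any thread can flag shutdown or an error and
 * the other threads serving the same connection will stop.
 */
#include <pthread.h>
#include <stdio.h>

struct conn {
  pthread_mutex_t lock;
  int status;                  /* 1 = running, 0 = closed, -1 = error */
};

static int
conn_get_status (struct conn *c)
{
  int r;

  pthread_mutex_lock (&c->lock);
  r = c->status;
  pthread_mutex_unlock (&c->lock);
  return r;
}

static void
conn_set_status (struct conn *c, int value)
{
  pthread_mutex_lock (&c->lock);
  if (value < c->status)       /* status only moves towards shutdown */
    c->status = value;
  pthread_mutex_unlock (&c->lock);
}

static void *
worker (void *arg)
{
  struct conn *c = arg;

  /* A worker stops processing requests as soon as any thread has
   * flagged shutdown or an error on the shared connection.
   */
  while (conn_get_status (c) > 0)
    conn_set_status (c, 0);    /* pretend the client closed the socket */

  return NULL;
}

int
main (void)
{
  struct conn c = { PTHREAD_MUTEX_INITIALIZER, 1 };
  pthread_t t;

  pthread_create (&t, NULL, worker, &c);
  pthread_join (t, NULL);
  printf ("final status: %d\n", conn_get_status (&c));
  return 0;
}

After this patch the status is still used internally in this way so
that workers stop cleanly; it is just no longer propagated to the
exit code of nbdkit -s.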
Richard W.M. Jones
2019-Nov-04 15:42 UTC
[Libguestfs] [PATCH nbdkit 2/3] server: Inline free_listening_sockets.
This function was only ever called straight after
accept_incoming_connections, so simply make
accept_incoming_connections free the sockets at quit time.
---
 server/internal.h |  2 --
 server/main.c     |  2 --
 server/sockets.c  | 16 ++++++----------
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/server/internal.h b/server/internal.h
index b28b1c8..4638727 100644
--- a/server/internal.h
+++ b/server/internal.h
@@ -474,8 +474,6 @@ extern int *bind_vsock (size_t *)
   __attribute__((__nonnull__ (1)));
 extern void accept_incoming_connections (int *socks, size_t nr_socks)
   __attribute__((__nonnull__ (1)));
-extern void free_listening_sockets (int *socks, size_t nr_socks)
-  __attribute__((__nonnull__ (1)));
 
 /* threadlocal.c */
 extern void threadlocal_init (void);
diff --git a/server/main.c b/server/main.c
index dcfdbde..8ff8e79 100644
--- a/server/main.c
+++ b/server/main.c
@@ -885,7 +885,6 @@ start_serving (void)
     change_user ();
     write_pidfile ();
     accept_incoming_connections (socks, nr_socks);
-    free_listening_sockets (socks, nr_socks); /* also closes them */
 
     return;
   }
@@ -913,7 +912,6 @@ start_serving (void)
   fork_into_background ();
   write_pidfile ();
   accept_incoming_connections (socks, nr_socks);
-  free_listening_sockets (socks, nr_socks);
 }
 
 static void
diff --git a/server/sockets.c b/server/sockets.c
index 119cb99..11bab6c 100644
--- a/server/sockets.c
+++ b/server/sockets.c
@@ -322,16 +322,6 @@ bind_vsock (size_t *nr_socks)
 #endif
 }
 
-void
-free_listening_sockets (int *socks, size_t nr_socks)
-{
-  size_t i;
-
-  for (i = 0; i < nr_socks; ++i)
-    close (socks[i]);
-  free (socks);
-}
-
 struct thread_data {
   int sock;
   size_t instance_num;
@@ -476,6 +466,12 @@ check_sockets_and_quit_fd (int *socks, size_t nr_socks)
 void
 accept_incoming_connections (int *socks, size_t nr_socks)
 {
+  size_t i;
+
   while (!quit)
     check_sockets_and_quit_fd (socks, nr_socks);
+
+  for (i = 0; i < nr_socks; ++i)
+    close (socks[i]);
+  free (socks);
 }
--
2.23.0
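The shape of the resulting loop, as a minimal standalone sketch: poll
the listening sockets together with a quit fd, then close and free the
sockets on the way out.  Everything below (accept_until_quit, the pipe
standing in for nbdkit's quit fd and for a listening socket) is
invented for illustration and is not nbdkit code.

/* Illustration only: an accept-style loop that polls listening
 * sockets together with a "quit" pipe, and once quit is requested
 * closes and frees the sockets itself -- the behaviour this patch
 * folds into accept_incoming_connections.
 */
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void
accept_until_quit (int *socks, size_t nr_socks, int quit_fd)
{
  struct pollfd fds[nr_socks + 1];
  size_t i;
  int quit = 0;

  while (!quit) {
    for (i = 0; i < nr_socks; ++i) {
      fds[i].fd = socks[i];
      fds[i].events = POLLIN;
    }
    fds[nr_socks].fd = quit_fd;
    fds[nr_socks].events = POLLIN;

    if (poll (fds, nr_socks + 1, -1) == -1) {
      perror ("poll");
      break;
    }
    if (fds[nr_socks].revents & POLLIN)
      quit = 1;                 /* the quit pipe became readable */
    /* else: accept(2) on whichever listening socket is ready ... */
  }

  /* The cleanup now lives here instead of in a separate helper. */
  for (i = 0; i < nr_socks; ++i)
    close (socks[i]);
  free (socks);
}

int
main (void)
{
  int pipefd[2];
  int *socks = malloc (sizeof (int));

  if (socks == NULL || pipe (pipefd) == -1)
    return 1;
  socks[0] = dup (pipefd[0]);   /* stand-in for a listening socket */

  if (write (pipefd[1], "q", 1) == -1)  /* request quit immediately */
    return 1;
  accept_until_quit (socks, 1, pipefd[0]);
  printf ("quit requested, sockets closed and freed\n");
  return 0;
}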
Richard W.M. Jones
2019-Nov-04 15:42 UTC
[Libguestfs] [PATCH nbdkit 3/3] server: Wait for connection threads to exit before unloading plugins.
A frequent cause of crashes on shutdown is the main thread and the
connection threads racing each other after the quit signal has been
received.  As in the typical stack trace below, the main thread
unloads the plugins while they are still being used by one of the
connection threads, causing the connection thread to segfault when it
calls into functions that have already been dlclose'd.

To avoid this, simply count the number of connection threads we
create, and at exit make the main thread wait until this count drops
to 0 before we actually start unloading anything.

Thread 2 (Thread 0x7f8fb054ea40 (LWP 3105233)):
#0  _asn1_set_down (down=0x5633dd26c010, node=0x5633dd26be70) at parser_aux.h:116
#1  asn1_delete_structure2 (structure=structure@entry=0x7f8fb0dd3fa0 <_gnutls_pkix1_asn>, flags=flags@entry=0) at structure.c:328
#2  0x00007f8fb06e702b in asn1_delete_structure (structure=structure@entry=0x7f8fb0dd3fa0 <_gnutls_pkix1_asn>) at structure.c:293
#3  0x00007f8fb0c66544 in _gnutls_global_deinit (destructor=1) at global.c:419
#4  0x00007f8fb0e0d13b in _dl_fini () at dl-fini.c:138
#5  0x00007f8fb0a0ae87 in __run_exit_handlers (status=0, listp=0x7f8fb0b8e578 <__exit_funcs>, run_list_atexit=run_list_atexit@entry=true, run_dtors=run_dtors@entry=true) at exit.c:108
#6  0x00007f8fb0a0b040 in __GI_exit (status=<optimized out>) at exit.c:139
#7  0x00005633d47626ee in main (argc=<optimized out>, argv=0x7ffecd6115d8) at main.c:705

Thread 1 (Thread 0x7f8fb054d700 (LWP 3105257)):
#0  backend_finalize (b=<optimized out>, conn=conn@entry=0x5633dd287c40) at backend.c:227
#1  0x00005633d476579f in handle_single_connection (sockin=<optimized out>, sockout=<optimized out>) at connections.c:222
#2  0x00005633d476e9e9 in start_thread (datav=0x5633dd269620) at sockets.c:351
#3  0x00007f8fb0b9e4e2 in start_thread (arg=<optimized out>) at pthread_create.c:479
#4  0x00007f8fb0acd643 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
---
 server/sockets.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/server/sockets.c b/server/sockets.c
index 11bab6c..3351286 100644
--- a/server/sockets.c
+++ b/server/sockets.c
@@ -322,6 +322,17 @@ bind_vsock (size_t *nr_socks)
 #endif
 }
 
+/* This counts the number of connection threads running (note: not the
+ * number of worker threads; each connection thread will start many
+ * independent worker threads in the current implementation).  The
+ * purpose of this is so we can wait for all the connection threads to
+ * exit before we return from accept_incoming_connections, so that
+ * unload-time actions happen with no connections open.
+ */
+static pthread_mutex_t count_mutex = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER;
+static unsigned count = 0;
+
 struct thread_data {
   int sock;
   size_t instance_num;
@@ -334,6 +345,10 @@ start_thread (void *datav)
 
   debug ("accepted connection");
 
+  pthread_mutex_lock (&count_mutex);
+  count++;
+  pthread_mutex_unlock (&count_mutex);
+
   /* Set thread-local data. */
   threadlocal_new_server_thread ();
   threadlocal_set_instance_num (data->instance_num);
@@ -341,6 +356,12 @@ start_thread (void *datav)
   handle_single_connection (data->sock, data->sock);
 
   free (data);
+
+  pthread_mutex_lock (&count_mutex);
+  count--;
+  pthread_cond_signal (&count_cond);
+  pthread_mutex_unlock (&count_mutex);
+
   return NULL;
 }
 
@@ -467,10 +488,24 @@ void
 accept_incoming_connections (int *socks, size_t nr_socks)
 {
   size_t i;
+  int err;
 
   while (!quit)
     check_sockets_and_quit_fd (socks, nr_socks);
 
+  /* Wait for all threads to exit. */
+  pthread_mutex_lock (&count_mutex);
+  for (;;) {
+    if (count == 0)
+      break;
+    err = pthread_cond_wait (&count_cond, &count_mutex);
+    if (err != 0) {
+      errno = err;
+      perror ("pthread_cond_wait");
+    }
+  }
+  pthread_mutex_unlock (&count_mutex);
+
   for (i = 0; i < nr_socks; ++i)
     close (socks[i]);
   free (socks);
--
2.23.0
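For reference, the same counting pattern as a minimal standalone
program.  All names here are invented for the example, and unlike the
patch it bumps the counter before pthread_create (rather than at the
top of the connection thread) so that this toy has no startup race.

/* Illustration only: a counter of live connection threads protected
 * by a mutex, with a condition variable the main thread waits on
 * until the counter drops to zero, after which it is safe to unload
 * plugins / run exit handlers.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t count_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER;
static unsigned count = 0;

static void *
connection_thread (void *arg)
{
  usleep (10000);                    /* stand-in for serving a connection */

  pthread_mutex_lock (&count_mutex);
  count--;
  pthread_cond_signal (&count_cond); /* wake the waiter on exit */
  pthread_mutex_unlock (&count_mutex);
  return NULL;
}

int
main (void)
{
  pthread_t tid;
  int i;

  for (i = 0; i < 4; ++i) {
    pthread_mutex_lock (&count_mutex);
    count++;
    pthread_mutex_unlock (&count_mutex);
    if (pthread_create (&tid, NULL, connection_thread, NULL) != 0)
      return 1;
    pthread_detach (tid);
  }

  /* Main thread at shutdown: wait for all connection threads to exit. */
  pthread_mutex_lock (&count_mutex);
  while (count != 0)
    pthread_cond_wait (&count_cond, &count_mutex);
  pthread_mutex_unlock (&count_mutex);

  printf ("all connection threads exited; safe to unload\n");
  return 0;
}

The condition variable only needs to be signalled on decrement, since
the waiting thread only cares about the count reaching zero.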