I've been getting deadlocks in xl, particularly "xl destroy". It turns
out the main thread is stuck in a pthread_join while holding all the
mutexes, while the xenstore reading thread is stuck in a
pthread_mutex_lock before it can get to a cancellation point and exit.
This looks like it is a very long-standing deadlock (the code in
question mostly dates back to 2005), but perhaps something has changed
that makes it more likely to happen. I think the original intention of
the code was to hold all the mutexes while doing the cancel/join to
avoid cancelling while the reader is holding any mutexes. This fails
when the reader loop is not holding any, but needs to take one before
getting to a cancellation point (pthread_mutex_lock is not itself a
cancellation point).
The following two patches address it by 1) making sure that the read
thread has sufficient pthread cleanup handlers to free any
allocated-but-unused memory and release the mutexes when cancelled, and
2) doing the pthread cancel/join while not holding any mutexes.
Thanks,
J
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2010-May-11 22:02 UTC
[Xen-devel] [PATCH 1 of 2] xs: make sure mutexes are cleaned up and memory freed if the read thread is cancelled
If the read thread is terminated with pthread_cancel, it must make sure
all memory is freed and mutexes are unlocked.
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
diff -r d77a88f938c6 tools/xenstore/xs.c
--- a/tools/xenstore/xs.c Tue May 11 14:05:28 2010 +0100
+++ b/tools/xenstore/xs.c Tue May 11 14:55:14 2010 -0700
@@ -85,6 +85,8 @@
#define mutex_unlock(m) pthread_mutex_unlock(m)
#define condvar_signal(c) pthread_cond_signal(c)
#define condvar_wait(c,m,hnd) pthread_cond_wait(c,m)
+#define cleanup_push(f, a) pthread_cleanup_push((void (*)(void *))(f), (void *)(a))
+#define cleanup_pop(run) pthread_cleanup_pop(run)
static void *read_thread(void *arg);
@@ -102,6 +104,8 @@
#define mutex_unlock(m) ((void)0)
#define condvar_signal(c) ((void)0)
#define condvar_wait(c,m,hnd) read_message(hnd)
+#define cleanup_push(f, a) ((void)0)
+#define cleanup_pop(run) ((void)0)
#endif
@@ -262,7 +266,6 @@
#ifdef USE_PTHREAD
if (h->read_thr_exists) {
- /* XXX FIXME: May leak an unpublished message buffer. */
pthread_cancel(h->read_thr);
pthread_join(h->read_thr, NULL);
}
@@ -860,44 +863,53 @@
{
struct xs_stored_msg *msg = NULL;
char *body = NULL;
- int saved_errno;
+ int saved_errno = 0;
+ int ret = -1;
/* Allocate message structure and read the message header. */
msg = malloc(sizeof(*msg));
if (msg == NULL)
goto error;
- if (!read_all(h->fd, &msg->hdr, sizeof(msg->hdr)))
- goto error;
+ cleanup_push(free, msg);
+ if (!read_all(h->fd, &msg->hdr, sizeof(msg->hdr))) { /* Cancellation point */
+ saved_errno = errno;
+ goto error_freemsg;
+ }
/* Allocate and read the message body. */
body = msg->body = malloc(msg->hdr.len + 1);
if (body == NULL)
- goto error;
- if (!read_all(h->fd, body, msg->hdr.len))
- goto error;
+ goto error_freemsg;
+ cleanup_push(free, body);
+ if (!read_all(h->fd, body, msg->hdr.len)) { /* Cancellation point */
+ saved_errno = errno;
+ goto error_freebody;
+ }
+
body[msg->hdr.len] = '\0';
if (msg->hdr.type == XS_WATCH_EVENT) {
mutex_lock(&h->watch_mutex);
+ cleanup_push(pthread_mutex_unlock, &h->watch_mutex);
/* Kick users out of their select() loop. */
if (list_empty(&h->watch_list) &&
(h->watch_pipe[1] != -1))
- while (write(h->watch_pipe[1], body, 1) != 1)
+ while (write(h->watch_pipe[1], body, 1) != 1) /* Cancellation point */
continue;
list_add_tail(&msg->list, &h->watch_list);
condvar_signal(&h->watch_condvar);
- mutex_unlock(&h->watch_mutex);
+ cleanup_pop(1);
} else {
mutex_lock(&h->reply_mutex);
/* There should only ever be one response pending! */
if (!list_empty(&h->reply_list)) {
mutex_unlock(&h->reply_mutex);
- goto error;
+ goto error_freebody;
}
list_add_tail(&msg->list, &h->reply_list);
@@ -906,14 +918,16 @@
mutex_unlock(&h->reply_mutex);
}
- return 0;
+ ret = 0;
- error:
- saved_errno = errno;
- free(msg);
- free(body);
+error_freebody:
+ cleanup_pop(ret == -1);
+error_freemsg:
+ cleanup_pop(ret == -1);
+error:
errno = saved_errno;
- return -1;
+
+ return ret;
}
#ifdef USE_PTHREAD
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2010-May-11 22:03 UTC
[Xen-devel] [PATCH 2 of 2] xs: avoid pthread_join deadlock in xs_daemon_close
Doing a pthread_cancel and join on the reader thread while holding all
the request/reply/watch mutexes can deadlock if the thread needs to
take any of those mutexes to exit. Kill off the reader thread before
taking any mutexes (which should be redundant if we're single-threaded
at that point).
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
diff -r 84ee0559ddc1 tools/xenstore/xs.c
--- a/tools/xenstore/xs.c Tue May 11 14:55:14 2010 -0700
+++ b/tools/xenstore/xs.c Tue May 11 14:55:20 2010 -0700
@@ -260,10 +260,6 @@
void xs_daemon_close(struct xs_handle *h)
{
- mutex_lock(&h->request_mutex);
- mutex_lock(&h->reply_mutex);
- mutex_lock(&h->watch_mutex);
-
#ifdef USE_PTHREAD
if (h->read_thr_exists) {
pthread_cancel(h->read_thr);
@@ -271,6 +267,10 @@
}
#endif
+ mutex_lock(&h->request_mutex);
+ mutex_lock(&h->reply_mutex);
+ mutex_lock(&h->watch_mutex);
+
close_free_msgs(h);
mutex_unlock(&h->request_mutex);
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel