The original implementation utilies select(). In Linux select() typically
supports up to 1024 file descriptors. This can be a problem when user tries to
boot up many guests. Switching to poll() has minimum impact on existing code
and has better scalibility.
Up to 8192 file descriptors are supported in the current implementation.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 tools/console/daemon/io.c |   90 +++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 40 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..4e3c55c 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
-#include <sys/select.h>
+#include <poll.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <termios.h>
@@ -930,7 +930,6 @@ static void handle_log_reload(void)
 
 void handle_io(void)
 {
-	fd_set readfds, writefds;
 	int ret;
 
 	if (log_hv) {
@@ -959,21 +958,33 @@ void handle_io(void)
 
 	for (;;) {
 		struct domain *d, *n;
-		int max_fd = -1;
-		struct timeval timeout;
+		int poll_timeout; /* timeout in milliseconds */
 		struct timespec ts;
 		long long now, next_timeout = 0;
 
-		FD_ZERO(&readfds);
-		FD_ZERO(&writefds);
-
-		FD_SET(xs_fileno(xs), &readfds);
-		max_fd = MAX(xs_fileno(xs), max_fd);
-
-		if (log_hv) {
-			FD_SET(xc_evtchn_fd(xce_handle), &readfds);
-			max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
-		}
+#define MAX_POLL_FDS 8192
+		static struct pollfd fds[MAX_POLL_FDS];
+		static struct pollfd *fd_to_pollfd[MAX_POLL_FDS];
+		int nr_fds;
+#define SET_FDS(_fd, _events) do {				\
+			if (_fd >= MAX_POLL_FDS)		\
+				break;				\
+			fds[nr_fds].fd = (_fd);			\
+			fds[nr_fds].events = (_events);		\
+			fd_to_pollfd[(_fd)] = &fds[nr_fds];	\
+			nr_fds++;				\
+		} while (0)
+#define FD_REVENTS(_fd) (((_fd) < MAX_POLL_FDS &&
fd_to_pollfd[(_fd)]) ? \
+			 fd_to_pollfd[(_fd)]->revents : 0)
+
+		nr_fds = 0;
+		memset(fds, 0, sizeof(fds));
+		memset(fd_to_pollfd, 0, sizeof(fd_to_pollfd));
+
+		SET_FDS(xs_fileno(xs), POLLIN);
+
+		if (log_hv)
+			SET_FDS(xc_evtchn_fd(xce_handle), POLLIN);
 
 		if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
 			return;
@@ -982,11 +993,7 @@ void handle_io(void)
 		/* Re-calculate any event counter allowances & unblock
 		   domains with new allowance */
 		for (d = dom_head; d; d = d->next) {
-			/* Add 5ms of fuzz since select() often returns
-			   a couple of ms sooner than requested. Without
-			   the fuzz we typically do an extra spin in select()
-			   with a 1/2 ms timeout every other iteration */
-			if ((now+5) > d->next_period) {
+			if (now > d->next_period) {
 				d->next_period = now + RATE_LIMIT_PERIOD;
 				if (d->event_count >= RATE_LIMIT_ALLOWANCE) {
 					(void)xc_evtchn_unmask(d->xce_handle, d->local_port);
@@ -1006,74 +1013,73 @@ void handle_io(void)
 				    !d->buffer.max_capacity ||
 				    d->buffer.size < d->buffer.max_capacity) {
 					int evtchn_fd = xc_evtchn_fd(d->xce_handle);
-					FD_SET(evtchn_fd, &readfds);
-					max_fd = MAX(evtchn_fd, max_fd);
+					SET_FDS(evtchn_fd, POLLIN);
 				}
 			}
 
 			if (d->master_fd != -1) {
+				short events = 0;
 				if (!d->is_dead && ring_free_bytes(d))
-					FD_SET(d->master_fd, &readfds);
+					events |= POLLIN;
 
 				if (!buffer_empty(&d->buffer))
-					FD_SET(d->master_fd, &writefds);
-				max_fd = MAX(d->master_fd, max_fd);
+					events |= POLLOUT;
+
+				if (events)
+					SET_FDS(d->master_fd, events);
 			}
 		}
 
 		/* If any domain has been rate limited, we need to work
-		   out what timeout to supply to select */
+		   out what timeout to supply to poll */
 		if (next_timeout) {
 			long long duration = (next_timeout - now);
 			if (duration <= 0) /* sanity check */
 				duration = 1;
-			timeout.tv_sec = duration / 1000;
-			timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
-					   * 1000);
+			poll_timeout = (int)duration;
 		}
 
-		ret = select(max_fd + 1, &readfds, &writefds, 0,
-			     next_timeout ? &timeout : NULL);
+		ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
 
 		if (log_reload) {
 			handle_log_reload();
 			log_reload = 0;
 		}
 
-		/* Abort if select failed, except for EINTR cases
+		/* Abort if poll failed, except for EINTR cases
 		   which indicate a possible log reload */
 		if (ret == -1) {
 			if (errno == EINTR)
 				continue;
-			dolog(LOG_ERR, "Failure in select: %d (%s)",
+			dolog(LOG_ERR, "Failure in poll: %d (%s)",
 			      errno, strerror(errno));
 			break;
 		}
 
-		if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds))
+		if (log_hv && FD_REVENTS(xc_evtchn_fd(xce_handle)) & POLLIN)
 			handle_hv_logs();
 
 		if (ret <= 0)
 			continue;
 
-		if (FD_ISSET(xs_fileno(xs), &readfds))
+		if (FD_REVENTS(xs_fileno(xs)) & POLLIN)
 			handle_xs();
 
 		for (d = dom_head; d; d = n) {
 			n = d->next;
 			if (d->event_count < RATE_LIMIT_ALLOWANCE) {
 				if (d->xce_handle != NULL &&
-				    FD_ISSET(xc_evtchn_fd(d->xce_handle),
-					     &readfds))
+				    FD_REVENTS(xc_evtchn_fd(d->xce_handle)) &
+				    POLLIN)
 					handle_ring_read(d);
 			}
 
-			if (d->master_fd != -1 && FD_ISSET(d->master_fd,
-							   &readfds))
+			if (d->master_fd != -1 &&
+			    FD_REVENTS(d->master_fd) & POLLIN)
 				handle_tty_read(d);
 
-			if (d->master_fd != -1 && FD_ISSET(d->master_fd,
-							   &writefds))
+			if (d->master_fd != -1 &&
+			    FD_REVENTS(d->master_fd) & POLLOUT)
 				handle_tty_write(d);
 
 			if (d->last_seen != enum_pass)
@@ -1082,6 +1088,10 @@ void handle_io(void)
 			if (d->is_dead)
 				cleanup_domain(d);
 		}
+
+#undef MAX_POLL_FDS
+#undef SET_FDS
+#undef FD_REVENTS
 	}
 
  out:
-- 
1.7.10.4
Mats Petersson
2013-Jan-03  18:22 UTC
Re: [PATCH] Switch to poll in xenconsoled''s io loop.
On 03/01/13 17:14, Wei Liu wrote:> The original implementation utilies select(). In Linux select() typically > supports up to 1024 file descriptors. This can be a problem when user tries to > boot up many guests. Switching to poll() has minimum impact on existing code > and has better scalibility. > > Up to 8192 file descriptors are supported in the current implementation. > > Signed-off-by: Wei Liu <wei.liu2@citrix.com> > --- > tools/console/daemon/io.c | 90 +++++++++++++++++++++++++-------------------- > 1 file changed, 50 insertions(+), 40 deletions(-) > > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c > index 48fe151..4e3c55c 100644 > --- a/tools/console/daemon/io.c > +++ b/tools/console/daemon/io.c > @@ -28,7 +28,7 @@ > #include <stdlib.h> > #include <errno.h> > #include <string.h> > -#include <sys/select.h> > +#include <poll.h> > #include <fcntl.h> > #include <unistd.h> > #include <termios.h> > @@ -930,7 +930,6 @@ static void handle_log_reload(void) > > void handle_io(void) > { > - fd_set readfds, writefds; > int ret; > > if (log_hv) { > @@ -959,21 +958,33 @@ void handle_io(void) > > for (;;) { > struct domain *d, *n; > - int max_fd = -1; > - struct timeval timeout; > + int poll_timeout; /* timeout in milliseconds */ > struct timespec ts; > long long now, next_timeout = 0; > > - FD_ZERO(&readfds); > - FD_ZERO(&writefds); > - > - FD_SET(xs_fileno(xs), &readfds); > - max_fd = MAX(xs_fileno(xs), max_fd); > - > - if (log_hv) { > - FD_SET(xc_evtchn_fd(xce_handle), &readfds); > - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd); > - } > +#define MAX_POLL_FDS 8192 > + static struct pollfd fds[MAX_POLL_FDS]; > + static struct pollfd *fd_to_pollfd[MAX_POLL_FDS]; > + int nr_fds; > +#define SET_FDS(_fd, _events) do { \ > + if (_fd >= MAX_POLL_FDS) \ > + break; \ > + fds[nr_fds].fd = (_fd); \ > + fds[nr_fds].events = (_events); \ > + fd_to_pollfd[(_fd)] = &fds[nr_fds]; \ > + nr_fds++; \ > + } while (0) > +#define FD_REVENTS(_fd) (((_fd) < MAX_POLL_FDS && fd_to_pollfd[(_fd)]) ? \ > + fd_to_pollfd[(_fd)]->revents : 0) > + > + nr_fds = 0; > + memset(fds, 0, sizeof(fds)); > + memset(fd_to_pollfd, 0, sizeof(fd_to_pollfd)); > + > + SET_FDS(xs_fileno(xs), POLLIN); > + > + if (log_hv) > + SET_FDS(xc_evtchn_fd(xce_handle), POLLIN); >Would it not make sense to use dynamically allocated memory instead - that way, when we run out of 8192, there is nothing to change. At the very least, can we have some sort of error output when it goes above the limit, so that we don''t just silently "miss" a few consoles out from the poll. -- Mats> if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) > return; > @@ -982,11 +993,7 @@ void handle_io(void) > /* Re-calculate any event counter allowances & unblock > domains with new allowance */ > for (d = dom_head; d; d = d->next) { > - /* Add 5ms of fuzz since select() often returns > - a couple of ms sooner than requested. Without > - the fuzz we typically do an extra spin in select() > - with a 1/2 ms timeout every other iteration */ > - if ((now+5) > d->next_period) { > + if (now > d->next_period) { > d->next_period = now + RATE_LIMIT_PERIOD; > if (d->event_count >= RATE_LIMIT_ALLOWANCE) { > (void)xc_evtchn_unmask(d->xce_handle, d->local_port); > @@ -1006,74 +1013,73 @@ void handle_io(void) > !d->buffer.max_capacity || > d->buffer.size < d->buffer.max_capacity) { > int evtchn_fd = xc_evtchn_fd(d->xce_handle); > - FD_SET(evtchn_fd, &readfds); > - max_fd = MAX(evtchn_fd, max_fd); > + SET_FDS(evtchn_fd, POLLIN); > } > } > > if (d->master_fd != -1) { > + short events = 0; > if (!d->is_dead && ring_free_bytes(d)) > - FD_SET(d->master_fd, &readfds); > + events |= POLLIN; > > if (!buffer_empty(&d->buffer)) > - FD_SET(d->master_fd, &writefds); > - max_fd = MAX(d->master_fd, max_fd); > + events |= POLLOUT; > + > + if (events) > + SET_FDS(d->master_fd, events); > } > } > > /* If any domain has been rate limited, we need to work > - out what timeout to supply to select */ > + out what timeout to supply to poll */ > if (next_timeout) { > long long duration = (next_timeout - now); > if (duration <= 0) /* sanity check */ > duration = 1; > - timeout.tv_sec = duration / 1000; > - timeout.tv_usec = ((duration - (timeout.tv_sec * 1000)) > - * 1000); > + poll_timeout = (int)duration; > } > > - ret = select(max_fd + 1, &readfds, &writefds, 0, > - next_timeout ? &timeout : NULL); > + ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1); > > if (log_reload) { > handle_log_reload(); > log_reload = 0; > } > > - /* Abort if select failed, except for EINTR cases > + /* Abort if poll failed, except for EINTR cases > which indicate a possible log reload */ > if (ret == -1) { > if (errno == EINTR) > continue; > - dolog(LOG_ERR, "Failure in select: %d (%s)", > + dolog(LOG_ERR, "Failure in poll: %d (%s)", > errno, strerror(errno)); > break; > } > > - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds)) > + if (log_hv && FD_REVENTS(xc_evtchn_fd(xce_handle)) & POLLIN) > handle_hv_logs(); > > if (ret <= 0) > continue; > > - if (FD_ISSET(xs_fileno(xs), &readfds)) > + if (FD_REVENTS(xs_fileno(xs)) & POLLIN) > handle_xs(); > > for (d = dom_head; d; d = n) { > n = d->next; > if (d->event_count < RATE_LIMIT_ALLOWANCE) { > if (d->xce_handle != NULL && > - FD_ISSET(xc_evtchn_fd(d->xce_handle), > - &readfds)) > + FD_REVENTS(xc_evtchn_fd(d->xce_handle)) & > + POLLIN) > handle_ring_read(d); > } > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &readfds)) > + if (d->master_fd != -1 && > + FD_REVENTS(d->master_fd) & POLLIN) > handle_tty_read(d); > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &writefds)) > + if (d->master_fd != -1 && > + FD_REVENTS(d->master_fd) & POLLOUT) > handle_tty_write(d); > > if (d->last_seen != enum_pass) > @@ -1082,6 +1088,10 @@ void handle_io(void) > if (d->is_dead) > cleanup_domain(d); > } > + > +#undef MAX_POLL_FDS > +#undef SET_FDS > +#undef FD_REVENTS > } > > out:
On Thu, 2013-01-03 at 18:22 +0000, Mats Petersson wrote:> On 03/01/13 17:14, Wei Liu wrote: > > The original implementation utilies select(). In Linux select() typically > > supports up to 1024 file descriptors. This can be a problem when user tries to > > boot up many guests. Switching to poll() has minimum impact on existing code > > and has better scalibility. > > > > Up to 8192 file descriptors are supported in the current implementation. > > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com> > > --- > > tools/console/daemon/io.c | 90 +++++++++++++++++++++++++-------------------- > > 1 file changed, 50 insertions(+), 40 deletions(-) > > > > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c > > index 48fe151..4e3c55c 100644 > > --- a/tools/console/daemon/io.c > > +++ b/tools/console/daemon/io.c > > @@ -28,7 +28,7 @@ > > #include <stdlib.h> > > #include <errno.h> > > #include <string.h> > > -#include <sys/select.h> > > +#include <poll.h> > > #include <fcntl.h> > > #include <unistd.h> > > #include <termios.h> > > @@ -930,7 +930,6 @@ static void handle_log_reload(void) > > > > void handle_io(void) > > { > > - fd_set readfds, writefds; > > int ret; > > > > if (log_hv) { > > @@ -959,21 +958,33 @@ void handle_io(void) > > > > for (;;) { > > struct domain *d, *n; > > - int max_fd = -1; > > - struct timeval timeout; > > + int poll_timeout; /* timeout in milliseconds */ > > struct timespec ts; > > long long now, next_timeout = 0; > > > > - FD_ZERO(&readfds); > > - FD_ZERO(&writefds); > > - > > - FD_SET(xs_fileno(xs), &readfds); > > - max_fd = MAX(xs_fileno(xs), max_fd); > > - > > - if (log_hv) { > > - FD_SET(xc_evtchn_fd(xce_handle), &readfds); > > - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd); > > - } > > +#define MAX_POLL_FDS 8192 > > + static struct pollfd fds[MAX_POLL_FDS]; > > + static struct pollfd *fd_to_pollfd[MAX_POLL_FDS]; > > + int nr_fds; > > +#define SET_FDS(_fd, _events) do { \ > > + if (_fd >= MAX_POLL_FDS) \ > > + break; \ > > + fds[nr_fds].fd = (_fd); \ > > + fds[nr_fds].events = (_events); \ > > + fd_to_pollfd[(_fd)] = &fds[nr_fds]; \ > > + nr_fds++; \ > > + } while (0) > > +#define FD_REVENTS(_fd) (((_fd) < MAX_POLL_FDS && fd_to_pollfd[(_fd)]) ? \ > > + fd_to_pollfd[(_fd)]->revents : 0) > > + > > + nr_fds = 0; > > + memset(fds, 0, sizeof(fds)); > > + memset(fd_to_pollfd, 0, sizeof(fd_to_pollfd)); > > + > > + SET_FDS(xs_fileno(xs), POLLIN); > > + > > + if (log_hv) > > + SET_FDS(xc_evtchn_fd(xce_handle), POLLIN); > > > Would it not make sense to use dynamically allocated memory instead - > that way, when we run out of 8192, there is nothing to change.Writing a new version to use dynamically allocated memory. Wei.
Wei Liu
2013-Jan-04  15:58 UTC
[PATCH V2] Switch from select() to poll() in xenconsoled''s IO loop.
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
Tracking arrays are dynamically allocated / reallocated. If the tracking
arrays fail to expand, we just ignore the incoming fd.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 tools/console/daemon/io.c |  159 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 121 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..830fc18 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
-#include <sys/select.h>
+#include <poll.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <termios.h>
@@ -928,9 +928,98 @@ static void handle_log_reload(void)
        }
 }
 
+
+/* Should have at least max_fd + 1 elements */
+#define DEFAULT_ARRAY_SIZE 1024
+#define GROWTH_LENGTH      512
+static struct pollfd  *fds;
+static struct pollfd **fd_to_pollfd;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+static int initialize_pollfd_arrays(void)
+{
+       fds = (struct pollfd *)
+               malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
+       if (!fds)
+               goto fail;
+       fd_to_pollfd = (struct pollfd **)
+               malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
+       if (!fd_to_pollfd)
+               goto fail;
+       memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE);
+       memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);
+       current_array_size = DEFAULT_ARRAY_SIZE;
+       return 0;
+fail:
+       free(fds);
+       free(fd_to_pollfd);
+       return -ENOMEM;
+}
+
+static void destroy_pollfd_arrays(void)
+{
+       free(fds);
+       free(fd_to_pollfd);
+       current_array_size = 0;
+}
+
+static void set_fds(int fd, short events)
+{
+       if (current_array_size < fd+1) {
+               struct pollfd  *p1 = NULL;
+               struct pollfd **p2 = NULL;
+               unsigned int newsize = current_array_size;
+
+               do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);
+
+               p1 = realloc(fds, sizeof(struct pollfd)*newsize);
+               if (!p1)
+                       goto fail;
+               fds = p1;
+
+               p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
+               if (!p2)
+                       goto fail;
+               fd_to_pollfd = p2;
+
+               memset(&fds[0] + current_array_size, 0,
+                      sizeof(struct pollfd) * (newsize-current_array_size));
+               memset(&fd_to_pollfd[0] + current_array_size, 0,
+                      sizeof(struct pollfd *) * (newsize-current_array_size));
+               current_array_size = newsize;
+       }
+
+       fds[nr_fds].fd = fd;
+       fds[nr_fds].events = events;
+       fd_to_pollfd[fd] = &fds[nr_fds];
+       nr_fds++;
+
+       return;
+fail:
+       dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+       return;
+}
+
+static short fd_revents(int fd)
+{
+       if (fd >= current_array_size)
+               return 0;
+       if (fd_to_pollfd[fd] == NULL)
+               return 0;
+       return fd_to_pollfd[fd]->revents;
+}
+
+static void reset_fds(void)
+{
+       nr_fds = 0;
+       memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+       memset(fd_to_pollfd, 0,
+              sizeof(struct pollfd *) * current_array_size);
+}
+
 void handle_io(void)
 {
-       fd_set readfds, writefds;
        int ret;
 
        if (log_hv) {
@@ -957,23 +1046,21 @@ void handle_io(void)
                }
        }
 
+       if (initialize_pollfd_arrays())
+               goto out;
+
        for (;;) {
                struct domain *d, *n;
-               int max_fd = -1;
-               struct timeval timeout;
+               int poll_timeout; /* timeout in milliseconds */
                struct timespec ts;
                long long now, next_timeout = 0;
 
-               FD_ZERO(&readfds);
-               FD_ZERO(&writefds);
+               reset_fds();
 
-               FD_SET(xs_fileno(xs), &readfds);
-               max_fd = MAX(xs_fileno(xs), max_fd);
+               set_fds(xs_fileno(xs), POLLIN);
 
-               if (log_hv) {
-                       FD_SET(xc_evtchn_fd(xce_handle), &readfds);
-                       max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
-               }
+               if (log_hv)
+                       set_fds(xc_evtchn_fd(xce_handle), POLLIN);
 
                if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
                        return;
@@ -982,11 +1069,7 @@ void handle_io(void)
                /* Re-calculate any event counter allowances & unblock
                   domains with new allowance */
                for (d = dom_head; d; d = d->next) {
-                       /* Add 5ms of fuzz since select() often returns
-                          a couple of ms sooner than requested. Without
-                          the fuzz we typically do an extra spin in select()
-                          with a 1/2 ms timeout every other iteration */
-                       if ((now+5) > d->next_period) {
+                       if (now > d->next_period) {
                                d->next_period = now + RATE_LIMIT_PERIOD;
                                if (d->event_count >=
RATE_LIMIT_ALLOWANCE) {
                                        (void)xc_evtchn_unmask(d->xce_handle,
d->local_port);
@@ -1006,74 +1089,73 @@ void handle_io(void)
                                    !d->buffer.max_capacity ||
                                    d->buffer.size <
d->buffer.max_capacity) {
                                        int evtchn_fd =
xc_evtchn_fd(d->xce_handle);
-                                       FD_SET(evtchn_fd, &readfds);
-                                       max_fd = MAX(evtchn_fd, max_fd);
+                                       set_fds(evtchn_fd, POLLIN);
                                }
                        }
 
                        if (d->master_fd != -1) {
+                               short events = 0;
                                if (!d->is_dead &&
ring_free_bytes(d))
-                                       FD_SET(d->master_fd, &readfds);
+                                       events |= POLLIN;
 
                                if (!buffer_empty(&d->buffer))
-                                       FD_SET(d->master_fd, &writefds);
-                               max_fd = MAX(d->master_fd, max_fd);
+                                       events |= POLLOUT;
+
+                               if (events)
+                                       set_fds(d->master_fd, events);
                        }
                }
 
                /* If any domain has been rate limited, we need to work
-                  out what timeout to supply to select */
+                  out what timeout to supply to poll */
                if (next_timeout) {
                        long long duration = (next_timeout - now);
                        if (duration <= 0) /* sanity check */
                                duration = 1;
-                       timeout.tv_sec = duration / 1000;
-                       timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
-                                          * 1000);
+                       poll_timeout = (int)duration;
                }
 
-               ret = select(max_fd + 1, &readfds, &writefds, 0,
-                            next_timeout ? &timeout : NULL);
+               ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
 
                if (log_reload) {
                        handle_log_reload();
                        log_reload = 0;
                }
 
-               /* Abort if select failed, except for EINTR cases
+               /* Abort if poll failed, except for EINTR cases
                   which indicate a possible log reload */
                if (ret == -1) {
                        if (errno == EINTR)
                                continue;
-                       dolog(LOG_ERR, "Failure in select: %d (%s)",
+                       dolog(LOG_ERR, "Failure in poll: %d (%s)",
                              errno, strerror(errno));
                        break;
                }
 
-               if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle),
&readfds))
+               if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) &
POLLIN)
                        handle_hv_logs();
 
                if (ret <= 0)
                        continue;
 
-               if (FD_ISSET(xs_fileno(xs), &readfds))
+               if (fd_revents(xs_fileno(xs)) & POLLIN)
                        handle_xs();
 
                for (d = dom_head; d; d = n) {
                        n = d->next;
                        if (d->event_count < RATE_LIMIT_ALLOWANCE) {
                                if (d->xce_handle != NULL &&
-                                   FD_ISSET(xc_evtchn_fd(d->xce_handle),
-                                            &readfds))
+                                   fd_revents(xc_evtchn_fd(d->xce_handle))
&
+                                   POLLIN)
                                        handle_ring_read(d);
                        }
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &readfds))
+                       if (d->master_fd != -1 &&
+                           fd_revents(d->master_fd) & POLLIN)
                                handle_tty_read(d);
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &writefds))
+                       if (d->master_fd != -1 &&
+                           fd_revents(d->master_fd) & POLLOUT)
                                handle_tty_write(d);
 
                        if (d->last_seen != enum_pass)
@@ -1084,6 +1166,7 @@ void handle_io(void)
                }
        }
 
+       destroy_pollfd_arrays();
  out:
        if (log_hv_fd != -1) {
                close(log_hv_fd);
-- 
1.7.10.4
Ian Campbell
2013-Jan-04  16:08 UTC
Re: [PATCH V2] Switch from select() to poll() in xenconsoled''s IO loop.
> +static int initialize_pollfd_arrays(void) > +{ > + fds = (struct pollfd *) > + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); > + if (!fds) > + goto fail; > + fd_to_pollfd = (struct pollfd **) > + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE); > + if (!fd_to_pollfd) > + goto fail; > + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); > + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE); > + current_array_size = DEFAULT_ARRAY_SIZE; > + return 0; > +fail: > + free(fds); > + free(fd_to_pollfd); > + return -ENOMEM; > +} > + > +static void destroy_pollfd_arrays(void) > +{ > + free(fds); > + free(fd_to_pollfd); > + current_array_size = 0; > +} > + > +static void set_fds(int fd, short events) > +{ > + if (current_array_size < fd+1) { > + struct pollfd *p1 = NULL; > + struct pollfd **p2 = NULL; > + unsigned int newsize = current_array_size; > + > + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1);Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c.> + > + p1 = realloc(fds, sizeof(struct pollfd)*newsize); > + if (!p1) > + goto fail; > + fds = p1; > + > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);realloc(NULL, ...) is the same as malloc() so I think you can initialise current_array_size to 0 and the various pointers to NULL and avoid the need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on the first use here.> for (;;) { > struct domain *d, *n; > - int max_fd = -1; > - struct timeval timeout; > + int poll_timeout; /* timeout in milliseconds */ > struct timespec ts; > long long now, next_timeout = 0; > > - FD_ZERO(&readfds); > - FD_ZERO(&writefds); > + reset_fds(); > > - FD_SET(xs_fileno(xs), &readfds); > - max_fd = MAX(xs_fileno(xs), max_fd); > + set_fds(xs_fileno(xs), POLLIN);Do you know, or can you track, the maximum fd over time? If you can then you could likely make use of automatic stack allocations (struct pollfd fds[max_fd]) and therefore avoid the pain of manual memory management. Not sure what the semantics of those are inside a for loop where max_fd can change but worst case you could put the content of the loop into a function. Ian.
Wei Liu
2013-Jan-04  16:38 UTC
Re: [PATCH V2] Switch from select() to poll() in xenconsoled''s IO loop.
On Fri, 2013-01-04 at 16:08 +0000, Ian Campbell wrote:> > +static int initialize_pollfd_arrays(void) > > +{ > > + fds = (struct pollfd *) > > + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); > > + if (!fds) > > + goto fail; > > + fd_to_pollfd = (struct pollfd **) > > + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE); > > + if (!fd_to_pollfd) > > + goto fail; > > + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); > > + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE); > > + current_array_size = DEFAULT_ARRAY_SIZE; > > + return 0; > > +fail: > > + free(fds); > > + free(fd_to_pollfd); > > + return -ENOMEM; > > +} > > + > > +static void destroy_pollfd_arrays(void) > > +{ > > + free(fds); > > + free(fd_to_pollfd); > > + current_array_size = 0; > > +} > > + > > +static void set_fds(int fd, short events) > > +{ > > + if (current_array_size < fd+1) { > > + struct pollfd *p1 = NULL; > > + struct pollfd **p2 = NULL; > > + unsigned int newsize = current_array_size; > > + > > + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1); > > Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c. > > + > > + p1 = realloc(fds, sizeof(struct pollfd)*newsize); > > + if (!p1) > > + goto fail; > > + fds = p1; > > + > > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize); > > realloc(NULL, ...) is the same as malloc() so I think you can initialise > current_array_size to 0 and the various pointers to NULL and avoid the > need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on > the first use here. >Huh, good idea.> > for (;;) { > > struct domain *d, *n; > > - int max_fd = -1; > > - struct timeval timeout; > > + int poll_timeout; /* timeout in milliseconds */ > > struct timespec ts; > > long long now, next_timeout = 0; > > > > - FD_ZERO(&readfds); > > - FD_ZERO(&writefds); > > + reset_fds(); > > > > - FD_SET(xs_fileno(xs), &readfds); > > - max_fd = MAX(xs_fileno(xs), max_fd); > > + set_fds(xs_fileno(xs), POLLIN); > > Do you know, or can you track, the maximum fd over time? > If you can then you could likely make use of automatic stack allocations > (struct pollfd fds[max_fd]) and therefore avoid the pain of manual > memory management.Stack size is subject to system setting. Typically it is limited to 8MB in Linux as shown by `ulimit -s`. This is actually very small compared to heap space. Of course we can make xenconsoled special among all the processes... But that''s not ideal. Wei.> Not sure what the semantics of those are inside a for loop where max_fd > can change but worst case you could put the content of the loop into a > function. > > Ian. >
Mats Petersson
2013-Jan-04  16:51 UTC
Re: [PATCH V2] Switch from select() to poll() in xenconsoled''s IO loop.
On 04/01/13 16:38, Wei Liu wrote:> On Fri, 2013-01-04 at 16:08 +0000, Ian Campbell wrote: >>> +static int initialize_pollfd_arrays(void) >>> +{ >>> + fds = (struct pollfd *) >>> + malloc(sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); >>> + if (!fds) >>> + goto fail; >>> + fd_to_pollfd = (struct pollfd **) >>> + malloc(sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE);I believe it''s considered "bad" to cast the result of malloc... Unless you are using a C++ compiler to compile C - in which case it''s necessary. But then it should really be converted to "new" and "delete" where relevant. [Although that does cause problems with realloc!] I do notice you are not casting realloc, so I expect it''s safe to remove the casts here too. Of course, if you remove these calls to malloc and just use realloc in the first place, you can ignore this comment! ;)>>> + if (!fd_to_pollfd) >>> + goto fail; >>> + memset(fds, 0, sizeof(struct pollfd) * DEFAULT_ARRAY_SIZE); >>> + memset(fd_to_pollfd, 0, sizeof(struct pollfd *) * DEFAULT_ARRAY_SIZE); >>> + current_array_size = DEFAULT_ARRAY_SIZE; >>> + return 0; >>> +fail: >>> + free(fds); >>> + free(fd_to_pollfd); >>> + return -ENOMEM; >>> +} >>> + >>> +static void destroy_pollfd_arrays(void) >>> +{ >>> + free(fds); >>> + free(fd_to_pollfd); >>> + current_array_size = 0; >>> +} >>> + >>> +static void set_fds(int fd, short events) >>> +{ >>> + if (current_array_size < fd+1) { >>> + struct pollfd *p1 = NULL; >>> + struct pollfd **p2 = NULL; >>> + unsigned int newsize = current_array_size; >>> + >>> + do { newsize += GROWTH_LENGTH; } while (newsize < fd+1); >> Steal #define ROUNDUP from tools/libxc/xc_linux_osdep.c. >>> + >>> + p1 = realloc(fds, sizeof(struct pollfd)*newsize); >>> + if (!p1) >>> + goto fail; >>> + fds = p1; >>> + >>> + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize); >> realloc(NULL, ...) is the same as malloc() so I think you can initialise >> current_array_size to 0 and the various pointers to NULL and avoid the >> need for initialize_pollfd_arrays -- i.e. it will just grow from 0 on >> the first use here. >> > Huh, good idea. > >>> for (;;) { >>> struct domain *d, *n; >>> - int max_fd = -1; >>> - struct timeval timeout; >>> + int poll_timeout; /* timeout in milliseconds */ >>> struct timespec ts; >>> long long now, next_timeout = 0; >>> >>> - FD_ZERO(&readfds); >>> - FD_ZERO(&writefds); >>> + reset_fds(); >>> >>> - FD_SET(xs_fileno(xs), &readfds); >>> - max_fd = MAX(xs_fileno(xs), max_fd); >>> + set_fds(xs_fileno(xs), POLLIN); >> Do you know, or can you track, the maximum fd over time? >> If you can then you could likely make use of automatic stack allocations >> (struct pollfd fds[max_fd]) and therefore avoid the pain of manual >> memory management. > Stack size is subject to system setting. Typically it is limited to 8MB > in Linux as shown by `ulimit -s`. This is actually very small compared > to heap space.Not to mention that if you run out of heapspace, you can "do something about it" (even if it''s just print an error message and exit, it''s better than "Stack overflow crash with no message at all"). I prefer allocations for that reason. On the other hand, the number of fds we can fit in 8MB (or 4MB) is probably more than we''d ever get from running many guests with consoles.... And it wouldn''t be hard to add a "set stack size to 16MB" or something like that as part of "start xenconsoled". -- Mats> > Of course we can make xenconsoled special among all the processes... But > that''s not ideal. > > > Wei. > >> Not sure what the semantics of those are inside a for loop where max_fd >> can change but worst case you could put the content of the loop into a >> function. >> >> Ian. >> > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel > >
Wei Liu
2013-Jan-04  17:17 UTC
[PATCH V3] Switch from select() to poll() in xenconsoled''s IO loop.
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
Tracking arrays are dynamically allocated / reallocated. If the tracking
arrays fail to expand, we just ignore the incoming fd.
Change from V2:
  * remove unnecessary malloc in initialize_pollfd_arrays
  * use ROUND_UP to get new size of arrays
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 tools/console/daemon/io.c |  147 +++++++++++++++++++++++++++++++++------------
 1 file changed, 109 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..0c53d30 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
-#include <sys/select.h>
+#include <poll.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <termios.h>
@@ -928,9 +928,87 @@ static void handle_log_reload(void)
        }
 }
 
+
+/* Should have at least max_fd + 1 elements */
+static struct pollfd  *fds;
+static struct pollfd **fd_to_pollfd;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+static void initialize_pollfd_arrays(void)
+{
+       fds = NULL;
+       fd_to_pollfd = NULL;
+       current_array_size = 0;
+}
+
+static void destroy_pollfd_arrays(void)
+{
+       free(fds);
+       free(fd_to_pollfd);
+       current_array_size = 0;
+}
+
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) &
~((1UL<<(_w))-1))
+static void set_fds(int fd, short events)
+{
+       if (current_array_size < fd+1) {
+               struct pollfd  *p1 = NULL;
+               struct pollfd **p2 = NULL;
+               unsigned long newsize;
+
+               /* Round up to 2^8 boundary, in practice this just
+                * make newsize larger than current_array_size.
+                */
+               newsize = ROUNDUP(fd+1, 8);
+
+               p1 = realloc(fds, sizeof(struct pollfd)*newsize);
+               if (!p1)
+                       goto fail;
+               fds = p1;
+
+               p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize);
+               if (!p2)
+                       goto fail;
+               fd_to_pollfd = p2;
+
+               memset(&fds[0] + current_array_size, 0,
+                      sizeof(struct pollfd) * (newsize-current_array_size));
+               memset(&fd_to_pollfd[0] + current_array_size, 0,
+                      sizeof(struct pollfd *) * (newsize-current_array_size));
+               current_array_size = newsize;
+       }
+
+       fds[nr_fds].fd = fd;
+       fds[nr_fds].events = events;
+       fd_to_pollfd[fd] = &fds[nr_fds];
+       nr_fds++;
+
+       return;
+fail:
+       dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+       return;
+}
+
+static short fd_revents(int fd)
+{
+       if (fd >= current_array_size)
+               return 0;
+       if (fd_to_pollfd[fd] == NULL)
+               return 0;
+       return fd_to_pollfd[fd]->revents;
+}
+
+static void reset_fds(void)
+{
+       nr_fds = 0;
+       memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+       memset(fd_to_pollfd, 0,
+              sizeof(struct pollfd *) * current_array_size);
+}
+
 void handle_io(void)
 {
-       fd_set readfds, writefds;
        int ret;
 
        if (log_hv) {
@@ -957,23 +1035,20 @@ void handle_io(void)
                }
        }
 
+       initialize_pollfd_arrays();
+
        for (;;) {
                struct domain *d, *n;
-               int max_fd = -1;
-               struct timeval timeout;
+               int poll_timeout; /* timeout in milliseconds */
                struct timespec ts;
                long long now, next_timeout = 0;
 
-               FD_ZERO(&readfds);
-               FD_ZERO(&writefds);
+               reset_fds();
 
-               FD_SET(xs_fileno(xs), &readfds);
-               max_fd = MAX(xs_fileno(xs), max_fd);
+               set_fds(xs_fileno(xs), POLLIN);
 
-               if (log_hv) {
-                       FD_SET(xc_evtchn_fd(xce_handle), &readfds);
-                       max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
-               }
+               if (log_hv)
+                       set_fds(xc_evtchn_fd(xce_handle), POLLIN);
 
                if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
                        return;
@@ -982,11 +1057,7 @@ void handle_io(void)
                /* Re-calculate any event counter allowances & unblock
                   domains with new allowance */
                for (d = dom_head; d; d = d->next) {
-                       /* Add 5ms of fuzz since select() often returns
-                          a couple of ms sooner than requested. Without
-                          the fuzz we typically do an extra spin in select()
-                          with a 1/2 ms timeout every other iteration */
-                       if ((now+5) > d->next_period) {
+                       if (now > d->next_period) {
                                d->next_period = now + RATE_LIMIT_PERIOD;
                                if (d->event_count >=
RATE_LIMIT_ALLOWANCE) {
                                        (void)xc_evtchn_unmask(d->xce_handle,
d->local_port);
@@ -1006,74 +1077,73 @@ void handle_io(void)
                                    !d->buffer.max_capacity ||
                                    d->buffer.size <
d->buffer.max_capacity) {
                                        int evtchn_fd =
xc_evtchn_fd(d->xce_handle);
-                                       FD_SET(evtchn_fd, &readfds);
-                                       max_fd = MAX(evtchn_fd, max_fd);
+                                       set_fds(evtchn_fd, POLLIN);
                                }
                        }
 
                        if (d->master_fd != -1) {
+                               short events = 0;
                                if (!d->is_dead &&
ring_free_bytes(d))
-                                       FD_SET(d->master_fd, &readfds);
+                                       events |= POLLIN;
 
                                if (!buffer_empty(&d->buffer))
-                                       FD_SET(d->master_fd, &writefds);
-                               max_fd = MAX(d->master_fd, max_fd);
+                                       events |= POLLOUT;
+
+                               if (events)
+                                       set_fds(d->master_fd, events);
                        }
                }
 
                /* If any domain has been rate limited, we need to work
-                  out what timeout to supply to select */
+                  out what timeout to supply to poll */
                if (next_timeout) {
                        long long duration = (next_timeout - now);
                        if (duration <= 0) /* sanity check */
                                duration = 1;
-                       timeout.tv_sec = duration / 1000;
-                       timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
-                                          * 1000);
+                       poll_timeout = (int)duration;
                }
 
-               ret = select(max_fd + 1, &readfds, &writefds, 0,
-                            next_timeout ? &timeout : NULL);
+               ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
 
                if (log_reload) {
                        handle_log_reload();
                        log_reload = 0;
                }
 
-               /* Abort if select failed, except for EINTR cases
+               /* Abort if poll failed, except for EINTR cases
                   which indicate a possible log reload */
                if (ret == -1) {
                        if (errno == EINTR)
                                continue;
-                       dolog(LOG_ERR, "Failure in select: %d (%s)",
+                       dolog(LOG_ERR, "Failure in poll: %d (%s)",
                              errno, strerror(errno));
                        break;
                }
 
-               if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle),
&readfds))
+               if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) &
POLLIN)
                        handle_hv_logs();
 
                if (ret <= 0)
                        continue;
 
-               if (FD_ISSET(xs_fileno(xs), &readfds))
+               if (fd_revents(xs_fileno(xs)) & POLLIN)
                        handle_xs();
 
                for (d = dom_head; d; d = n) {
                        n = d->next;
                        if (d->event_count < RATE_LIMIT_ALLOWANCE) {
                                if (d->xce_handle != NULL &&
-                                   FD_ISSET(xc_evtchn_fd(d->xce_handle),
-                                            &readfds))
+                                   fd_revents(xc_evtchn_fd(d->xce_handle))
&
+                                   POLLIN)
                                        handle_ring_read(d);
                        }
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &readfds))
+                       if (d->master_fd != -1 &&
+                           fd_revents(d->master_fd) & POLLIN)
                                handle_tty_read(d);
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &writefds))
+                       if (d->master_fd != -1 &&
+                           fd_revents(d->master_fd) & POLLOUT)
                                handle_tty_write(d);
 
                        if (d->last_seen != enum_pass)
@@ -1084,6 +1154,7 @@ void handle_io(void)
                }
        }
 
+       destroy_pollfd_arrays();
  out:
        if (log_hv_fd != -1) {
                close(log_hv_fd);
-- 
1.7.10.4
Ian Campbell
2013-Jan-07  10:20 UTC
Re: [PATCH V3] Switch from select() to poll() in xenconsoled''s IO loop.
On Fri, 2013-01-04 at 17:17 +0000, Wei Liu wrote:> In Linux select() typically supports up to 1024 file descriptors. This can be > a problem when user tries to boot up many guests. Switching to poll() has > minimum impact on existing code and has better scalibility. > > Tracking arrays are dynamically allocated / reallocated. If the tracking > arrays fail to expand, we just ignore the incoming fd. > > Change from V2: > * remove unnecessary malloc in initialize_pollfd_arrays > * use ROUND_UP to get new size of arrays > > Signed-off-by: Wei Liu <wei.liu2@citrix.com> > --- > tools/console/daemon/io.c | 147 +++++++++++++++++++++++++++++++++------------ > 1 file changed, 109 insertions(+), 38 deletions(-) > > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c > index 48fe151..0c53d30 100644 > --- a/tools/console/daemon/io.c > +++ b/tools/console/daemon/io.c > @@ -28,7 +28,7 @@ > #include <stdlib.h> > #include <errno.h> > #include <string.h> > -#include <sys/select.h> > +#include <poll.h> > #include <fcntl.h> > #include <unistd.h> > #include <termios.h> > @@ -928,9 +928,87 @@ static void handle_log_reload(void) > } > } > > + > +/* Should have at least max_fd + 1 elements */ > +static struct pollfd *fds; > +static struct pollfd **fd_to_pollfd; > +static unsigned int current_array_size; > +static unsigned int nr_fds; > + > +static void initialize_pollfd_arrays(void) > +{ > + fds = NULL; > + fd_to_pollfd = NULL; > + current_array_size = 0; > +}These can all be done as part of the definition of the variables.> + > +static void destroy_pollfd_arrays(void) > +{ > + free(fds); > + free(fd_to_pollfd); > + current_array_size = 0; > +}Having done the above I''d be inclined to inline this at the single callsite.> + > +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) > +static void set_fds(int fd, short events) > +{ > + if (current_array_size < fd+1) { > + struct pollfd *p1 = NULL; > + struct pollfd **p2 = NULL; > + unsigned long newsize; > + > + /* Round up to 2^8 boundary, in practice this just > + * make newsize larger than current_array_size. > + */ > + newsize = ROUNDUP(fd+1, 8); > + > + p1 = realloc(fds, sizeof(struct pollfd)*newsize); > + if (!p1) > + goto fail; > + fds = p1; > + > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize); > + if (!p2) > + goto fail; > + fd_to_pollfd = p2;Do these two arrays really need to be the same size? I can see why fd_to_pollfd needs to be as big as the highest fd we are polling but the fds array only actually needs to scale with the total *number* of fds we are polling. Is it the case that the set of fds we are waiting on is pretty densely packed? Or maybe even if the fds are sparse it doesn''t matter and this is simpler at the cost of a little bit of extra memory usage?> + > + memset(&fds[0] + current_array_size, 0, > + sizeof(struct pollfd) * (newsize-current_array_size)); > + memset(&fd_to_pollfd[0] + current_array_size, 0, > + sizeof(struct pollfd *) * (newsize-current_array_size)); > + current_array_size = newsize; > + } > + > + fds[nr_fds].fd = fd; > + fds[nr_fds].events = events; > + fd_to_pollfd[fd] = &fds[nr_fds]; > + nr_fds++; > + > + return; > +fail: > + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd); > + return; > +}[...]> - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds)) > + if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN) > handle_hv_logs(); > > if (ret <= 0) > continue; > > - if (FD_ISSET(xs_fileno(xs), &readfds)) > + if (fd_revents(xs_fileno(xs)) & POLLIN)It seems like both this and the logging fd could be handled with a struct pollfd * variable specifically for each. Which combined with...> handle_xs(); > > for (d = dom_head; d; d = n) { > n = d->next; > if (d->event_count < RATE_LIMIT_ALLOWANCE) { > if (d->xce_handle != NULL && > - FD_ISSET(xc_evtchn_fd(d->xce_handle), > - &readfds)) > + fd_revents(xc_evtchn_fd(d->xce_handle)) &... adding some struct pollfd pointers to struct domain (for this and master_fd) would remove the need for the fd_to_pollfd lookup array altogether.> + POLLIN) > handle_ring_read(d); > } > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &readfds)) > + if (d->master_fd != -1 && > + fd_revents(d->master_fd) & POLLIN) > handle_tty_read(d); > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &writefds)) > + if (d->master_fd != -1 && > + fd_revents(d->master_fd) & POLLOUT) > handle_tty_write(d); > > if (d->last_seen != enum_pass)[...] Ian.
Wei Liu
2013-Jan-07  12:12 UTC
Re: [PATCH V3] Switch from select() to poll() in xenconsoled''s IO loop.
On Mon, 2013-01-07 at 10:20 +0000, Ian Campbell wrote:> On Fri, 2013-01-04 at 17:17 +0000, Wei Liu wrote: > > In Linux select() typically supports up to 1024 file descriptors. This can be > > a problem when user tries to boot up many guests. Switching to poll() has > > minimum impact on existing code and has better scalibility. > > > > Tracking arrays are dynamically allocated / reallocated. If the tracking > > arrays fail to expand, we just ignore the incoming fd. > > > > Change from V2: > > * remove unnecessary malloc in initialize_pollfd_arrays > > * use ROUND_UP to get new size of arrays > > > > Signed-off-by: Wei Liu <wei.liu2@citrix.com> > > --- > > tools/console/daemon/io.c | 147 +++++++++++++++++++++++++++++++++------------ > > 1 file changed, 109 insertions(+), 38 deletions(-) > > > > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c > > index 48fe151..0c53d30 100644 > > --- a/tools/console/daemon/io.c > > +++ b/tools/console/daemon/io.c > > @@ -28,7 +28,7 @@ > > #include <stdlib.h> > > #include <errno.h> > > #include <string.h> > > -#include <sys/select.h> > > +#include <poll.h> > > #include <fcntl.h> > > #include <unistd.h> > > #include <termios.h> > > @@ -928,9 +928,87 @@ static void handle_log_reload(void) > > } > > } > > > > + > > +/* Should have at least max_fd + 1 elements */ > > +static struct pollfd *fds; > > +static struct pollfd **fd_to_pollfd; > > +static unsigned int current_array_size; > > +static unsigned int nr_fds; > > + > > +static void initialize_pollfd_arrays(void) > > +{ > > + fds = NULL; > > + fd_to_pollfd = NULL; > > + current_array_size = 0; > > +} > > These can all be done as part of the definition of the variables. > > > + > > +static void destroy_pollfd_arrays(void) > > +{ > > + free(fds); > > + free(fd_to_pollfd); > > + current_array_size = 0; > > +} > > Having done the above I''d be inclined to inline this at the single > callsite. >I left the initialize() there just to pair with destroy(). But as you suggested, I can eliminate both.> > + > > +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) > > +static void set_fds(int fd, short events) > > +{ > > + if (current_array_size < fd+1) { > > + struct pollfd *p1 = NULL; > > + struct pollfd **p2 = NULL; > > + unsigned long newsize; > > + > > + /* Round up to 2^8 boundary, in practice this just > > + * make newsize larger than current_array_size. > > + */ > > + newsize = ROUNDUP(fd+1, 8); > > + > > + p1 = realloc(fds, sizeof(struct pollfd)*newsize); > > + if (!p1) > > + goto fail; > > + fds = p1; > > + > > + p2 = realloc(fd_to_pollfd, sizeof(struct pollfd *)*newsize); > > + if (!p2) > > + goto fail; > > + fd_to_pollfd = p2; > > Do these two arrays really need to be the same size? >Not necessary. I will see what I can do.> I can see why fd_to_pollfd needs to be as big as the highest fd we are > polling but the fds array only actually needs to scale with the total > *number* of fds we are polling. Is it the case that the set of fds we > are waiting on is pretty densely packed? >It depends. If we have many guests up then destroy them all, then the arrays can be sparse. There is no way to shrink the arrays. But see below.> Or maybe even if the fds are sparse it doesn''t matter and this is > simpler at the cost of a little bit of extra memory usage? > >Even we have thousands of guests, the tracking arrays only occupy a few megabytes.> > + > > + memset(&fds[0] + current_array_size, 0, > > + sizeof(struct pollfd) * (newsize-current_array_size)); > > + memset(&fd_to_pollfd[0] + current_array_size, 0, > > + sizeof(struct pollfd *) * (newsize-current_array_size)); > > + current_array_size = newsize; > > + } > > + > > + fds[nr_fds].fd = fd; > > + fds[nr_fds].events = events; > > + fd_to_pollfd[fd] = &fds[nr_fds]; > > + nr_fds++; > > + > > + return; > > +fail: > > + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd); > > + return; > > +} > [...] > > - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds)) > > + if (log_hv && fd_revents(xc_evtchn_fd(xce_handle)) & POLLIN) > > handle_hv_logs(); > > > > if (ret <= 0) > > continue; > > > > - if (FD_ISSET(xs_fileno(xs), &readfds)) > > + if (fd_revents(xs_fileno(xs)) & POLLIN) > > It seems like both this and the logging fd could be handled with a > struct pollfd * variable specifically for each. Which combined with... > > > handle_xs(); > > > > for (d = dom_head; d; d = n) { > > n = d->next; > > if (d->event_count < RATE_LIMIT_ALLOWANCE) { > > if (d->xce_handle != NULL && > > - FD_ISSET(xc_evtchn_fd(d->xce_handle), > > - &readfds)) > > + fd_revents(xc_evtchn_fd(d->xce_handle)) & > > ... adding some struct pollfd pointers to struct domain (for this and > master_fd) would remove the need for the fd_to_pollfd lookup array > altogether. >Yes, this is possible. But my original thought was to make as little impact to original code as possible, so I didn''t touch struct domain and choose to add extra tracking facilities to make it just work. If we hit poll() bottleneck we can move to libevent or libev, which requires big surgery on existing code. Of course I can do that as well if that suits you. Wei.
Ian Campbell
2013-Jan-07  12:16 UTC
Re: [PATCH V3] Switch from select() to poll() in xenconsoled''s IO loop.
> > > handle_xs(); > > > > > > for (d = dom_head; d; d = n) { > > > n = d->next; > > > if (d->event_count < RATE_LIMIT_ALLOWANCE) { > > > if (d->xce_handle != NULL && > > > - FD_ISSET(xc_evtchn_fd(d->xce_handle), > > > - &readfds)) > > > + fd_revents(xc_evtchn_fd(d->xce_handle)) & > > > > ... adding some struct pollfd pointers to struct domain (for this and > > master_fd) would remove the need for the fd_to_pollfd lookup array > > altogether. > > > > Yes, this is possible. But my original thought was to make as little > impact to original code as possible, so I didn''t touch struct domain and > choose to add extra tracking facilities to make it just work.I think adding a few pointers to struct domain would be less code changed over all since you wouldn''t need to create/manage one of the arrays at all.> If we hit poll() bottleneck we can move to libevent or libev, which > requires big surgery on existing code. > > Of course I can do that as well if that suits you. > > > Wei. >
Wei Liu
2013-Jan-07  14:28 UTC
[PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
In Linux select() typically supports up to 1024 file descriptors. This can be
a problem when user tries to boot up many guests. Switching to poll() has
minimum impact on existing code and has better scalibility.
pollfd array is dynamically allocated / reallocated. If the array fails to
expand, we just ignore the incoming fd.
Change from V2:
  * remove unnecessary malloc in initialize_pollfd_arrays
  * use ROUND_UP to get new size of arrays
Change from V3:
  * remove initialize and destroy function for array
  * embedded tracking structure in struct domain, eliminate fd_to_pollfd
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 tools/console/daemon/io.c    |  119 ++++++++++++++++++++++++++++--------------
 tools/console/daemon/utils.c |    2 +
 tools/console/daemon/utils.h |    2 +
 3 files changed, 85 insertions(+), 38 deletions(-)
diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c
index 48fe151..80f7144 100644
--- a/tools/console/daemon/io.c
+++ b/tools/console/daemon/io.c
@@ -28,7 +28,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
-#include <sys/select.h>
+#include <poll.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <termios.h>
@@ -69,6 +69,7 @@ static int log_hv_fd = -1;
 static evtchn_port_or_error_t log_hv_evtchn = -1;
 static xc_interface *xch; /* why does xenconsoled have two xc handles ? */
 static xc_evtchn *xce_handle = NULL;
+static struct pollfd *xce_pollfd = NULL;
 
 struct buffer {
        char *data;
@@ -81,7 +82,9 @@ struct buffer {
 struct domain {
        int domid;
        int master_fd;
+       struct pollfd *master_pollfd;
        int slave_fd;
+       struct pollfd *slave_pollfd;
        int log_fd;
        bool is_dead;
        unsigned last_seen;
@@ -92,6 +95,7 @@ struct domain {
        evtchn_port_or_error_t local_port;
        evtchn_port_or_error_t remote_port;
        xc_evtchn *xce_handle;
+       struct pollfd *xce_pollfd;
        struct xencons_interface *interface;
        int event_count;
        long long next_period;
@@ -928,9 +932,52 @@ static void handle_log_reload(void)
        }
 }
 
+static struct pollfd  *fds;
+static unsigned int current_array_size;
+static unsigned int nr_fds;
+
+#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) &
~((1UL<<(_w))-1))
+static struct pollfd *set_fds(int fd, short events)
+{
+       struct pollfd *ret;
+       if (current_array_size < nr_fds + 1) {
+               struct pollfd  *new_fds = NULL;
+               unsigned long newsize;
+
+               /* Round up to 2^8 boundary, in practice this just
+                * make newsize larger than current_array_size.
+                */
+               newsize = ROUNDUP(nr_fds + 1, 8);
+
+               new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
+               if (!new_fds)
+                       goto fail;
+               fds = new_fds;
+
+               memset(&fds[0] + current_array_size, 0,
+                      sizeof(struct pollfd) * (newsize-current_array_size));
+               current_array_size = newsize;
+       }
+
+       fds[nr_fds].fd = fd;
+       fds[nr_fds].events = events;
+       ret = &fds[nr_fds];
+       nr_fds++;
+
+       return ret;
+fail:
+       dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
+       return NULL;
+}
+
+static void reset_fds(void)
+{
+       nr_fds = 0;
+       memset(fds, 0, sizeof(struct pollfd) * current_array_size);
+}
+
 void handle_io(void)
 {
-       fd_set readfds, writefds;
        int ret;
 
        if (log_hv) {
@@ -959,21 +1006,16 @@ void handle_io(void)
 
        for (;;) {
                struct domain *d, *n;
-               int max_fd = -1;
-               struct timeval timeout;
+               int poll_timeout; /* timeout in milliseconds */
                struct timespec ts;
                long long now, next_timeout = 0;
 
-               FD_ZERO(&readfds);
-               FD_ZERO(&writefds);
+               reset_fds();
 
-               FD_SET(xs_fileno(xs), &readfds);
-               max_fd = MAX(xs_fileno(xs), max_fd);
+               xs_pollfd = set_fds(xs_fileno(xs), POLLIN);
 
-               if (log_hv) {
-                       FD_SET(xc_evtchn_fd(xce_handle), &readfds);
-                       max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd);
-               }
+               if (log_hv)
+                       xce_pollfd = set_fds(xc_evtchn_fd(xce_handle), POLLIN);
 
                if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0)
                        return;
@@ -982,11 +1024,7 @@ void handle_io(void)
                /* Re-calculate any event counter allowances & unblock
                   domains with new allowance */
                for (d = dom_head; d; d = d->next) {
-                       /* Add 5ms of fuzz since select() often returns
-                          a couple of ms sooner than requested. Without
-                          the fuzz we typically do an extra spin in select()
-                          with a 1/2 ms timeout every other iteration */
-                       if ((now+5) > d->next_period) {
+                       if (now > d->next_period) {
                                d->next_period = now + RATE_LIMIT_PERIOD;
                                if (d->event_count >=
RATE_LIMIT_ALLOWANCE) {
                                        (void)xc_evtchn_unmask(d->xce_handle,
d->local_port);
@@ -1006,74 +1044,76 @@ void handle_io(void)
                                    !d->buffer.max_capacity ||
                                    d->buffer.size <
d->buffer.max_capacity) {
                                        int evtchn_fd =
xc_evtchn_fd(d->xce_handle);
-                                       FD_SET(evtchn_fd, &readfds);
-                                       max_fd = MAX(evtchn_fd, max_fd);
+                                       d->xce_pollfd = set_fds(evtchn_fd,
POLLIN);
                                }
                        }
 
                        if (d->master_fd != -1) {
+                               short events = 0;
                                if (!d->is_dead &&
ring_free_bytes(d))
-                                       FD_SET(d->master_fd, &readfds);
+                                       events |= POLLIN;
 
                                if (!buffer_empty(&d->buffer))
-                                       FD_SET(d->master_fd, &writefds);
-                               max_fd = MAX(d->master_fd, max_fd);
+                                       events |= POLLOUT;
+
+                               if (events)
+                                       d->master_pollfd +                   
set_fds(d->master_fd, events);
                        }
                }
 
                /* If any domain has been rate limited, we need to work
-                  out what timeout to supply to select */
+                  out what timeout to supply to poll */
                if (next_timeout) {
                        long long duration = (next_timeout - now);
                        if (duration <= 0) /* sanity check */
                                duration = 1;
-                       timeout.tv_sec = duration / 1000;
-                       timeout.tv_usec = ((duration - (timeout.tv_sec * 1000))
-                                          * 1000);
+                       poll_timeout = (int)duration;
                }
 
-               ret = select(max_fd + 1, &readfds, &writefds, 0,
-                            next_timeout ? &timeout : NULL);
+               ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1);
 
                if (log_reload) {
                        handle_log_reload();
                        log_reload = 0;
                }
 
-               /* Abort if select failed, except for EINTR cases
+               /* Abort if poll failed, except for EINTR cases
                   which indicate a possible log reload */
                if (ret == -1) {
                        if (errno == EINTR)
                                continue;
-                       dolog(LOG_ERR, "Failure in select: %d (%s)",
+                       dolog(LOG_ERR, "Failure in poll: %d (%s)",
                              errno, strerror(errno));
                        break;
                }
 
-               if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle),
&readfds))
+               if (log_hv && xce_pollfd &&
xce_pollfd->revents & POLLIN)
                        handle_hv_logs();
 
                if (ret <= 0)
                        continue;
 
-               if (FD_ISSET(xs_fileno(xs), &readfds))
+               if (xs_pollfd && xs_pollfd->revents & POLLIN)
                        handle_xs();
 
                for (d = dom_head; d; d = n) {
                        n = d->next;
                        if (d->event_count < RATE_LIMIT_ALLOWANCE) {
                                if (d->xce_handle != NULL &&
-                                   FD_ISSET(xc_evtchn_fd(d->xce_handle),
-                                            &readfds))
+                                   d->xce_pollfd != NULL &&
+                                   d->xce_pollfd->revents & POLLIN)
                                        handle_ring_read(d);
                        }
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &readfds))
+                       if (d->master_fd != -1 &&
+                           d->master_pollfd &&
+                           d->master_pollfd->revents & POLLIN)
                                handle_tty_read(d);
 
-                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
-                                                          &writefds))
+                       if (d->master_fd != -1 &&
+                           d->master_pollfd &&
+                           d->master_pollfd->revents & POLLOUT)
                                handle_tty_write(d);
 
                        if (d->last_seen != enum_pass)
@@ -1084,6 +1124,9 @@ void handle_io(void)
                }
        }
 
+       free(fds);
+       current_array_size = 0;
+
  out:
        if (log_hv_fd != -1) {
                close(log_hv_fd);
diff --git a/tools/console/daemon/utils.c b/tools/console/daemon/utils.c
index aab6f42..d0b1b00 100644
--- a/tools/console/daemon/utils.c
+++ b/tools/console/daemon/utils.c
@@ -33,11 +33,13 @@
 #include <sys/un.h>
 #include <string.h>
 #include <signal.h>
+#include <poll.h>
 
 #include "xenctrl.h"
 #include "utils.h"
 
 struct xs_handle *xs;
+struct pollfd *xs_pollfd;
 xc_interface *xc;
 
 static void child_exit(int sig)
diff --git a/tools/console/daemon/utils.h b/tools/console/daemon/utils.h
index 8725dcd..8e72dab 100644
--- a/tools/console/daemon/utils.h
+++ b/tools/console/daemon/utils.h
@@ -24,6 +24,7 @@
 #include <stdbool.h>
 #include <syslog.h>
 #include <stdio.h>
+#include <poll.h>
 #include <xenctrl.h>
 
 #include <xenstore.h>
@@ -32,6 +33,7 @@ void daemonize(const char *pidfile);
 bool xen_setup(void);
 
 extern struct xs_handle *xs;
+extern struct pollfd *xs_pollfd;
 extern xc_interface *xc;
 
 #if 1
-- 
1.7.10.4
Ian Campbell
2013-Jan-07  14:39 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote> @@ -69,6 +69,7 @@ static int log_hv_fd = -1; > static evtchn_port_or_error_t log_hv_evtchn = -1; > static xc_interface *xch; /* why does xenconsoled have two xc handles ? */ > static xc_evtchn *xce_handle = NULL; > +static struct pollfd *xce_pollfd = NULL; > > struct buffer { > char *data;[...]> @@ -32,6 +33,7 @@ void daemonize(const char *pidfile); > bool xen_setup(void); > > extern struct xs_handle *xs; > +extern struct pollfd *xs_pollfd; > extern xc_interface *xc;xs_pollfd and the xce_pollfd can both be local to the handle_io function, I think.> > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &readfds)) > + if (d->master_fd != -1 && > + d->master_pollfd && > + d->master_pollfd->revents & POLLIN) > handle_tty_read(d); > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &writefds)) > + if (d->master_fd != -1 && > + d->master_pollfd && > + d->master_pollfd->revents & POLLOUT) > handle_tty_write(d); > > if (d->last_seen != enum_pass)This is probably one for Ian J but I wonder if you need to handle POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian implemented the libxl event subsystem. Ian.
Mats Petersson
2013-Jan-07  14:41 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On 07/01/13 14:28, Wei Liu wrote:> In Linux select() typically supports up to 1024 file descriptors. This can be > a problem when user tries to boot up many guests. Switching to poll() has > minimum impact on existing code and has better scalibility. > > pollfd array is dynamically allocated / reallocated. If the array fails to > expand, we just ignore the incoming fd. > > Change from V2: > * remove unnecessary malloc in initialize_pollfd_arrays > * use ROUND_UP to get new size of arrays > > Change from V3: > * remove initialize and destroy function for array > * embedded tracking structure in struct domain, eliminate fd_to_pollfd > > Signed-off-by: Wei Liu <wei.liu2@citrix.com> > --- > tools/console/daemon/io.c | 119 ++++++++++++++++++++++++++++-------------- > tools/console/daemon/utils.c | 2 + > tools/console/daemon/utils.h | 2 + > 3 files changed, 85 insertions(+), 38 deletions(-) > > diff --git a/tools/console/daemon/io.c b/tools/console/daemon/io.c > index 48fe151..80f7144 100644 > --- a/tools/console/daemon/io.c > +++ b/tools/console/daemon/io.c > @@ -28,7 +28,7 @@ > #include <stdlib.h> > #include <errno.h> > #include <string.h> > -#include <sys/select.h> > +#include <poll.h> > #include <fcntl.h> > #include <unistd.h> > #include <termios.h> > @@ -69,6 +69,7 @@ static int log_hv_fd = -1; > static evtchn_port_or_error_t log_hv_evtchn = -1; > static xc_interface *xch; /* why does xenconsoled have two xc handles ? */ > static xc_evtchn *xce_handle = NULL; > +static struct pollfd *xce_pollfd = NULL; > > struct buffer { > char *data; > @@ -81,7 +82,9 @@ struct buffer { > struct domain { > int domid; > int master_fd; > + struct pollfd *master_pollfd; > int slave_fd; > + struct pollfd *slave_pollfd; > int log_fd; > bool is_dead; > unsigned last_seen; > @@ -92,6 +95,7 @@ struct domain { > evtchn_port_or_error_t local_port; > evtchn_port_or_error_t remote_port; > xc_evtchn *xce_handle; > + struct pollfd *xce_pollfd; > struct xencons_interface *interface; > int event_count; > long long next_period; > @@ -928,9 +932,52 @@ static void handle_log_reload(void) > } > } > > +static struct pollfd *fds; > +static unsigned int current_array_size; > +static unsigned int nr_fds; > + > +#define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) > +static struct pollfd *set_fds(int fd, short events) > +{ > + struct pollfd *ret; > + if (current_array_size < nr_fds + 1) { > + struct pollfd *new_fds = NULL; > + unsigned long newsize; > + > + /* Round up to 2^8 boundary, in practice this just > + * make newsize larger than current_array_size. > + */ > + newsize = ROUNDUP(nr_fds + 1, 8); > + > + new_fds = realloc(fds, sizeof(struct pollfd)*newsize); > + if (!new_fds) > + goto fail; > + fds = new_fds; > + > + memset(&fds[0] + current_array_size, 0, > + sizeof(struct pollfd) * (newsize-current_array_size)); > + current_array_size = newsize; > + } > + > + fds[nr_fds].fd = fd; > + fds[nr_fds].events = events; > + ret = &fds[nr_fds]; > + nr_fds++; > + > + return ret; > +fail: > + dolog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd); > + return NULL; > +} > + > +static void reset_fds(void) > +{ > + nr_fds = 0; > + memset(fds, 0, sizeof(struct pollfd) * current_array_size); > +} > + > void handle_io(void) > { > - fd_set readfds, writefds; > int ret; > > if (log_hv) { > @@ -959,21 +1006,16 @@ void handle_io(void) > > for (;;) { > struct domain *d, *n; > - int max_fd = -1; > - struct timeval timeout; > + int poll_timeout; /* timeout in milliseconds */ > struct timespec ts; > long long now, next_timeout = 0; > > - FD_ZERO(&readfds); > - FD_ZERO(&writefds); > + reset_fds(); > > - FD_SET(xs_fileno(xs), &readfds); > - max_fd = MAX(xs_fileno(xs), max_fd); > + xs_pollfd = set_fds(xs_fileno(xs), POLLIN); > > - if (log_hv) { > - FD_SET(xc_evtchn_fd(xce_handle), &readfds); > - max_fd = MAX(xc_evtchn_fd(xce_handle), max_fd); > - } > + if (log_hv) > + xce_pollfd = set_fds(xc_evtchn_fd(xce_handle), POLLIN); > > if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0) > return; > @@ -982,11 +1024,7 @@ void handle_io(void) > /* Re-calculate any event counter allowances & unblock > domains with new allowance */ > for (d = dom_head; d; d = d->next) { > - /* Add 5ms of fuzz since select() often returns > - a couple of ms sooner than requested. Without > - the fuzz we typically do an extra spin in select() > - with a 1/2 ms timeout every other iteration */ > - if ((now+5) > d->next_period) { > + if (now > d->next_period) {Is poll more accurate than select? I would have thought that they were based on the same timing, and thus equally "fuzzy"?> d->next_period = now + RATE_LIMIT_PERIOD; > if (d->event_count >= RATE_LIMIT_ALLOWANCE) { > (void)xc_evtchn_unmask(d->xce_handle, d->local_port); > @@ -1006,74 +1044,76 @@ void handle_io(void) > !d->buffer.max_capacity || > d->buffer.size < d->buffer.max_capacity) { > int evtchn_fd = xc_evtchn_fd(d->xce_handle); > - FD_SET(evtchn_fd, &readfds); > - max_fd = MAX(evtchn_fd, max_fd); > + d->xce_pollfd = set_fds(evtchn_fd, POLLIN); > } > } > > if (d->master_fd != -1) { > + short events = 0; > if (!d->is_dead && ring_free_bytes(d)) > - FD_SET(d->master_fd, &readfds); > + events |= POLLIN; > > if (!buffer_empty(&d->buffer)) > - FD_SET(d->master_fd, &writefds); > - max_fd = MAX(d->master_fd, max_fd); > + events |= POLLOUT; > + > + if (events) > + d->master_pollfd > + set_fds(d->master_fd, events); > } > } > > /* If any domain has been rate limited, we need to work > - out what timeout to supply to select */ > + out what timeout to supply to poll */ > if (next_timeout) { > long long duration = (next_timeout - now); > if (duration <= 0) /* sanity check */ > duration = 1; > - timeout.tv_sec = duration / 1000; > - timeout.tv_usec = ((duration - (timeout.tv_sec * 1000)) > - * 1000); > + poll_timeout = (int)duration; > } > > - ret = select(max_fd + 1, &readfds, &writefds, 0, > - next_timeout ? &timeout : NULL); > + ret = poll(fds, nr_fds, next_timeout ? poll_timeout : -1); > > if (log_reload) { > handle_log_reload(); > log_reload = 0; > } > > - /* Abort if select failed, except for EINTR cases > + /* Abort if poll failed, except for EINTR cases > which indicate a possible log reload */ > if (ret == -1) { > if (errno == EINTR) > continue; > - dolog(LOG_ERR, "Failure in select: %d (%s)", > + dolog(LOG_ERR, "Failure in poll: %d (%s)", > errno, strerror(errno)); > break; > } > > - if (log_hv && FD_ISSET(xc_evtchn_fd(xce_handle), &readfds)) > + if (log_hv && xce_pollfd && xce_pollfd->revents & POLLIN) > handle_hv_logs(); > > if (ret <= 0) > continue; > > - if (FD_ISSET(xs_fileno(xs), &readfds)) > + if (xs_pollfd && xs_pollfd->revents & POLLIN) > handle_xs(); > > for (d = dom_head; d; d = n) { > n = d->next; > if (d->event_count < RATE_LIMIT_ALLOWANCE) { > if (d->xce_handle != NULL && > - FD_ISSET(xc_evtchn_fd(d->xce_handle), > - &readfds)) > + d->xce_pollfd != NULL && > + d->xce_pollfd->revents & POLLIN) > handle_ring_read(d); > } > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &readfds)) > + if (d->master_fd != -1 && > + d->master_pollfd && > + d->master_pollfd->revents & POLLIN) > handle_tty_read(d); > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > - &writefds)) > + if (d->master_fd != -1 && > + d->master_pollfd && > + d->master_pollfd->revents & POLLOUT) > handle_tty_write(d); > > if (d->last_seen != enum_pass) > @@ -1084,6 +1124,9 @@ void handle_io(void) > } > } > > + free(fds); > + current_array_size = 0; > + > out: > if (log_hv_fd != -1) { > close(log_hv_fd); > diff --git a/tools/console/daemon/utils.c b/tools/console/daemon/utils.c > index aab6f42..d0b1b00 100644 > --- a/tools/console/daemon/utils.c > +++ b/tools/console/daemon/utils.c > @@ -33,11 +33,13 @@ > #include <sys/un.h> > #include <string.h> > #include <signal.h> > +#include <poll.h> > > #include "xenctrl.h" > #include "utils.h" > > struct xs_handle *xs; > +struct pollfd *xs_pollfd;I don''t see this used outside of io.c - am I missing something? Adding more dependencies between different source files seems unnecessary... -- Mats> xc_interface *xc; > > static void child_exit(int sig) > diff --git a/tools/console/daemon/utils.h b/tools/console/daemon/utils.h > index 8725dcd..8e72dab 100644 > --- a/tools/console/daemon/utils.h > +++ b/tools/console/daemon/utils.h > @@ -24,6 +24,7 @@ > #include <stdbool.h> > #include <syslog.h> > #include <stdio.h> > +#include <poll.h> > #include <xenctrl.h> > > #include <xenstore.h> > @@ -32,6 +33,7 @@ void daemonize(const char *pidfile); > bool xen_setup(void); > > extern struct xs_handle *xs; > +extern struct pollfd *xs_pollfd; > extern xc_interface *xc; > > #if 1 > -- > 1.7.10.4 > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel > >
Wei Liu
2013-Jan-07  14:44 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 14:39 +0000, Ian Campbell wrote:> On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote > > @@ -69,6 +69,7 @@ static int log_hv_fd = -1; > > static evtchn_port_or_error_t log_hv_evtchn = -1; > > static xc_interface *xch; /* why does xenconsoled have two xc handles ? */ > > static xc_evtchn *xce_handle = NULL; > > +static struct pollfd *xce_pollfd = NULL; > > > > struct buffer { > > char *data; > [...] > > @@ -32,6 +33,7 @@ void daemonize(const char *pidfile); > > bool xen_setup(void); > > > > extern struct xs_handle *xs; > > +extern struct pollfd *xs_pollfd; > > extern xc_interface *xc; > > xs_pollfd and the xce_pollfd can both be local to the handle_io > function, I think. > > > > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > > - &readfds)) > > + if (d->master_fd != -1 && > > + d->master_pollfd && > > + d->master_pollfd->revents & POLLIN) > > handle_tty_read(d); > > > > - if (d->master_fd != -1 && FD_ISSET(d->master_fd, > > - &writefds)) > > + if (d->master_fd != -1 && > > + d->master_pollfd && > > + d->master_pollfd->revents & POLLOUT) > > handle_tty_write(d); > > > > if (d->last_seen != enum_pass) > > This is probably one for Ian J but I wonder if you need to handle > POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian > implemented the libxl event subsystem. >Ian J, can you elaborate on this? Wei.
Ian Jackson
2013-Jan-07  14:52 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
Ian Campbell writes ("Re: [Xen-devel] [PATCH V4] Switch from select() to
poll() in xenconsoled''s IO loop"):> On Mon, 2013-01-07 at 14:28 +0000, Wei Liu wrote
> > -                       if (d->master_fd != -1 &&
FD_ISSET(d->master_fd,
> > -                                                         
&readfds))
...> This is probably one for Ian J but I wonder if you need to handle
> POLLERR or POLLHUP here. ISTR some of oddness WRT these when Ian
> implemented the libxl event subsystem.
I haven''t read the patch in question but:
You certainly need to check for POLLERR and POLLHUP.  They appear in
revents even if not requested.  Normally they can be treated as fatal
for the descriptor.
You should also think about POLLPRI.  Often it''s correct to request it
and then treat it as fatal for the descriptor if it occurs.
Ian.
(I haven''t read the patch I''m afraid...)
Wei Liu
2013-Jan-07  15:01 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote:> > return; > > @@ -982,11 +1024,7 @@ void handle_io(void) > > /* Re-calculate any event counter allowances & unblock > > domains with new allowance */ > > for (d = dom_head; d; d = d->next) { > > - /* Add 5ms of fuzz since select() often returns > > - a couple of ms sooner than requested. Without > > - the fuzz we typically do an extra spin in select() > > - with a 1/2 ms timeout every other iteration */ > > - if ((now+5) > d->next_period) { > > + if (now > d->next_period) { > Is poll more accurate than select? I would have thought that they were > based on the same timing, and thus equally "fuzzy"?Is there any actual proof that the fuzz is needed? Specs of both select() and poll() don''t seem to mention this behaviour at all.> > #include "utils.h" > > > > struct xs_handle *xs; > > +struct pollfd *xs_pollfd; > I don''t see this used outside of io.c - am I missing something? > > Adding more dependencies between different source files seems > unnecessary... >Fixed. Wei.
Mats Petersson
2013-Jan-07  15:06 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On 07/01/13 15:01, Wei Liu wrote:> On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote: > >>> return; >>> @@ -982,11 +1024,7 @@ void handle_io(void) >>> /* Re-calculate any event counter allowances & unblock >>> domains with new allowance */ >>> for (d = dom_head; d; d = d->next) { >>> - /* Add 5ms of fuzz since select() often returns >>> - a couple of ms sooner than requested. Without >>> - the fuzz we typically do an extra spin in select() >>> - with a 1/2 ms timeout every other iteration */ >>> - if ((now+5) > d->next_period) { >>> + if (now > d->next_period) { >> Is poll more accurate than select? I would have thought that they were >> based on the same timing, and thus equally "fuzzy"? > Is there any actual proof that the fuzz is needed? Specs of both > select() and poll() don''t seem to mention this behaviour at all.That''s a good question. I don''t know. The tricky part with this sort of thing is that it may well depend on configurations, hardware differences, etc, so you may find that it works just fine on your test-box, but some big customer with Another-brand Co''s servers don''t work, because there is some subtle difference in hardware. Or it stops working if you have more than X number of CPU''s. If you are convinced it''s fine as it is, then by all means. I''m just thinking that it probably wasn''t put there "by accident". -- Mats> >>> #include "utils.h" >>> >>> struct xs_handle *xs; >>> +struct pollfd *xs_pollfd; >> I don''t see this used outside of io.c - am I missing something? >> >> Adding more dependencies between different source files seems >> unnecessary... >> > Fixed. > > > Wei. > >
Ian Campbell
2013-Jan-07  15:16 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 15:01 +0000, Wei Liu wrote:> On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote: > > > > return; > > > @@ -982,11 +1024,7 @@ void handle_io(void) > > > /* Re-calculate any event counter allowances & unblock > > > domains with new allowance */ > > > for (d = dom_head; d; d = d->next) { > > > - /* Add 5ms of fuzz since select() often returns > > > - a couple of ms sooner than requested. Without > > > - the fuzz we typically do an extra spin in select() > > > - with a 1/2 ms timeout every other iteration */ > > > - if ((now+5) > d->next_period) { > > > + if (now > d->next_period) { > > Is poll more accurate than select? I would have thought that they were > > based on the same timing, and thus equally "fuzzy"? > > Is there any actual proof that the fuzz is needed? Specs of both > select() and poll() don''t seem to mention this behaviour at all.I agree that it seems pretty dubious but it might be interesting to see what strace shows, specifically if it shows this extra spin every other iteration. The fuzz was introduced in 16257:955ee4fa1345 "Rate-limit activity caused by each domU." but the commit log doesn''t make any reference to the reason for it. It may just have been a kernel bug at around the time that patch was authored, but google doesn''t seem to show any evidence of such a bug ever being widespread (i.e. I don''t find any references to it). Ian.
Ian Campbell
2013-Jan-07  15:17 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 15:06 +0000, Mats Petersson wrote:> On 07/01/13 15:01, Wei Liu wrote: > > On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote: > > > >>> return; > >>> @@ -982,11 +1024,7 @@ void handle_io(void) > >>> /* Re-calculate any event counter allowances & unblock > >>> domains with new allowance */ > >>> for (d = dom_head; d; d = d->next) { > >>> - /* Add 5ms of fuzz since select() often returns > >>> - a couple of ms sooner than requested. Without > >>> - the fuzz we typically do an extra spin in select() > >>> - with a 1/2 ms timeout every other iteration */ > >>> - if ((now+5) > d->next_period) { > >>> + if (now > d->next_period) { > >> Is poll more accurate than select? I would have thought that they were > >> based on the same timing, and thus equally "fuzzy"? > > Is there any actual proof that the fuzz is needed? Specs of both > > select() and poll() don''t seem to mention this behaviour at all. > That''s a good question. I don''t know. The tricky part with this sort of > thing is that it may well depend on configurations, hardware > differences, etc, so you may find that it works just fine on your > test-box, but some big customer with Another-brand Co''s servers don''t > work, because there is some subtle difference in hardware. Or it stops > working if you have more than X number of CPU''s. If you are convinced > it''s fine as it is, then by all means. I''m just thinking that it > probably wasn''t put there "by accident".There''s certainly an argument for removing it in a separate changeset though in case it does cause issues. Ian
Wei Liu
2013-Jan-07  15:24 UTC
Re: [PATCH V4] Switch from select() to poll() in xenconsoled''s IO loop
On Mon, 2013-01-07 at 15:16 +0000, Ian Campbell wrote:> On Mon, 2013-01-07 at 15:01 +0000, Wei Liu wrote: > > On Mon, 2013-01-07 at 14:41 +0000, Mats Petersson wrote: > > > > > > return; > > > > @@ -982,11 +1024,7 @@ void handle_io(void) > > > > /* Re-calculate any event counter allowances & unblock > > > > domains with new allowance */ > > > > for (d = dom_head; d; d = d->next) { > > > > - /* Add 5ms of fuzz since select() often returns > > > > - a couple of ms sooner than requested. Without > > > > - the fuzz we typically do an extra spin in select() > > > > - with a 1/2 ms timeout every other iteration */ > > > > - if ((now+5) > d->next_period) { > > > > + if (now > d->next_period) { > > > Is poll more accurate than select? I would have thought that they were > > > based on the same timing, and thus equally "fuzzy"? > > > > Is there any actual proof that the fuzz is needed? Specs of both > > select() and poll() don''t seem to mention this behaviour at all. > > I agree that it seems pretty dubious but it might be interesting to see > what strace shows, specifically if it shows this extra spin every other > iteration. > > The fuzz was introduced in 16257:955ee4fa1345 "Rate-limit activity > caused by each domU." but the commit log doesn''t make any reference to > the reason for it. > > It may just have been a kernel bug at around the time that patch was > authored, but google doesn''t seem to show any evidence of such a bug > ever being widespread (i.e. I don''t find any references to it). >Then I will leave the fuzz here, and do another patch to remove it. I will also add reference to CS 16257 in the comment. Wei.
Possibly Parallel Threads
- 1000 Domains: Not able to access Domu via xm console from Dom0
- 1000 Domains: Not able to access Domu via xm console from Dom0
- [PATCH] xenconsoled: use grant references instead of map_foreign_range
- [PATCH v2 1/2] xenconsoled: use grant references instead of map_foreign_range
- [PATCH] Enable ConnectTimeout with ConnectionAttempts