Rob Gardner
2006-Apr-13 21:41 UTC
[Xen-devel] [PATCH]Non-polling trace record access, userland tools
This is a patch for XenMon which only applies to the userspace tools. The primary purpose of this patch is to add support for non-polling access to the xen trace buffers. The hypervisor changes have already been accepted. Also included are a few bug fixes and some minor new features: 1. If xenmon is run without first allocating trace buffers (via ''setsize'') and enabling them (via ''tbctl''), then this is done automatically using sensible defaults. 2. There was a bug that caused the first second''s worth of data output from xenmon to be erroneous; This has been fixed. 3. There was a bug that caused xenmon to sometimes not display data for newly created domains; This has also been fixed. 4. The xenmon display has a ''heartbeat'' which flickers once per second. This is to show that xenmon is still alive, even though the display isn''t changing at all, a situation that can happen sometimes when there is nothing at all happening on a particular cpu. 5. Added cpu utilization display to the top of the xenmon window. 6. Added a bunch of options in xenmon to control exactly which metrics are displayed, so the screen doesn''t get cluttered with stuff you''re not interested in. The new options are: --allocated --noallocated --blocked --noblocked --waited --nowaited --excount --noexcount --iocount --noiocount 7. Added an option ("--cpu=N") to xenmon to specify which physical cpu you''d like data displayed for. 8. Updated the README with information about default trace buffer size, etc. Rob Gardner # HG changeset patch # User rob.gardner@hp.com # Node ID 2c7f48f6b48cb7eb615170cf9cf3293c2da09764 # Parent edefe24d78bd5e6e51bdd002f3ee7721d6dc2ce5 Add support for non-polling trace buffer access; Also fix some bugs and add a few new features. Signed-off-by: Rob Gardner <rob.gardner@hp.com> diff -r edefe24d78bd -r 2c7f48f6b48c tools/xenmon/xenbaked.h --- a/tools/xenmon/xenbaked.h Tue Apr 11 11:51:48 2006 +++ b/tools/xenmon/xenbaked.h Thu Apr 13 21:31:50 2006 @@ -1,5 +1,5 @@ /****************************************************************************** - * tools/xenbaked.h + * TOOLS/xenbaked.h * * Header file for xenbaked * @@ -30,6 +30,7 @@ #define million 1000000LL #define billion 1000000000LL +// caution: don''t use QOS_ADD with negative numbers! #define QOS_ADD(N,A) ((N+A)<(NSAMPLES-1) ? (N+A) : A) #define QOS_INCR(N) ((N<(NSAMPLES-2)) ? (N+1) : 0) #define QOS_DECR(N) ((N==0) ? (NSAMPLES-1) : (N-1)) @@ -43,6 +44,8 @@ /* Number of data points to keep */ #define NSAMPLES 100 +#define ID(X) ((X>NDOMAINS-1)?(NDOMAINS-1):X) +#define DEFAULT_TBUF_SIZE 20 // per domain stuff typedef struct diff -r edefe24d78bd -r 2c7f48f6b48c tools/xenmon/xenmon.py --- a/tools/xenmon/xenmon.py Tue Apr 11 11:51:48 2006 +++ b/tools/xenmon/xenmon.py Thu Apr 13 21:31:50 2006 @@ -5,7 +5,7 @@ # There is a curses interface for live monitoring. XenMon also allows # logging to a file. For options, run python xenmon.py -h # -# Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins +# Copyright (C) 2005,2006 by Hewlett Packard, Palo Alto and Fort Collins # Authors: Lucy Cherkasova, lucy.cherkasova@hp.com # Rob Gardner, rob.gardner@hp.com # Diwaker Gupta, diwaker.gupta@hp.com @@ -85,6 +85,33 @@ parser.add_option("--ms_per_sample", dest="mspersample", action="store", type="int", default=100, help = "determines how many ms worth of data goes in a sample") + parser.add_option("--cpu", dest="cpu", action="store", type="int", default=0, + help = "specifies which cpu to display data for") + + parser.add_option("--allocated", dest="allocated", action="store_true", + default=False, help="Display allocated time for each domain") + parser.add_option("--noallocated", dest="allocated", action="store_false", + default=False, help="Don''t display allocated time for each domain") + + parser.add_option("--blocked", dest="blocked", action="store_true", + default=True, help="Display blocked time for each domain") + parser.add_option("--noblocked", dest="blocked", action="store_false", + default=True, help="Don''t display blocked time for each domain") + + parser.add_option("--waited", dest="waited", action="store_true", + default=True, help="Display waiting time for each domain") + parser.add_option("--nowaited", dest="waited", action="store_false", + default=True, help="Don''t display waiting time for each domain") + + parser.add_option("--excount", dest="excount", action="store_true", + default=False, help="Display execution count for each domain") + parser.add_option("--noexcount", dest="excount", action="store_false", + default=False, help="Don''t display execution count for each domain") + parser.add_option("--iocount", dest="iocount", action="store_true", + default=False, help="Display I/O count for each domain") + parser.add_option("--noiocount", dest="iocount", action="store_false", + default=False, help="Don''t display I/O count for each domain") + return parser # encapsulate information about a domain @@ -227,19 +254,17 @@ # the live monitoring code -def show_livestats(): - cpu = 0 # cpu of interest to display data for +def show_livestats(cpu): ncpu = 1 # number of cpu''s on this platform slen = 0 # size of shared data structure, incuding padding - global dom_in_use + cpu_1sec_usage = 0.0 + cpu_10sec_usage = 0.0 + heartbeat = 1 + global dom_in_use, options # mmap the (the first chunk of the) file shmf = open(SHM_FILE, "r+") shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE) - - samples = [] - doms = [] - dom_in_use = [] # initialize curses stdscr = _c.initscr() @@ -253,7 +278,8 @@ # display in a loop while True: - for cpuidx in range(0, ncpu): + cpuidx = 0 + while cpuidx < ncpu: # calculate offset in mmap file to start from idx = cpuidx * slen @@ -261,6 +287,7 @@ samples = [] doms = [] + dom_in_use = [] # read in data for i in range(0, NSAMPLES): @@ -279,6 +306,8 @@ # dom_in_use.append(in_use) dom_in_use.append(dom[8]) idx += len +# print "dom_in_use(cpu=%d): " % cpuidx, dom_in_use + len = struct.calcsize("4i") oldncpu = ncpu @@ -294,6 +323,8 @@ # stop examining mmap data and start displaying stuff if cpuidx == cpu: break + + cpuidx = cpuidx + 1 # calculate starting and ending datapoints; never look at "next" since # it represents live data that may be in transition. @@ -312,12 +343,15 @@ row = 0 display(stdscr, row, 1, "CPU = %d" % cpu, _c.A_STANDOUT) - display(stdscr, row, 10, "%sLast 10 seconds%sLast 1 second" % (6*'' '', 30*'' ''), _c.A_BOLD) + display(stdscr, row, 10, "%sLast 10 seconds (%3.2f%%)%sLast 1 second (%3.2f%%)" % (6*'' '', cpu_10sec_usage, 30*'' '', cpu_1sec_usage), _c.A_BOLD) row +=1 display(stdscr, row, 1, "%s" % ((maxx-2)*''='')) total_h1_cpu = 0 total_h2_cpu = 0 + + cpu_1sec_usage = 0.0 + cpu_10sec_usage = 0.0 for dom in range(0, NDOMAINS): if not dom_in_use[dom]: @@ -332,92 +366,102 @@ display(stdscr, row, col, "%s" % time_scale(h2[dom][0][0])) col += 12 display(stdscr, row, col, "%3.2f%%" % h2[dom][0][1]) + if dom != NDOMAINS - 1: + cpu_10sec_usage += h2[dom][0][1] col += 12 display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][0][2])) col += 18 display(stdscr, row, col, "%s" % time_scale(h1[dom][0][0])) col += 12 - display(stdscr, row, col, "%3.2f%%" % h1[dom][0][1]) + display(stdscr, row, col, "%3.2f%%" % h1[dom][0][1], _c.A_STANDOUT) col += 12 display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][0][2])) col += 18 display(stdscr, row, col, "Gotten") + + if dom != NDOMAINS - 1: + cpu_1sec_usage = cpu_1sec_usage + h1[dom][0][1] # display allocated - row += 1 - col = 2 - display(stdscr, row, col, "%d" % dom) - col += 28 - display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][1])) - col += 42 - display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][1])) - col += 18 - display(stdscr, row, col, "Allocated") + if options.allocated: + row += 1 + col = 2 + display(stdscr, row, col, "%d" % dom) + col += 28 + display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][1])) + col += 42 + display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][1])) + col += 18 + display(stdscr, row, col, "Allocated") # display blocked - row += 1 - col = 2 - display(stdscr, row, col, "%d" % dom) - col += 4 - display(stdscr, row, col, "%s" % time_scale(h2[dom][2][0])) - col += 12 - display(stdscr, row, col, "%3.2f%%" % h2[dom][2][1]) - col += 12 - display(stdscr, row, col, "%s/io" % time_scale(h2[dom][2][2])) - col += 18 - display(stdscr, row, col, "%s" % time_scale(h1[dom][2][0])) - col += 12 - display(stdscr, row, col, "%3.2f%%" % h1[dom][2][1]) - col += 12 - display(stdscr, row, col, "%s/io" % time_scale(h1[dom][2][2])) - col += 18 - display(stdscr, row, col, "Blocked") + if options.blocked: + row += 1 + col = 2 + display(stdscr, row, col, "%d" % dom) + col += 4 + display(stdscr, row, col, "%s" % time_scale(h2[dom][2][0])) + col += 12 + display(stdscr, row, col, "%3.2f%%" % h2[dom][2][1]) + col += 12 + display(stdscr, row, col, "%s/io" % time_scale(h2[dom][2][2])) + col += 18 + display(stdscr, row, col, "%s" % time_scale(h1[dom][2][0])) + col += 12 + display(stdscr, row, col, "%3.2f%%" % h1[dom][2][1]) + col += 12 + display(stdscr, row, col, "%s/io" % time_scale(h1[dom][2][2])) + col += 18 + display(stdscr, row, col, "Blocked") # display waited - row += 1 - col = 2 - display(stdscr, row, col, "%d" % dom) - col += 4 - display(stdscr, row, col, "%s" % time_scale(h2[dom][3][0])) - col += 12 - display(stdscr, row, col, "%3.2f%%" % h2[dom][3][1]) - col += 12 - display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][3][2])) - col += 18 - display(stdscr, row, col, "%s" % time_scale(h1[dom][3][0])) - col += 12 - display(stdscr, row, col, "%3.2f%%" % h1[dom][3][1]) - col += 12 - display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][3][2])) - col += 18 - display(stdscr, row, col, "Waited") + if options.waited: + row += 1 + col = 2 + display(stdscr, row, col, "%d" % dom) + col += 4 + display(stdscr, row, col, "%s" % time_scale(h2[dom][3][0])) + col += 12 + display(stdscr, row, col, "%3.2f%%" % h2[dom][3][1]) + col += 12 + display(stdscr, row, col, "%s/ex" % time_scale(h2[dom][3][2])) + col += 18 + display(stdscr, row, col, "%s" % time_scale(h1[dom][3][0])) + col += 12 + display(stdscr, row, col, "%3.2f%%" % h1[dom][3][1]) + col += 12 + display(stdscr, row, col, "%s/ex" % time_scale(h1[dom][3][2])) + col += 18 + display(stdscr, row, col, "Waited") # display ex count - row += 1 - col = 2 - display(stdscr, row, col, "%d" % dom) - - col += 28 - display(stdscr, row, col, "%d/s" % h2[dom][4]) - col += 42 - display(stdscr, row, col, "%d" % h1[dom][4]) - col += 18 - display(stdscr, row, col, "Execution count") + if options.excount: + row += 1 + col = 2 + display(stdscr, row, col, "%d" % dom) + + col += 28 + display(stdscr, row, col, "%d/s" % h2[dom][4]) + col += 42 + display(stdscr, row, col, "%d" % h1[dom][4]) + col += 18 + display(stdscr, row, col, "Execution count") # display io count - row += 1 - col = 2 - display(stdscr, row, col, "%d" % dom) - col += 4 - display(stdscr, row, col, "%d/s" % h2[dom][5][0]) - col += 24 - display(stdscr, row, col, "%d/ex" % h2[dom][5][1]) - col += 18 - display(stdscr, row, col, "%d" % h1[dom][5][0]) - col += 24 - display(stdscr, row, col, "%3.2f/ex" % h1[dom][5][1]) - col += 18 - display(stdscr, row, col, "I/O Count") + if options.iocount: + row += 1 + col = 2 + display(stdscr, row, col, "%d" % dom) + col += 4 + display(stdscr, row, col, "%d/s" % h2[dom][5][0]) + col += 24 + display(stdscr, row, col, "%d/ex" % h2[dom][5][1]) + col += 18 + display(stdscr, row, col, "%d" % h1[dom][5][0]) + col += 24 + display(stdscr, row, col, "%3.2f/ex" % h1[dom][5][1]) + col += 18 + display(stdscr, row, col, "I/O Count") #row += 1 #stdscr.hline(row, 1, ''-'', maxx - 2) @@ -426,6 +470,9 @@ row += 1 + star = heartbeat * ''*'' + heartbeat = 1 - heartbeat + display(stdscr, row, 1, star) display(stdscr, row, 2, TOTALS % (total_h2_cpu, total_h1_cpu)) row += 1 # display(stdscr, row, 2, @@ -515,10 +562,10 @@ outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex ex/s io(tot) io/ex\n") while options.duration == 0 or interval < (options.duration * 1000): - for cpuidx in range(0, ncpu): + cpuidx = 0 + while cpuidx < ncpu: idx = cpuidx * slen # offset needed in mmap file - samples = [] doms = [] @@ -571,6 +618,7 @@ curr = time.time() interval += (curr - last) * 1000 last = curr + cpuidx = cpuidx + 1 time.sleep(options.interval / 1000.0) for dom in range(0, NDOMAINS): @@ -601,7 +649,7 @@ start_xenbaked() if options.live: - show_livestats() + show_livestats(options.cpu) else: try: writelog() diff -r edefe24d78bd -r 2c7f48f6b48c tools/xenmon/xenbaked.c --- a/tools/xenmon/xenbaked.c Tue Apr 11 11:51:48 2006 +++ b/tools/xenmon/xenbaked.c Thu Apr 13 21:31:50 2006 @@ -7,6 +7,7 @@ * * Copyright (C) 2004 by Intel Research Cambridge * Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins + * Copyright (C) 2006 by Hewlett Packard Fort Collins * * Authors: Diwaker Gupta, diwaker.gupta@hp.com * Rob Gardner, rob.gardner@hp.com @@ -42,6 +43,8 @@ #include <xenctrl.h> #include <xen/xen.h> #include <string.h> +#include <sys/select.h> +#include <xen/linux/evtchn.h> #include "xc_private.h" typedef struct { int counter; } atomic_t; @@ -81,14 +84,13 @@ int interrupted = 0; /* gets set if we get a SIGHUP */ int rec_count = 0; +int wakeups = 0; time_t start_time; int dom0_flips = 0; _new_qos_data *new_qos; _new_qos_data **cpu_qos_data; - -#define ID(X) ((X>NDOMAINS-1)?(NDOMAINS-1):X) // array of currently running domains, indexed by cpu int *running = NULL; @@ -223,6 +225,9 @@ printf("processed %d total records in %d seconds (%ld per second)\n", rec_count, (int)run_time, rec_count/run_time); + printf("woke up %d times in %d seconds (%ld per second)\n", wakeups, + (int) run_time, wakeups/run_time); + check_gotten_sum(); } @@ -243,6 +248,112 @@ stat_map[0].event_count++; // other } +#define EVTCHN_DEV_NAME "/dev/xen/evtchn" +#define EVTCHN_DEV_MAJOR 10 +#define EVTCHN_DEV_MINOR 201 + +int virq_port; +int eventchn_fd = -1; + +/* Returns the event channel handle. */ +/* Stolen from xenstore code */ +int eventchn_init(void) +{ + struct stat st; + struct ioctl_evtchn_bind_virq bind; + int rc; + + // to revert to old way: + if (0) + return -1; + + /* Make sure any existing device file links to correct device. */ + if ((lstat(EVTCHN_DEV_NAME, &st) != 0) || !S_ISCHR(st.st_mode) || + (st.st_rdev != makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR))) + (void)unlink(EVTCHN_DEV_NAME); + + reopen: + eventchn_fd = open(EVTCHN_DEV_NAME, O_NONBLOCK|O_RDWR); + if (eventchn_fd == -1) { + if ((errno == ENOENT) && + ((mkdir("/dev/xen", 0755) == 0) || (errno == EEXIST)) && + (mknod(EVTCHN_DEV_NAME, S_IFCHR|0600, + makedev(EVTCHN_DEV_MAJOR, EVTCHN_DEV_MINOR)) == 0)) + goto reopen; + return -errno; + } + + if (eventchn_fd < 0) + perror("Failed to open evtchn device"); + + bind.virq = VIRQ_TBUF; + rc = ioctl(eventchn_fd, IOCTL_EVTCHN_BIND_VIRQ, &bind); + if (rc == -1) + perror("Failed to bind to domain exception virq port"); + virq_port = rc; + + return eventchn_fd; +} + +void wait_for_event(void) +{ + int ret; + fd_set inset; + evtchn_port_t port; + struct timeval tv; + + if (eventchn_fd < 0) { + nanosleep(&opts.poll_sleep, NULL); + return; + } + + FD_ZERO(&inset); + FD_SET(eventchn_fd, &inset); + tv.tv_sec = 1; + tv.tv_usec = 0; + // tv = millis_to_timespec(&opts.poll_sleep); + ret = select(eventchn_fd+1, &inset, NULL, NULL, &tv); + + if ( (ret == 1) && FD_ISSET(eventchn_fd, &inset)) { + if (read(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + perror("Failed to read from event fd"); + + // if (port == virq_port) + // printf("got the event I was looking for\r\n"); + + if (write(eventchn_fd, &port, sizeof(port)) != sizeof(port)) + perror("Failed to write to event fd"); + } +} + +void enable_tracing_or_die(int xc_handle) +{ + int enable = 1; + int tbsize = DEFAULT_TBUF_SIZE; + + if (xc_tbuf_enable(xc_handle, enable) != 0) { + if (xc_tbuf_set_size(xc_handle, tbsize) != 0) { + perror("set_size Hypercall failure"); + exit(1); + } + printf("Set default trace buffer allocation (%d pages)\n", tbsize); + if (xc_tbuf_enable(xc_handle, enable) != 0) { + perror("Could not enable trace buffers\n"); + exit(1); + } + } + else + printf("Tracing enabled\n"); +} + +void disable_tracing(void) +{ + int enable = 0; + int xc_handle = xc_interface_open(); + + xc_tbuf_enable(xc_handle, enable); + xc_interface_close(xc_handle); +} /** @@ -258,6 +369,17 @@ int ret; dom0_op_t op; /* dom0 op we''ll build */ int xc_handle = xc_interface_open(); /* for accessing control interface */ + unsigned int tbsize; + + enable_tracing_or_die(xc_handle); + + if (xc_tbuf_get_size(xc_handle, &tbsize) != 0) { + perror("Failure to get tbuf info from Xen. Guess size is 0?"); + exit(1); + } + else + printf("Current tbuf size: 0x%x\n", tbsize); + op.cmd = DOM0_TBUFCONTROL; op.interface_version = DOM0_INTERFACE_VERSION; @@ -448,6 +570,11 @@ meta = init_bufs_ptrs (tbufs_mapped, num, size); data = init_rec_ptrs(meta, num); + // Set up event channel for select() + if (eventchn_init() < 0) { + fprintf(stderr, "Failed to initialize event channel; Using POLL method\r\n"); + } + /* now, scan buffers for events */ while ( !interrupted ) { @@ -460,7 +587,8 @@ meta[i]->cons++; } - nanosleep(&opts.poll_sleep, NULL); + wait_for_event(); + wakeups++; } /* cleanup */ @@ -640,6 +768,7 @@ dump_stats(); msync(new_qos, sizeof(_new_qos_data), MS_SYNC); + disable_tracing(); return ret; } @@ -737,7 +866,9 @@ start = new_qos->domain_info[id].start_time; if (start > now) { // wrapped around run_time = now + (~0ULL - start); - printf("warning: start > now\n"); + // this could happen if there is nothing going on within a cpu; + // in this case the idle domain would run forever + // printf("warning: start > now\n"); } else run_time = now - start; @@ -746,11 +877,11 @@ new_qos->domain_info[id].ns_oncpu_since_boot += run_time; new_qos->domain_info[id].start_time = now; new_qos->domain_info[id].ns_since_boot += time_since_update; -#if 1 + new_qos->qdata[n].ns_gotten[id] += run_time; - if (domid == 0 && cpu == 1) - printf("adding run time for dom0 on cpu1\r\n"); -#endif + // if (domid == 0 && cpu == 1) + // printf("adding run time for dom0 on cpu1\r\n"); + } new_qos->domain_info[id].runnable_at_last_update = domain_runnable(domid); @@ -916,13 +1047,13 @@ { int id = ID(domid); + qos_update_thread_stats(cpu, domid, now); + if (domain_runnable(id)) // double call? return; new_qos->domain_info[id].runnable = 1; update_blocked_time(domid, now); - qos_update_thread_stats(cpu, domid, now); - new_qos->domain_info[id].blocked_start_time = 0; /* invalidate */ new_qos->domain_info[id].runnable_start_time = now; // runnable_start_time[id] = now; @@ -951,7 +1082,7 @@ if (domid == IDLE_DOMAIN_ID) domid = NDOMAINS-1; if (domid < 0 || domid >= NDOMAINS) { - printf("bad domain id: %d\n", domid); + printf("bad domain id: %d\r\n", domid); return 0; } if (new_qos->domain_info[domid].in_use == 0) diff -r edefe24d78bd -r 2c7f48f6b48c tools/xenmon/README --- a/tools/xenmon/README Tue Apr 11 11:51:48 2006 +++ b/tools/xenmon/README Thu Apr 13 21:31:50 2006 @@ -84,6 +84,16 @@ events cause a trace record to be emitted. - To exit xenmon, type ''q'' - To cycle the display to other physical cpu''s, type ''c'' + - The first time xenmon is run, it attempts to allocate xen trace buffers + using a default size. If you wish to use a non-default value for the + trace buffer size, run the ''setsize'' program (located in tools/xentrace) + and specify the number of memory pages as a parameter. The default is 20. + - Not well tested with domains using more than 1 virtual cpu + - If you create a lot of domains, or repeatedly kill a domain and restart it, + and the domain id''s get to be bigger than NDOMAINS, then xenmon behaves badly. + This is a bug that is due to xenbaked''s treatment of domain id''s vs. domain + indices in a data array. Will be fixed in a future release; Workaround: + Increase NDOMAINS in xenbaked and rebuild. Future Work ----------- _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel