Dan Magenheimer
2009-Dec-09  16:31 UTC
[Xen-devel] [PATCH] tscmode: documentation (in C code) for pvrdtscp algorithm
This code describes the pvrdtscp algorithm and the new
ABI between Xen and applications.
Keir, I'm not sure where to put this... though it is a
C program, it is essentially documentation.  So, I've
just attached it as a file, not as a patch.
Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
/* pvrdtscp algorithm
 *
 * This sample code demonstrates the use of the paravirtualized rdtscp
 * algorithm.  Using this algorithm, an application may communicate with
 * the Xen hypervisor (version 4.0+) to obtain timestamp information which
 * is both monotonically increasing and has a fixed 1 GHz rate, even across
 * migrations between machines with different TSC rates and offsets.
 * Further, the algorithm provides performance near the performance of a
 * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
 * the application is running on a machine on which the rdtscp instruction
 * is supported and TSC is "safe". The application must also be running in a
 * PV domain.  (HVM domains may be supported at a later time.) On machines
 * where TSC is unsafe or the rdtscp instruction is not supported, Xen
 * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
 * algorithm, thus providing support for the algorithm for live migration
 * across all machines.
 *
 * More information can be found within the Xen (4.0+) source tree at
 *  docs/misc/tscmode.txt
 *
 * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
 * All rights reserved
 * Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
 * 
 * This code is derived from code licensed under the GNU
 * General Public License ("GPL") version 2 and is therefore itself
 * also licensed under the GPL version 2.
 *
 * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
 * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
 * algorithm and its ABI to Xen version 4.0+ 
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/*
 * Fixed-width integer aliases and word-size selection.
 *
 * __LP64__ is predefined by the compiler on targets where long and pointers
 * are 64 bits wide; every other target takes the 32-bit branch.  The
 * __X86_64__ / __X86_32__ macros defined here select the matching inline
 * assembly in scale_delta().
 */
#ifdef __LP64__
#define __X86_64__
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long u64;
typedef int i32;
typedef long i64;
#define NSEC_PER_SEC 1000000000
#else
#define __X86_32__
typedef unsigned short u16;	/* 16 bits wide, same as the 64-bit branch */
typedef unsigned long u32;
typedef unsigned long long u64;
typedef long i32;
typedef long long i64;
#define NSEC_PER_SEC 1000000000L
#endif
/*
 * Execute a native CPUID instruction.
 *
 * idx is loaded into EAX (the leaf) and sub into ECX (the sub-leaf) before
 * the instruction runs; the resulting EAX/EBX/ECX/EDX values are stored
 * through the four output pointers.
 *
 * Inputs and results are staged in locals so that a caller may pass the
 * same address for several outputs without the leaf/sub-leaf being
 * overwritten before the instruction executes.  The asm is volatile, like
 * the one in pv_cpuid(), so identical queries are never merged or dropped
 * by the compiler.
 */
static inline void hvm_cpuid(u32 idx, u32 sub,
				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
	u32 a = idx, b, c = sub, d;

	__asm__ __volatile__ ("cpuid"
	    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
	    : "0" (a), "2" (c));
	*eax = a;
	*ebx = b;
	*ecx = c;
	*edx = d;
}
/*
 * Execute CPUID preceded by the byte sequence "ud2a ; .ascii "xen"".
 *
 * NOTE(review): this prefix is presumably the one Xen recognises in order
 * to emulate CPUID for a PV guest; where it is not recognised the ud2a
 * raises an invalid-opcode fault, which is why main() first tries this
 * call in a forked child.
 *
 * idx is loaded into EAX (the leaf) and sub into ECX (the sub-leaf); the
 * resulting EAX/EBX/ECX/EDX values are stored through the output pointers.
 *
 * Inputs and results are staged in locals so that a caller passing the
 * same address for several outputs (as main() does with a single dummy)
 * still queries the requested leaf instead of having idx overwritten by
 * sub before the instruction executes.
 */
static inline void pv_cpuid(u32 idx, u32 sub,
				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
	u32 a = idx, b, c = sub, d;

	__asm__ __volatile__ ("ud2a ; .ascii \"xen\"; cpuid"
	    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
	    : "0" (a), "2" (c));
	*eax = a;
	*ebx = b;
	*ecx = c;
	*edx = d;
}
/*
 * Execute RDTSCP and return the 64-bit counter value it produces.
 *
 * The instruction is emitted as its raw opcode bytes (0x0f 0x01 0xf9),
 * presumably so that assemblers lacking the mnemonic can still build this.
 * EAX and EDX supply the low and high halves of the result; the value the
 * instruction leaves in ECX is stored through aux.
 */
static inline u64 do_rdtscp(u32 *aux)
{
	u32 lo32, hi32;

	__asm__ __volatile__ (".byte 0x0f,0x01,0xf9"
	    : "=a" (lo32), "=d" (hi32), "=c" (*aux));
	return lo32 | ((u64)hi32 << 32);
}
/*
 * Return the EBX result of pv_cpuid() leaf 0x40000003, sub-leaf 0.
 * main() compares this value against 3 to decide whether the domain is
 * in pvrdtscp mode.
 */
static inline int get_xen_tsc_mode(void)
{
	u32 eax, ebx, ecx, edx;

	pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
	return ebx;
}
/*
 * Return bit 0 of the EAX result of pv_cpuid() leaf 0x40000003,
 * sub-leaf 0 (reported by main() as "vtsc").
 */
static inline int get_xen_vtsc(void)
{
	u32 eax, ebx, ecx, edx;

	pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
	return eax & 1;
}
/*
 * Return the ECX result of pv_cpuid() leaf 0x40000003, sub-leaf 0
 * (reported by main() as "vtsc_khz").
 */
static inline int get_xen_vtsc_khz(void)
{
	u32 eax, ebx, ecx, edx;

	pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
	return ecx;
}
/*
 * Return the EAX result of pv_cpuid() leaf 0x40000003, sub-leaf 2
 * (reported by main() as "phys cpu_khz").
 */
static inline u32 get_xen_cpu_khz(void)
{
	u32 eax, ebx, ecx, edx;

	pv_cpuid(0x40000003, 2, &eax, &ebx, &ecx, &edx);
	return eax;
}
/*
 * Return the EDX result of pv_cpuid() leaf 0x40000003, sub-leaf 0
 * (reported by main() as "incarn").
 */
static inline u32 get_xen_incarnation(void)
{
	u32 eax, ebx, ecx, edx;

	pv_cpuid(0x40000003, 0, &eax, &ebx, &ecx, &edx);
	return edx;
}
/*
 * Fetch the values used to convert a raw TSC reading into a timestamp.
 *
 * Issues pv_cpuid() leaf 0x40000003, sub-leaf 1, and distributes the
 * results as follows:
 *   EAX, EBX -> low and high 32 bits of *offset
 *   ECX      -> *mul_frac
 *   EDX      -> *shift
 *
 * get_pvrdtscp_timestamp() feeds mul_frac and shift to scale_delta() and
 * subtracts offset from the scaled result.
 */
static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
{
	u32 off_lo, off_hi;

	pv_cpuid(0x40000003, 1, &off_lo, &off_hi, mul_frac, shift);
	*offset = off_lo | ((u64)off_hi << 32);
}
/*
 * Scale a 64-bit value by a shift and a 32.32 fixed-point style multiplier.
 *
 * delta is first shifted left by tsc_shift (or right by -tsc_shift when
 * tsc_shift is negative), then multiplied by tsc_mul_frac; the function
 * returns bits 32..95 of that product, i.e. (delta * tsc_mul_frac) >> 32.
 *
 * NOTE(review): the shift count is not range-checked; a magnitude of 64 or
 * more is undefined behaviour in C, so callers must supply sane values.
 */
static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
{
    u64 product;
#ifdef __X86_32__
    u32 tmp1, tmp2;
#else
    u64 rdx_out;    /* RDX is overwritten by mul/shrd, so it must be an output */
#endif

    if ( tsc_shift < 0 )
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

#ifdef __X86_32__
    /*
     * Two 32x32->64 multiplies: low(delta)*mul first, whose high word is
     * kept in %4, then high(delta)*mul, to which that high word is added
     * with carry (%5 is zeroed to serve as the adc operand).
     */
    __asm__ (
        "mul  %5       ; "
        "mov  %4,%%eax ; "
        "mov  %%edx,%4 ; "
        "mul  %5       ; "
        "xor  %5,%5    ; "
        "add  %4,%%eax ; "
        "adc  %5,%%edx ; "
        : "=A" (product), "=r" (tmp1), "=r" (tmp2)
        : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) );
#else
    /*
     * RDX:RAX = RAX * RDX, then shift the 128-bit product right by 32 into
     * RAX.  RDX is tied to an output operand because both instructions
     * modify it; an input-only operand must not be changed by the asm.
     */
    __asm__ (
        "mul %%rdx ; shrd $32,%%rdx,%%rax"
        : "=a" (product), "=d" (rdx_out)
        : "0" (delta), "1" ((u64)tsc_mul_frac) );
    (void)rdx_out;
#endif
    return product;
}
/*
 * Return the current pvrdtscp timestamp.
 *
 * The raw counter from do_rdtscp() is scaled with scale_delta() using the
 * multiplier/shift from get_xen_time_values(), and the offset from the same
 * call is subtracted.  The aux value returned alongside the counter is
 * compared with the one seen previously; whenever it differs, the time
 * values are re-read and the counter is sampled again until two consecutive
 * aux values agree.
 *
 * *discontinuity is set to 1 if the aux value changed during this call and
 * to 0 otherwise.
 *
 * The returned value is forced to be strictly greater than the previous
 * one: if the new timestamp is not ahead, a message is printed and the
 * previous timestamp plus one is returned instead.
 *
 * All state lives in function-local statics, so the function is not
 * reentrant.
 */
static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
{
	static int need_init = 1;
	static u64 prev_ts = 0;
	static u32 prev_aux;
	static u64 ns_offset;
	static u32 mul_frac, shift;
	u32 aux;
	u64 tsc, ts;

	if (need_init) {
		/* prime the cached aux value, time values and timestamp */
		tsc = do_rdtscp(&prev_aux);
		get_xen_time_values(&ns_offset, &mul_frac, &shift);
		prev_ts = scale_delta(tsc, mul_frac, shift) - ns_offset;
		need_init = 0;
	}

	*discontinuity = 0;
	for (;;) {
		tsc = do_rdtscp(&aux);
		if (aux == prev_aux)
			break;
		/* aux changed: refresh the time values and sample again */
		prev_aux = aux;
		get_xen_time_values(&ns_offset, &mul_frac, &shift);
		*discontinuity = 1;
	}

	/* convert the counter to a timestamp with the current time values */
	ts = scale_delta(tsc, mul_frac, shift) - ns_offset;

	if ((i64)(ts - prev_ts) <= 0) {
		/* not expected to happen; guards against scaling errors
		 * around a very fast migration */
		printf("Time went backwards by %lluns\n",
		    (unsigned long long)(prev_ts - ts));
		ts = ++prev_ts;
		return ts;
	}

	prev_ts = ts;
	return ts;
}
/* Values for the hvm argument of running_on_xen() */
#define HVM 1
#define PVM 0
/*
 * Probe CPUID for the Xen signature.
 *
 * hvm selects how CPUID is issued: HVM uses hvm_cpuid() (plain CPUID),
 * anything else uses pv_cpuid().  Leaves 0x40000000, 0x40000100, ... up to
 * but not including 0x40010000 are scanned; a leaf matches when
 * EBX/ECX/EDX spell "XenVMMXenVMM" and EAX (taken as the highest leaf
 * number available) is at least base+2.
 *
 * On a match, leaf base+1 is read and its EAX is split into
 * *version_major (upper 16 bits) and *version_minor (lower 16 bits).
 *
 * Returns 1 if the signature was found, 0 otherwise; the version outputs
 * are written only when 1 is returned.
 */
static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
{
	u32 eax, ebx, ecx, edx, base;
	union { char csig[16]; u32 u[4]; } sig;

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		if (hvm == HVM)
			hvm_cpuid(base, 0, &eax, &ebx, &ecx, &edx);
		else
			pv_cpuid(base, 0, &eax, &ebx, &ecx, &edx);

		/* assemble the 12 signature bytes and terminate them */
		sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
		sig.csig[12] = '\0';
		if (strcmp("XenVMMXenVMM", &sig.csig[0]) != 0 ||
		    eax < (base + 2))
			continue;

		if (hvm == HVM)
			hvm_cpuid(base + 1, 0, &eax, &ebx, &ecx, &edx);
		else
			pv_cpuid(base + 1, 0, &eax, &ebx, &ecx, &edx);
		*version_major = (eax >> 16) & 0xffff;
		*version_minor = eax & 0xffff;
		return 1;
	}
	return 0;
}
main(int ac, char **av)
{
	u32 dummy;
	u16 version_hi, version_lo;
	u64 ts, last_ts;
	int status, discontinuity = 0;
	pid_t pid;
	if (running_on_xen(HVM,&version_hi,&version_lo)) {
		printf("running on Xen v%d.%d as an HVM domain, "
			"pvrdtsc not supported, exiting\n",
			(int)version_hi, (int)version_lo);
		exit(0);
	}
	pid = fork();
	if (pid == -1) {
		fprintf(stderr,"Huh? Fork failed\n");
		return 0;
	}
	else if (pid == 0) { /* child */
		pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
		exit(0);
	}
	waitpid(pid,&status,0);
	if (!WIFEXITED(status))
		exit(0);
	if (!running_on_xen(PVM,&version_hi,&version_lo)) {
		printf("not running on Xen, exiting\n");
		exit(0);
	}
	printf("running on Xen v%d.%d as a PV domain\n",
		(int)version_hi, (int)version_lo);
	if ( version_hi <= 3 ) {
		printf("pvrdtscp requires Xen version 4.0 or greater\n");
		/* exit(0); FIXME after xen-unstable is officially v4.0 */
	}
	if ( get_xen_tsc_mode() != 3 )
		printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
	/* OK, we are on Xen, now loop forever checking timestamps */
	ts = get_pvrdtscp_timestamp(&discontinuity);
	printf("Starting with ts=%lluns 0x%llx
(%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
	printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
				(unsigned long)get_xen_incarnation(),
				(unsigned long)get_xen_vtsc(),
				(unsigned long)get_xen_vtsc_khz(),
				(unsigned long)get_xen_cpu_khz());
	ts = get_pvrdtscp_timestamp(&discontinuity);
	last_ts = ts;
	while (1) {
		ts = get_pvrdtscp_timestamp(&discontinuity);
		if (discontinuity)
			printf("migrated/restored, incarn=%d: "
                               "vtsc now %d, vtsc_khz=%lu, phys
cpu_khz=%lu\n",
				(unsigned long)get_xen_incarnation(),
				(unsigned long)get_xen_vtsc(),
				(unsigned long)get_xen_vtsc_khz(),
				(unsigned long)get_xen_cpu_khz());
		if (ts < last_ts)
			/* this should NEVER happen, especially since there
			 * is a check for it in get_pvrdtscp_timestamp() */
			printf("Time went backwards: %lluns (%llusec)\n",
				last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
		if (ts > last_ts + 200000000LL)
			/* this is OK, usually about 2sec for save/restore
			 * and a fraction of a second for live migrate */
			printf("Time jumped forward %lluns (%llusec)\n",
				ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
		last_ts = ts;
	}
}
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel