Remove #defines, add enum for PARAVIRT_LAZY_FLUSH. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- include/asm-i386/paravirt.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -30,6 +30,7 @@ enum paravirt_lazy_mode { PARAVIRT_LAZY_NONE = 0, PARAVIRT_LAZY_MMU = 1, PARAVIRT_LAZY_CPU = 2, + PARAVIRT_LAZY_FLUSH = 3, }; struct paravirt_ops @@ -906,12 +907,6 @@ static inline void set_pmd(pmd_t *pmdp, } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -#define PARAVIRT_LAZY_NONE 0 -#define PARAVIRT_LAZY_MMU 1 -#define PARAVIRT_LAZY_CPU 2 -#define PARAVIRT_LAZY_FLUSH 3 - #define __HAVE_ARCH_ENTER_LAZY_CPU_MODE static inline void arch_enter_lazy_cpu_mode(void) { --
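For context, here is a minimal sketch (not part of the patch) of how the lazy-mode constants are consumed once they live in the enum; the helper names follow the arch_enter_lazy_cpu_mode() stub visible in the hunk above, and the call into the backend may be spelled directly as below or through the PVOP_VCALL wrappers used elsewhere in this header.

/* Illustrative sketch only: the enum values reach the backend through the
 * set_lazy_mode hook. */
static inline void arch_enter_lazy_mmu_mode(void)
{
	paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_MMU);
}

static inline void arch_leave_lazy_mmu_mode(void)
{
	paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_NONE);
}

static inline void arch_flush_lazy_mmu_mode(void)
{
	paravirt_ops.set_lazy_mode(PARAVIRT_LAZY_FLUSH);
}

A side benefit of the enum over the #defines: a backend that switches over enum paravirt_lazy_mode gets compiler coverage checking for any mode it forgets to handle.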
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 19/28] Don't implement native_kmap_atomic_pte for !HIGHPTE
Don't implement native_kmap_atomic_pte for !HIGHPTE case; it is never needed, never called, and leaving it in is just plain confusing. Making it isolated to the config where it is used may help find bugs. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> Acked-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/paravirt.c | 4 +--- include/asm-i386/highmem.h | 5 ----- include/asm-i386/paravirt.h | 4 ++++ 3 files changed, 5 insertions(+), 8 deletions(-) ==================================================================--- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -318,9 +318,7 @@ struct paravirt_ops paravirt_ops = { .ptep_get_and_clear = native_ptep_get_and_clear, #ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = native_kmap_atomic_pte, -#else - .kmap_atomic_pte = paravirt_nop, + .kmap_atomic_pte = kmap_atomic, #endif #ifdef CONFIG_X86_PAE ==================================================================--- a/include/asm-i386/highmem.h +++ b/include/asm-i386/highmem.h @@ -74,11 +74,6 @@ void *kmap_atomic_pfn(unsigned long pfn, void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); struct page *kmap_atomic_to_page(void *ptr); -static inline void *native_kmap_atomic_pte(struct page *page, enum km_type type) -{ - return kmap_atomic(page, type); -} - #ifndef CONFIG_PARAVIRT #define kmap_atomic_pte(page, type) kmap_atomic(page, type) #endif ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -190,7 +190,9 @@ struct paravirt_ops pte_t (*ptep_get_and_clear)(pte_t *ptep); +#ifdef CONFIG_HIGHPTE void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif #ifdef CONFIG_X86_PAE void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); @@ -759,12 +761,14 @@ static inline void paravirt_release_pd(u PVOP_VCALL1(release_pd, pfn); } +#ifdef CONFIG_HIGHPTE static inline void *kmap_atomic_pte(struct page *page, enum km_type type) { unsigned long ret; ret = PVOP_CALL2(unsigned long, kmap_atomic_pte, page, type); return (void *)ret; } +#endif static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) --
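The hook only has users when pte pages can live in highmem. Roughly how pte_offset_map() consumes it under the i386 pgtable.h of this period (a sketch for illustration, not part of the patch):

#ifdef CONFIG_HIGHPTE
/* pte page may be in highmem: take a temporary atomic mapping, which a
 * hypervisor backend can interpose on via the kmap_atomic_pte op */
#define pte_offset_map(dir, address) \
	((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + pte_index(address))
#else
/* pte page is always lowmem: its kernel-virtual address already exists */
#define pte_offset_map(dir, address) \
	((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address))
#endif

That is why the op can be compiled out entirely for !HIGHPTE instead of being stubbed with paravirt_nop.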
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 07/28] cleanups to help using per-cpu variables from asm
This patch does a few small cleanups: - use PER_CPU_NAME to generate the names of per-cpu variables - use lea to add the per_cpu offset in PER_CPU(), because it doesn't affect condition flags - add PER_CPU_VAR which allows direct access to per-cpu variables with the %fs: prefix on SMP. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Andi Kleen <ak@suse.de> --- include/asm-i386/percpu.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) ==================================================================--- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -16,12 +16,14 @@ * PER_CPU(cpu_gdt_descr, %ebx) */ #ifdef CONFIG_SMP +#define PER_CPU(var, reg) \ + movl %fs:per_cpu__##this_cpu_off, reg; \ + lea per_cpu__##var(reg), reg +#define PER_CPU_VAR(var) %fs:per_cpu__##var +#else /* ! SMP */ #define PER_CPU(var, reg) \ - movl %fs:per_cpu__this_cpu_off, reg; \ - addl $per_cpu__##var, reg -#else /* ! SMP */ -#define PER_CPU(var, reg) \ - movl $per_cpu__##var, reg; + movl $per_cpu__##var, reg +#define PER_CPU_VAR(var) per_cpu__##var #endif /* SMP */ #else /* ...!ASSEMBLY */ --
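To make the new macros concrete, here is what the SMP variants expand to for a hypothetical per-cpu variable foo, and the same %fs-relative access written from C with inline asm; the names are illustrative only and assume %fs has been set up as the per-cpu segment.

/*
 * PER_CPU(foo, %ebx)  ->  movl %fs:per_cpu__this_cpu_off, %ebx
 *                         lea  per_cpu__foo(%ebx), %ebx    (absolute pointer)
 * PER_CPU_VAR(foo)    ->  %fs:per_cpu__foo                 (direct access)
 */
static inline unsigned long read_this_cpu_foo(void)
{
	unsigned long val;

	/* sketch only: per_cpu__foo is a hypothetical symbol */
	asm("movl %%fs:per_cpu__foo, %0" : "=r" (val));
	return val;
}

The switch from addl to lea matters because lea computes the sum without touching the condition flags, so PER_CPU() can be dropped into asm sequences that have live flags.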
Define per_cpu_offset in asm-i386/percpu.h when SMP defined, like asm-generic/percpu.h does for UP. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Andi Kleen <ak@suse.de> --- include/asm-i386/percpu.h | 2 ++ 1 file changed, 2 insertions(+) ==================================================================--- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -34,6 +34,8 @@ /* This is used for other cpus to find our section. */ extern unsigned long __per_cpu_offset[]; + +#define per_cpu_offset(x) (__per_cpu_offset[x]) /* Separate out the type, so (int[3], foo) works. */ #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 03/28] fix allow-percpu-variables-to-be-page-aligned.patch
Make sure allocation is page-aligned. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ==================================================================--- a/init/main.c +++ b/init/main.c @@ -370,7 +370,7 @@ static void __init setup_per_cpu_areas(v /* Copy section for each CPU (we discard the original) */ size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem(size * nr_possible_cpus); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; --
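alloc_bootmem() only guarantees cache-line alignment, while each CPU's copy is laid out at base + cpu * size with size already rounded to PAGE_SIZE, so a page-aligned object inside the per-cpu section stays page aligned only if the base itself is. A standalone sketch of that arithmetic (hypothetical numbers, userspace C, not kernel code):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long size = ALIGN(30000, PAGE_SIZE);	/* stand-in for PERCPU_ENOUGH_ROOM */
	unsigned long aligned_base = 0x100000;	/* page aligned, as alloc_bootmem_pages() gives */
	unsigned long sloppy_base  = 0x100020;	/* cache aligned only, as alloc_bootmem() may give */
	int cpu;

	for (cpu = 0; cpu < 2; cpu++)
		printf("cpu%d: offset into page %lu (aligned base) vs %lu (sloppy base)\n",
		       cpu,
		       (aligned_base + cpu * size) % PAGE_SIZE,
		       (sloppy_base + cpu * size) % PAGE_SIZE);
	return 0;
}

With a page-aligned base every per-cpu copy lands on a page boundary; with the sloppy base every copy is off by the same few bytes, breaking anything (such as the per-cpu GDT page later in this series) that relies on page alignment.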
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 13/28] paravirt: flush lazy mmu updates on kunmap_atomic
kunmap_atomic should flush any pending lazy mmu updates, mainly to be consistent with kmap_atomic, and to preserve its normal behaviour. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/mm/highmem.c | 1 + 1 file changed, 1 insertion(+) ==================================================================--- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -72,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km #endif } + arch_flush_lazy_mmu_mode(); pagefault_enable(); } --
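For context, the kind of lazy-MMU batching window that makes the flush necessary: inside the window a hypervisor backend may merely queue pte updates, including the one kunmap_atomic() itself performs to tear down the temporary mapping. A sketch with hypothetical names, not part of the patch:

static void example_remap_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, unsigned long pfn,
				pte_t *pte, pgprot_t prot)
{
	unsigned long addr;

	arch_enter_lazy_mmu_mode();
	for (addr = start; addr < end; addr += PAGE_SIZE, pte++, pfn++)
		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));	/* may only be queued */
	arch_leave_lazy_mmu_mode();	/* queued updates become visible here */
}

kmap_atomic() already ends with such a flush, which is the consistency the changelog refers to; flushing in kunmap_atomic() too guarantees the slot's old mapping is really gone before the fixmap slot can be reused.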
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 00/28] Updates for firstfloor paravirt-ops patches
Hi Andi,

This is a set of updates for the firstfloor patch queue. Quick rundown:

revert-mm-x86_64-mm-account-for-module-percpu-space-separately-from-kernel-percpu.patch
separate-module-percpu-space.patch
	Update the module percpu accounting patch.

fix-ff-allow-percpu-variables-to-be-page-aligned.patch
	Make sure the percpu memory allocation is page-aligned.

deflate-stack-usage-in-lib_inflate_c.patch
	Fix deflate stack usage, with all the arch fixes rolled in.

i386-gdt-cleanups-page-align-the-gdt.patch
i386-convert-pda-into-the-percpu-section.patch
i386-cleanups-to-help-using-per-cpu-variables-from-asm.patch
percpu-define-per_cpu_offset.patch
fix-uniproc-gdt-bugs.patch
	Percpu and GDT fixes.

x86-map-enough-initial-memory.patch
	Fix head.S to map enough memory.

cleanup-cleanup-asm-bugs_h.patch
cleanup-identify_cpu-fix.patch
	This is the pair I tried to post yesterday, but they were interrupted
	by a network outage. They basically add a little more cleanup and move
	a misplaced hunk. cleanup-cleanup-asm-bugs_h.patch should go after/roll
	into the clean-up-asm-(i386|x86_64)-bugs_h patches, and
	cleanup-identify_cpu-fix.patch should go after/roll into the
	clean-up-identify_cpu patch.

paravirt-flush-on-kunmap_atomic.patch
	Flush pending lazy mmu operations on kunmap_atomic too.

paravirt-fix-paravirt_lazy.patch
	Fix up an apparent mismerge: remove the #defines for PARAVIRT_LAZY_*
	and add _LAZY_FLUSH to the enum.

i386-sysenter-arch-pages-fix.patch
i386-acpi-remove-earlyquirk-warning.patch
i386-mcheck-p4-grotesque-and-needless-warning-fix.patch
i386-pgd-clone-under-lock-fix.patch
paravirt-kmap_atomic_pte-tidy.patch
vmi-supports-compat-vdso.patch
vmi-kmap_atomic_pte-fix.patch
vmi-timer-update.patch
buslogic-check-range-fixes.patch
pte-drop-ptep_get_and_clear-paravirt-op.patch
	A chunk of patches from Zach.

rename-the-parainstructions-symbols-to-be-consistent-with-the-others.patch
rename-the-parainstructions-symbols-to-be-consistent-with-the-others-fix.patch
	Obvious.

vmi-fix-ff.patch
	Make VMI compile in the -ff patchstack.

paravirt-sched-clock-ff.patch
	Updated paravirt-sched-clock for your sched_clock.

Thanks,
J
--
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 04/28] deflate stack usage in lib/inflate.c
inflate_fixed and huft_build together use around 2.7k of stack. When using 4k stacks, I saw stack overflows from interrupts arriving while unpacking the root initrd: do_IRQ: stack overflow: 384 [<c0106b64>] show_trace_log_lvl+0x1a/0x30 [<c01075e6>] show_trace+0x12/0x14 [<c010763f>] dump_stack+0x16/0x18 [<c0107ca4>] do_IRQ+0x6d/0xd9 [<c010202b>] xen_evtchn_do_upcall+0x6e/0xa2 [<c0106781>] xen_hypervisor_callback+0x25/0x2c [<c010116c>] xen_restore_fl+0x27/0x29 [<c0330f63>] _spin_unlock_irqrestore+0x4a/0x50 [<c0117aab>] change_page_attr+0x577/0x584 [<c0117b45>] kernel_map_pages+0x8d/0xb4 [<c016a314>] cache_alloc_refill+0x53f/0x632 [<c016a6c2>] __kmalloc+0xc1/0x10d [<c0463d34>] malloc+0x10/0x12 [<c04641c1>] huft_build+0x2a7/0x5fa [<c04645a5>] inflate_fixed+0x91/0x136 [<c04657e2>] unpack_to_rootfs+0x5f2/0x8c1 [<c0465acf>] populate_rootfs+0x1e/0xe4 (This was under Xen, but there's no reason it couldn't happen on bare hardware.) This patch mallocs the local variables, thereby reducing the stack usage to sane levels. Also, up the heap size for the kernel decompressor to deal with the extra allocation. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Tim Yamin <plasmaroo@gentoo.org> Cc: Andi Kleen <ak@suse.de> Cc: Matt Mackall <mpm@selenic.com> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Richard Henderson <rth@twiddle.net> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> --- arch/alpha/boot/misc.c | 2 - arch/arm/boot/compressed/misc.c | 2 - arch/arm26/boot/compressed/misc.c | 2 - arch/i386/boot/compressed/misc.c | 2 - arch/x86_64/boot/compressed/misc.c | 2 - lib/inflate.c | 66 ++++++++++++++++++++++++++---------- 6 files changed, 54 insertions(+), 22 deletions(-) ==================================================================--- a/arch/alpha/boot/misc.c +++ b/arch/alpha/boot/misc.c @@ -98,7 +98,7 @@ static ulg free_mem_ptr; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../lib/inflate.c" ==================================================================--- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -239,7 +239,7 @@ static ulg free_mem_ptr; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" ==================================================================--- a/arch/arm26/boot/compressed/misc.c +++ b/arch/arm26/boot/compressed/misc.c @@ -182,7 +182,7 @@ static ulg free_mem_ptr; static ulg free_mem_ptr; static ulg free_mem_ptr_end; -#define HEAP_SIZE 0x2000 +#define HEAP_SIZE 0x3000 #include "../../../../lib/inflate.c" ==================================================================--- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -189,7 +189,7 @@ static unsigned long free_mem_ptr; static unsigned long free_mem_ptr; static unsigned long free_mem_end_ptr; -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static char *vidmem = (char *)0xb8000; static int vidport; ==================================================================--- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -189,7 +189,7 @@ static long free_mem_ptr; static long free_mem_ptr; static long free_mem_end_ptr; -#define HEAP_SIZE 0x6000 +#define HEAP_SIZE 0x7000 static char *vidmem = (char *)0xb8000; static int vidport; ==================================================================--- a/lib/inflate.c +++ b/lib/inflate.c @@ 
-292,7 +292,6 @@ STATIC int INIT huft_build( oversubscribed set of lengths), and three if not enough memory. */ { unsigned a; /* counter for codes of length k */ - unsigned c[BMAX+1]; /* bit length count table */ unsigned f; /* i repeats in table every f entries */ int g; /* maximum code length */ int h; /* table level */ @@ -303,18 +302,33 @@ STATIC int INIT huft_build( register unsigned *p; /* pointer into c[], b[], or v[] */ register struct huft *q; /* points to current table */ struct huft r; /* table entry for structure assignment */ - struct huft *u[BMAX]; /* table stack */ - unsigned v[N_MAX]; /* values in order of bit length */ register int w; /* bits before this table == (l * h) */ - unsigned x[BMAX+1]; /* bit offsets, then code stack */ unsigned *xp; /* pointer into x */ int y; /* number of dummy codes added */ unsigned z; /* number of entries in current table */ + struct { + unsigned c[BMAX+1]; /* bit length count table */ + struct huft *u[BMAX]; /* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + unsigned x[BMAX+1]; /* bit offsets, then code stack */ + } *stk; + unsigned *c, *v, *x; + struct huft **u; + int ret; DEBG("huft1 "); + stk = malloc(sizeof(*stk)); + if (stk == NULL) + return 3; /* out of memory */ + + c = stk->c; + v = stk->v; + x = stk->x; + u = stk->u; + /* Generate counts for each bit length */ - memzero(c, sizeof(c)); + memzero(stk->c, sizeof(stk->c)); p = b; i = n; do { Tracecv(*p, (stderr, (n-i >= ' ' && n-i <= '~' ? "%c %d\n" : "0x%x %d\n"), @@ -326,7 +340,8 @@ DEBG("huft1 "); { *t = (struct huft *)NULL; *m = 0; - return 2; + ret = 2; + goto out; } DEBG("huft2 "); @@ -351,10 +366,14 @@ DEBG("huft3 "); /* Adjust last length count to fill out codes, if needed */ for (y = 1 << j; j < i; j++, y <<= 1) - if ((y -= c[j]) < 0) - return 2; /* bad input: more codes than bits */ - if ((y -= c[i]) < 0) - return 2; + if ((y -= c[j]) < 0) { + ret = 2; /* bad input: more codes than bits */ + goto out; + } + if ((y -= c[i]) < 0) { + ret = 2; + goto out; + } c[i] += y; DEBG("huft4 "); @@ -428,7 +447,8 @@ DEBG1("3 "); { if (h) huft_free(u[0]); - return 3; /* not enough memory */ + ret = 3; /* not enough memory */ + goto out; } DEBG1("4 "); hufts += z + 1; /* track memory usage */ @@ -492,7 +512,11 @@ DEBG("huft7 "); DEBG("huft7 "); /* Return true (1) if we were given an incomplete table */ - return y != 0 && g != 1; + ret = y != 0 && g != 1; + + out: + free(stk); + return ret; } @@ -705,9 +729,13 @@ STATIC int noinline INIT inflate_fixed(v struct huft *td; /* distance code table */ int bl; /* lookup bits for tl */ int bd; /* lookup bits for td */ - unsigned l[288]; /* length list for huft_build */ + unsigned *l; /* length list for huft_build */ DEBG("<fix"); + + l = malloc(sizeof(*l) * 288); + if (l == NULL) + return 3; /* out of memory */ /* set up literal table */ for (i = 0; i < 144; i++) @@ -719,9 +747,10 @@ DEBG("<fix"); for (; i < 288; i++) /* make a complete, but wrong code set */ l[i] = 8; bl = 7; - if ((i = huft_build(l, 288, 257, cplens, cplext, &tl, &bl)) != 0) + if ((i = huft_build(l, 288, 257, cplens, cplext, &tl, &bl)) != 0) { + free(l); return i; - + } /* set up distance table */ for (i = 0; i < 30; i++) /* make an incomplete code set */ @@ -730,6 +759,7 @@ DEBG("<fix"); if ((i = huft_build(l, 30, 0, cpdist, cpdext, &td, &bd)) > 1) { huft_free(tl); + free(l); DEBG(">"); return i; @@ -737,11 +767,13 @@ DEBG("<fix"); /* decompress until an end-of-block code */ - if (inflate_codes(tl, td, bl, bd)) + if (inflate_codes(tl, td, bl, bd)) { + 
free(l); return 1; - + } /* free the decoding tables, return */ + free(l); huft_free(tl); huft_free(td); return 0; --
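The technique generalizes beyond inflate.c: bundle the large locals into one struct, allocate it once, and route every early return through a single out: label so the allocation cannot leak. A standalone sketch (not the actual inflate.c code):

#include <stdlib.h>
#include <string.h>

int build_count_table(const unsigned *lens, unsigned n)
{
	struct {
		unsigned count[17];	/* arrays that used to burn stack */
		unsigned offsets[17];
		unsigned values[288];
	} *stk;
	unsigned i;
	int ret;

	stk = malloc(sizeof(*stk));
	if (stk == NULL)
		return 3;		/* out of memory */

	memset(stk->count, 0, sizeof(stk->count));
	for (i = 0; i < n; i++) {
		if (lens[i] > 16) {
			ret = 2;	/* bad input */
			goto out;
		}
		stk->count[lens[i]]++;
	}
	ret = 0;
out:
	free(stk);
	return ret;
}

In the boot decompressors, malloc() is just a bump allocator over free_mem_ptr, which is why the patch also grows each arch's HEAP_SIZE to leave room for the new allocation.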
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 12/28] i386: now it's ok to use identify_boot_cpu
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ==================================================================--- a/arch/i386/kernel/cpu/bugs.c +++ b/arch/i386/kernel/cpu/bugs.c @@ -177,7 +177,7 @@ static void __init check_config(void) void __init check_bugs(void) { - identify_cpu(&boot_cpu_data); + identify_boot_cpu(); #ifndef CONFIG_SMP printk("CPU: "); print_cpu_info(&boot_cpu_data); --
The tsc-based get_scheduled_cycles interface is not a good match for Xen's runstate accounting, which reports everything in nanoseconds. This patch replaces this interface with a sched_clock interface, which matches both Xen and VMI's requirements. In order to do this, we: 1. replace get_scheduled_cycles with sched_clock 2. hoist cycles_2_ns into a common header 3. update vmi accordingly One thing to note: because sched_clock is implemented as a weak function in kernel/sched.c, we must define a real function in order to override this weak binding. This means the usual paravirt_ops technique of using an inline function won't work in this case. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Zachary Amsden <zach@vmware.com> Cc: Dan Hecht <dhecht@vmware.com> Cc: john stultz <johnstul@us.ibm.com> --- arch/i386/kernel/paravirt.c | 2 - arch/i386/kernel/sched-clock.c | 43 ++++++++++++++----------------------- arch/i386/kernel/vmi.c | 2 - arch/i386/kernel/vmiclock.c | 6 ++--- include/asm-i386/paravirt.h | 7 ++++-- include/asm-i386/timer.h | 46 +++++++++++++++++++++++++++++++++++++++- include/asm-i386/vmi_time.h | 2 - 7 files changed, 73 insertions(+), 35 deletions(-) ==================================================================--- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -268,7 +268,7 @@ struct paravirt_ops paravirt_ops = { .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .get_scheduled_cycles = native_read_tsc, + .sched_clock = native_sched_clock, .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, ==================================================================--- a/arch/i386/kernel/sched-clock.c +++ b/arch/i386/kernel/sched-clock.c @@ -35,28 +35,7 @@ * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -struct sc_data { - unsigned int cyc2ns_scale; - unsigned long long sync_tsc; - unsigned long long ns_base; - unsigned long long last_val; - unsigned long long sync_jiffies; -}; - -static DEFINE_PER_CPU(struct sc_data, sc_data); - -static inline unsigned long long cycles_2_ns(struct sc_data *sc, unsigned long long cyc) -{ - unsigned long long ns; - - cyc -= sc->sync_tsc; - ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; - ns += sc->ns_base; - - return ns; -} +DEFINE_PER_CPU(struct sc_data, sc_data); /* * Scheduler clock - returns current time in nanosec units. @@ -66,7 +45,7 @@ static inline unsigned long long cycles_ * [1] no attempt to stop CPU instruction reordering, which can hit * in a 100 instruction window or so. 
*/ -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long long r; struct sc_data *sc = &get_cpu_var(sc_data); @@ -81,8 +60,8 @@ unsigned long long sched_clock(void) sc->last_val = r; local_irq_restore(flags); } else { - get_scheduled_cycles(r); - r = cycles_2_ns(sc, r); + rdtscll(r); + r = cycles_2_ns(r); sc->last_val = r; } @@ -90,6 +69,18 @@ unsigned long long sched_clock(void) return r; } + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long sched_clock(void) + __attribute__((alias("native_sched_clock"))); +#endif /* Resync with new CPU frequency */ static void resync_sc_freq(struct sc_data *sc, unsigned int newfreq) @@ -103,7 +94,7 @@ static void resync_sc_freq(struct sc_dat because sched_clock callers should be able to tolerate small errors. */ sc->ns_base = ktime_to_ns(ktime_get()); - get_scheduled_cycles(sc->sync_tsc); + rdtscll(sc->sync_tsc); sc->cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / newfreq; } ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -887,7 +887,7 @@ static inline int __init activate_vmi(vo paravirt_ops.setup_boot_clock = vmi_time_bsp_init; paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; + paravirt_ops.sched_clock = vmi_sched_clock; paravirt_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable CMOS clock sync */ ==================================================================--- a/arch/i386/kernel/vmiclock.c +++ b/arch/i386/kernel/vmiclock.c @@ -65,9 +65,9 @@ int vmi_set_wallclock(unsigned long now) } /* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ -unsigned long long vmi_get_sched_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +unsigned long long vmi_sched_clock(void) +{ + return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } /* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -116,7 +116,7 @@ struct paravirt_ops u64 (*read_tsc)(void); u64 (*read_pmc)(void); - u64 (*get_scheduled_cycles)(void); + unsigned long long (*sched_clock)(void); unsigned long (*get_cpu_khz)(void); /* Segment descriptor handling */ @@ -573,7 +573,10 @@ static inline u64 paravirt_read_tsc(void #define rdtscll(val) (val = paravirt_read_tsc()) -#define get_scheduled_cycles(val) (val = paravirt_ops.get_scheduled_cycles()) +static inline unsigned long long paravirt_sched_clock(void) +{ + return PVOP_CALL0(unsigned long long, sched_clock); +} #define calculate_cpu_khz() (paravirt_ops.get_cpu_khz()) #define write_tsc(val1,val2) wrmsr(0x10, val1, val2) ==================================================================--- a/include/asm-i386/timer.h +++ b/include/asm-i386/timer.h @@ -15,8 +15,52 @@ extern int recalibrate_cpu_khz(void); extern int recalibrate_cpu_khz(void); #ifndef CONFIG_PARAVIRT -#define get_scheduled_cycles(val) rdtscll(val) #define calculate_cpu_khz() native_calculate_cpu_khz() #endif +/* Accellerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 
/ (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better percision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +struct sc_data { + unsigned int cyc2ns_scale; + unsigned long long sync_tsc; + unsigned long long ns_base; + unsigned long long last_val; + unsigned long long sync_jiffies; +}; + +DECLARE_PER_CPU(struct sc_data, sc_data); + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + const struct sc_data *sc = &__get_cpu_var(sc_data); + unsigned long long ns; + + cyc -= sc->sync_tsc; + ns = (cyc * sc->cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + ns += sc->ns_base; + + return ns; +} + #endif ==================================================================--- a/include/asm-i386/vmi_time.h +++ b/include/asm-i386/vmi_time.h @@ -49,7 +49,7 @@ extern void __init vmi_time_init(void); extern void __init vmi_time_init(void); extern unsigned long vmi_get_wallclock(void); extern int vmi_set_wallclock(unsigned long now); -extern unsigned long long vmi_get_sched_cycles(void); +extern unsigned long long vmi_sched_clock(void); extern unsigned long vmi_cpu_khz(void); #ifdef CONFIG_X86_LOCAL_APIC --
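A standalone arithmetic check of the cyc2ns scaling that the hoisted cycles_2_ns() relies on (hypothetical CPU frequency, userspace C, not kernel code):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as in the asm-i386/timer.h hunk above */

int main(void)
{
	unsigned long cpu_khz = 2000000;	/* hypothetical 2 GHz CPU */
	unsigned int cyc2ns_scale = (1000000UL << CYC2NS_SCALE_FACTOR) / cpu_khz;
	unsigned long long cycles = 3000000000ULL;	/* 1.5 seconds worth of cycles */
	unsigned long long ns = (cycles * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;

	/* expect scale=512 and ns=1500000000, i.e. 1.5e9 ns */
	printf("scale=%u ns=%llu\n", cyc2ns_scale, ns);
	return 0;
}

Keeping the divisor in kHz rather than MHz, as the comment in the patch notes, preserves precision while still letting cyc2ns_scale fit comfortably in 32 bits.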
Implement vmi_kmap_atomic_pte in terms of the backend set_linear_mapping operation. The conversion is rather straighforward; call kmap_atomic and then inform the hypervisor of the page mapping. The _flush_tlb damage is due to macros being pulled in from highmem.h. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> --- arch/i386/kernel/vmi.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -26,6 +26,7 @@ #include <linux/cpu.h> #include <linux/bootmem.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> @@ -65,8 +66,8 @@ static struct { void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); void (*update_pte)(pte_t *, unsigned); - void (*set_linear_mapping)(int, u32, u32, u32); - void (*flush_tlb)(int); + void (*set_linear_mapping)(int, void *, u32, u32); + void (*_flush_tlb)(int); void (*set_initial_ap_state)(int, int); void (*halt)(void); void (*set_lazy_mode)(int mode); @@ -217,12 +218,12 @@ static void vmi_load_esp0(struct tss_str static void vmi_flush_tlb_user(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB); + vmi_ops._flush_tlb(VMI_FLUSH_TLB); } static void vmi_flush_tlb_kernel(void) { - vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); + vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL); } /* Stub to do nothing at all; used for delays and unimplemented calls */ @@ -345,8 +346,11 @@ static void vmi_check_page_type(u32 pfn, #define vmi_check_page_type(p,t) do { } while (0) #endif -static void vmi_map_pt_hook(int type, pte_t *va, u32 pfn) -{ +#ifdef CONFIG_HIGHPTE +static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) +{ + void *va = kmap_atomic(page, type); + /* * Internally, the VMI ROM must map virtual addresses to physical * addresses for processing MMU updates. 
By the time MMU updates @@ -360,8 +364,11 @@ static void vmi_map_pt_hook(int type, pt * args: SLOT VA COUNT PFN */ BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, (u32)va, 1, pfn); -} + vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + return va; +} +#endif static void vmi_allocate_pt(u32 pfn) { @@ -656,7 +663,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); } /* @@ -793,8 +800,8 @@ static inline int __init activate_vmi(vo para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, flush_tlb, FlushTLB); + para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); para_fill(flush_tlb_single, InvalPage); /* @@ -840,9 +847,12 @@ static inline int __init activate_vmi(vo paravirt_ops.release_pt = vmi_release_pt; paravirt_ops.release_pd = vmi_release_pd; } -#if 0 - para_wrap(map_pt_hook, vmi_map_pt_hook, set_linear_mapping, - SetLinearMapping); + + /* Set linear is needed in all cases */ + vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); +#ifdef CONFIG_HIGHPTE + if (vmi_ops.set_linear_mapping) + paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; #endif /* --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 23/28] Fix BusLogic to stop using check_region
I got so sick of seing the check_region warnings from BusLogic.c I actually fixed it properly. Never use check region, reserve it before the probe with request region instead and check the error result; free region if setup fails. Should be functionally identical to the original except for fixing the potential race. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> CC: Lenoard N. Zubkoff <lnz@dandelion.com> CC: Michael Clay <claym@osuosl.org> --- drivers/scsi/BusLogic.c | 73 ++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 25 deletions(-) ==================================================================--- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -579,17 +579,17 @@ static void __init BusLogic_InitializePr /* Append the list of standard BusLogic MultiMaster ISA I/O Addresses. */ - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe330 : check_region(0x330, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe330) BusLogic_AppendProbeAddressISA(0x330); - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe334 : check_region(0x334, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe334) BusLogic_AppendProbeAddressISA(0x334); - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe230 : check_region(0x230, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe230) BusLogic_AppendProbeAddressISA(0x230); - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe234 : check_region(0x234, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe234) BusLogic_AppendProbeAddressISA(0x234); - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe130 : check_region(0x130, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe130) BusLogic_AppendProbeAddressISA(0x130); - if (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe134 : check_region(0x134, BusLogic_MultiMasterAddressCount) == 0) + if (!BusLogic_ProbeOptions.LimitedProbeISA || BusLogic_ProbeOptions.Probe134) BusLogic_AppendProbeAddressISA(0x134); } @@ -795,7 +795,9 @@ static int __init BusLogic_InitializeMul host adapters are probed. */ if (!BusLogic_ProbeOptions.NoProbeISA) - if (PrimaryProbeInfo->IO_Address == 0 && (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe330 : check_region(0x330, BusLogic_MultiMasterAddressCount) == 0)) { + if (PrimaryProbeInfo->IO_Address == 0 && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe330)) { PrimaryProbeInfo->HostAdapterType = BusLogic_MultiMaster; PrimaryProbeInfo->HostAdapterBusType = BusLogic_ISA_Bus; PrimaryProbeInfo->IO_Address = 0x330; @@ -805,15 +807,25 @@ static int __init BusLogic_InitializeMul omitting the Primary I/O Address which has already been handled. */ if (!BusLogic_ProbeOptions.NoProbeISA) { - if (!StandardAddressSeen[1] && (BusLogic_ProbeOptions.LimitedProbeISA ? 
BusLogic_ProbeOptions.Probe334 : check_region(0x334, BusLogic_MultiMasterAddressCount) == 0)) + if (!StandardAddressSeen[1] && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe334)) BusLogic_AppendProbeAddressISA(0x334); - if (!StandardAddressSeen[2] && (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe230 : check_region(0x230, BusLogic_MultiMasterAddressCount) == 0)) + if (!StandardAddressSeen[2] && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe230)) BusLogic_AppendProbeAddressISA(0x230); - if (!StandardAddressSeen[3] && (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe234 : check_region(0x234, BusLogic_MultiMasterAddressCount) == 0)) + if (!StandardAddressSeen[3] && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe234)) BusLogic_AppendProbeAddressISA(0x234); - if (!StandardAddressSeen[4] && (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe130 : check_region(0x130, BusLogic_MultiMasterAddressCount) == 0)) + if (!StandardAddressSeen[4] && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe130)) BusLogic_AppendProbeAddressISA(0x130); - if (!StandardAddressSeen[5] && (BusLogic_ProbeOptions.LimitedProbeISA ? BusLogic_ProbeOptions.Probe134 : check_region(0x134, BusLogic_MultiMasterAddressCount) == 0)) + if (!StandardAddressSeen[5] && + (!BusLogic_ProbeOptions.LimitedProbeISA || + BusLogic_ProbeOptions.Probe134)) BusLogic_AppendProbeAddressISA(0x134); } /* @@ -2220,22 +2232,35 @@ static int __init BusLogic_init(void) HostAdapter->PCI_Device = ProbeInfo->PCI_Device; HostAdapter->IRQ_Channel = ProbeInfo->IRQ_Channel; HostAdapter->AddressCount = BusLogic_HostAdapterAddressCount[HostAdapter->HostAdapterType]; + + /* + Make sure region is free prior to probing. + */ + if (!request_region(HostAdapter->IO_Address, HostAdapter->AddressCount, + "BusLogic")) + continue; /* Probe the Host Adapter. If unsuccessful, abort further initialization. */ - if (!BusLogic_ProbeHostAdapter(HostAdapter)) + if (!BusLogic_ProbeHostAdapter(HostAdapter)) { + release_region(HostAdapter->IO_Address, HostAdapter->AddressCount); continue; + } /* Hard Reset the Host Adapter. If unsuccessful, abort further initialization. */ - if (!BusLogic_HardwareResetHostAdapter(HostAdapter, true)) + if (!BusLogic_HardwareResetHostAdapter(HostAdapter, true)) { + release_region(HostAdapter->IO_Address, HostAdapter->AddressCount); continue; + } /* Check the Host Adapter. If unsuccessful, abort further initialization. */ - if (!BusLogic_CheckHostAdapter(HostAdapter)) + if (!BusLogic_CheckHostAdapter(HostAdapter)) { + release_region(HostAdapter->IO_Address, HostAdapter->AddressCount); continue; + } /* Initialize the Driver Options field if provided. */ @@ -2247,16 +2272,6 @@ static int __init BusLogic_init(void) */ BusLogic_AnnounceDriver(HostAdapter); /* - Register usage of the I/O Address range. From this point onward, any - failure will be assumed to be due to a problem with the Host Adapter, - rather than due to having mistakenly identified this port as belonging - to a BusLogic Host Adapter. The I/O Address range will not be - released, thereby preventing it from being incorrectly identified as - any other type of Host Adapter. - */ - if (!request_region(HostAdapter->IO_Address, HostAdapter->AddressCount, "BusLogic")) - continue; - /* Register the SCSI Host structure. 
*/ @@ -2280,6 +2295,12 @@ static int __init BusLogic_init(void) Acquire the System Resources necessary to use the Host Adapter, then Create the Initial CCBs, Initialize the Host Adapter, and finally perform Target Device Inquiry. + + From this point onward, any failure will be assumed to be due to a + problem with the Host Adapter, rather than due to having mistakenly + identified this port as belonging to a BusLogic Host Adapter. The + I/O Address range will not be released, thereby preventing it from + being incorrectly identified as any other type of Host Adapter. */ if (BusLogic_ReadHostAdapterConfiguration(HostAdapter) && BusLogic_ReportHostAdapterConfiguration(HostAdapter) && @@ -3598,6 +3619,7 @@ static void __exit BusLogic_exit(void) __setup("BusLogic=", BusLogic_Setup); +#ifdef MODULE static struct pci_device_id BusLogic_pci_tbl[] __devinitdata = { { PCI_VENDOR_ID_BUSLOGIC, PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, @@ -3607,6 +3629,7 @@ static struct pci_device_id BusLogic_pci PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, { } }; +#endif MODULE_DEVICE_TABLE(pci, BusLogic_pci_tbl); module_init(BusLogic_init); --
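The same reserve-before-probe pattern applies to any legacy-port driver: request the range up front and release it on any probe failure, instead of the racy check_region()-then-request sequence. A sketch with a hypothetical driver (example_hw_present() is an invented helper, not BusLogic code):

#include <linux/ioport.h>
#include <linux/errno.h>

static int example_hw_present(unsigned long io);	/* hypothetical probe helper */

static int example_probe_one(unsigned long io, unsigned long count)
{
	/* Reserve first: closes the window in which another driver could
	 * claim the range between a check_region() and a later request. */
	if (!request_region(io, count, "example"))
		return -EBUSY;

	if (!example_hw_present(io)) {
		release_region(io, count);	/* probe failed: hand the range back */
		return -ENODEV;
	}

	/* success: keep the region for the adapter's lifetime */
	return 0;
}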
Xen wants a dedicated page for the GDT. I believe VMI likes it too. lguest, KVM and native don't care. Simple transformation to page-aligned "struct gdt_page". Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Acked-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/cpu/common.c | 6 +++--- arch/i386/kernel/entry.S | 2 +- arch/i386/kernel/head.S | 2 +- arch/i386/kernel/traps.c | 2 +- include/asm-i386/desc.h | 9 +++++++-- 5 files changed, 13 insertions(+), 8 deletions(-) ==================================================================--- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -22,7 +22,7 @@ #include "cpu.h" -DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = { +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, @@ -48,8 +48,8 @@ DEFINE_PER_CPU(struct desc_struct, cpu_g [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ -}; -EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt); +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); DEFINE_PER_CPU(struct i386_pda, _cpu_pda) = { ._pda = &per_cpu___cpu_pda, ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -558,7 +558,7 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ movl %fs:PDA_cpu, %ebx; \ - PER_CPU(cpu_gdt, %ebx); \ + PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ pushl $__KERNEL_DS; \ ==================================================================--- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -599,7 +599,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__cpu_gdt /* Overwritten for secondary CPUs */ + .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt_table must mirror the equivalent in setup.S and is ==================================================================--- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1037,7 +1037,7 @@ fastcall unsigned long patch_espfix_desc fastcall unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { - struct desc_struct *gdt = __get_cpu_var(cpu_gdt); + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; unsigned long base = (kesp - uesp) & -THREAD_SIZE; unsigned long new_kesp = kesp - base; unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; ==================================================================--- a/include/asm-i386/desc.h +++ b/include/asm-i386/desc.h @@ -18,10 +18,15 @@ struct Xgt_desc_struct { unsigned short pad; } __attribute__ ((packed)); -DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]); +struct gdt_page +{ + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); +DECLARE_PER_CPU(struct gdt_page, gdt_page); + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return per_cpu(cpu_gdt, cpu); + return per_cpu(gdt_page, cpu).gdt; } extern struct Xgt_desc_struct idt_descr; --
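The wrapper pattern works for any per-cpu object that must own a whole page: the aligned attribute sets the struct's alignment (and, via padding, its size) to PAGE_SIZE, and DEFINE_PER_CPU then gives each CPU its own page, provided the per-cpu area itself is page aligned (the alloc_bootmem_pages fix earlier in this series). A generic sketch with hypothetical names:

#define FOO_ENTRIES 32				/* hypothetical */
struct foo_entry { unsigned int a, b; };	/* hypothetical payload */

struct foo_page {
	struct foo_entry entries[FOO_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));		/* one dedicated page per copy */

DEFINE_PER_CPU(struct foo_page, foo_page);

static inline struct foo_entry *get_cpu_foo_table(unsigned int cpu)
{
	return per_cpu(foo_page, cpu).entries;
}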
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 10/28] i386: map enough initial memory to create lowmem mappings
head.S creates the very initial pagetable for the kernel. This just maps enough space for the kernel itself, and an allocation bitmap. The amount of mapped memory is rounded up to 4Mbytes, and so this typically ends up mapping 8Mbytes of memory. When booting, pagetable_init() needs to create mappings for all lowmem, and the pagetables for these mappings are allocated from the free pages around the kernel in low memory. If the number of pagetable pages + kernel size exceeds head.S's initial mapping, it will end up faulting on an unmapped page. This will only happen with specific combinations of kernel size and memory size. This patch makes sure that head.S also maps enough space to fit the kernel pagetables as well as the kernel itself. It ends up using an additional two pages of unreclaimable memory. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Acked-by: "H. Peter Anvin" <hpa@zytor.com> Cc: Andi Kleen <ak@suse.de> Cc: Zachary Amsden <zach@vmware.com> Cc: Chris Wright <chrisw@sous-sol.org> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Linus Torvalds <torvalds@linux-foundation.org>, --- arch/i386/kernel/asm-offsets.c | 6 ++++++ arch/i386/kernel/head.S | 25 ++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) ==================================================================--- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -11,6 +11,7 @@ #include <linux/suspend.h> #include <asm/ucontext.h> #include "sigframe.h" +#include <asm/pgtable.h> #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> @@ -96,6 +97,11 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); ==================================================================--- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -34,17 +34,32 @@ /* * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. We need one bit for - * each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) + * and including _end* we need mapped initially. + * We need: + * - one bit for each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * - enough space to map all low memory, which means + * (2^32/4096) / 1024 pages (worst case, non PAE) + * (2^32/4096) / 512 + 4 pages (worst case for PAE) + * - a few pages for allocator use before the kernel pagetable has + * been set up * * Modulo rounding, each megabyte assigned here requires a kilobyte of * memory, which is currently unreclaimed. * * This should be a multiple of a page. */ -#define INIT_MAP_BEYOND_END (128*1024) - +LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) + +#if PTRS_PER_PMD > 1 +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#else +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#endif +BOOTBITMAP_SIZE = LOW_PAGES / 8 +ALLOCATOR_SLOP = 4 + +INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm /* * 32-bit kernel entrypoint; only used by the boot CPU. On entry, --
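A worked example of the new sizing for the non-PAE worst case (all of the 4G address space treated as lowmem), written as standalone arithmetic rather than kernel code; the old fixed INIT_MAP_BEYOND_END of 128K only covered the boot allocation bitmap:

#include <stdio.h>

int main(void)
{
	unsigned long page_shift = 12, page_size = 1UL << page_shift;
	unsigned long ptrs_per_pgd = 1024;			/* non-PAE */
	unsigned long low_pages = 1UL << (32 - page_shift);	/* 2^20 pages */

	unsigned long page_table_size = low_pages / ptrs_per_pgd;	/* 1024 pagetable pages */
	unsigned long bootbitmap_size = low_pages / 8;			/* 128K bitmap */
	unsigned long allocator_slop = 4;

	unsigned long beyond_end = bootbitmap_size +
		(page_table_size + allocator_slop) * page_size;

	/* prints roughly 4.1 MB, versus the old fixed 128K */
	printf("INIT_MAP_BEYOND_END = %lu bytes (~%lu MB)\n",
	       beyond_end, beyond_end >> 20);
	return 0;
}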
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 22/28] Convert VMI timer to use clock events
Convert VMI timer to use clock events, making it properly able to use the NO_HZ infrastructure. On UP systems, with no local APIC, we just continue to route these events through the PIT. On systems with a local APIC, or SMP, we provide a single source interrupt chip which creates the local timer IRQ. It actually gets delivered by the APIC hardware, but we don't want to use the same local APIC clocksource processing, so we create our own handler here. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> CC: Dan Hecht <dhecht@vmware.com> CC: Ingo Molnar <mingo@elte.hu> CC: Thomas Gleixner <tglx@linutronix.de> --- arch/i386/kernel/Makefile | 2 arch/i386/kernel/entry.S | 5 arch/i386/kernel/vmi.c | 26 -- arch/i386/kernel/vmiclock.c | 318 ++++++++++++++++++++++++++++ arch/i386/kernel/vmitime.c | 482 ------------------------------------------- include/asm-i386/vmi_time.h | 18 - 6 files changed, 327 insertions(+), 524 deletions(-) ==================================================================--- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -41,7 +41,7 @@ obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_STACK_UNWIND) += unwind.o -obj-$(CONFIG_VMI) += vmi.o vmitime.o +obj-$(CONFIG_VMI) += vmi.o vmiclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-y += pcspeaker.o ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -73,6 +73,9 @@ static struct { void (*set_lazy_mode)(int mode); } vmi_ops; +/* Cached VMI operations */ +struct vmi_timer_ops vmi_timer_ops; + /* * VMI patching routines. */ @@ -231,18 +234,6 @@ static void vmi_nop(void) { } -/* For NO_IDLE_HZ, we stop the clock when halting the kernel */ -static fastcall void vmi_safe_halt(void) -{ - int idle = vmi_stop_hz_timer(); - vmi_ops.halt(); - if (idle) { - local_irq_disable(); - vmi_account_time_restart_hz_timer(); - local_irq_enable(); - } -} - #ifdef CONFIG_DEBUG_PAGE_TYPE #ifdef CONFIG_X86_PAE @@ -714,7 +705,6 @@ do { \ vmi_ops.cache = (void *)rel->eip; \ } \ } while (0) - /* * Activate the VMI interface and switch into paravirtualized mode @@ -894,8 +884,8 @@ static inline int __init activate_vmi(vo paravirt_ops.get_wallclock = vmi_get_wallclock; paravirt_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm; - paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm; + paravirt_ops.setup_boot_clock = vmi_time_bsp_init; + paravirt_ops.setup_secondary_clock = vmi_time_ap_init; #endif paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles; paravirt_ops.get_cpu_khz = vmi_cpu_khz; @@ -907,11 +897,7 @@ static inline int __init activate_vmi(vo disable_vmi_timer = 1; } - /* No idle HZ mode only works if VMI timer and no idle is enabled */ - if (disable_noidle || disable_vmi_timer) - para_fill(safe_halt, Halt); - else - para_wrap(safe_halt, vmi_safe_halt, halt, Halt); + para_fill(safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough ==================================================================--- a/include/asm-i386/vmi_time.h +++ b/include/asm-i386/vmi_time.h @@ -53,22 +53,8 @@ extern unsigned long vmi_cpu_khz(void); extern unsigned long vmi_cpu_khz(void); #ifdef CONFIG_X86_LOCAL_APIC -extern void __init vmi_timer_setup_boot_alarm(void); -extern void __devinit vmi_timer_setup_secondary_alarm(void); -extern void apic_vmi_timer_interrupt(void); -#endif - -#ifdef CONFIG_NO_IDLE_HZ 
-extern int vmi_stop_hz_timer(void); -extern void vmi_account_time_restart_hz_timer(void); -#else -static inline int vmi_stop_hz_timer(void) -{ - return 0; -} -static inline void vmi_account_time_restart_hz_timer(void) -{ -} +extern void __devinit vmi_time_bsp_init(void); +extern void __devinit vmi_time_ap_init(void); #endif /* ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -637,11 +637,6 @@ ENDPROC(name) /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -/* This alternate entry is needed because we hijack the apic LVTT */ -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -#endif - KPROBE_ENTRY(page_fault) RING0_EC_FRAME pushl $do_page_fault ==================================================================--- /dev/null +++ b/arch/i386/kernel/vmiclock.c @@ -0,0 +1,318 @@ +/* + * VMI paravirtual timer support routines. + * + * Copyright (C) 2007, VMware, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/cpumask.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> + +#include <asm/vmi.h> +#include <asm/vmi_time.h> +#include <asm/arch_hooks.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/timer.h> + +#include <irq_vectors.h> +#include "io_ports.h" + +#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) +#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) + +static DEFINE_PER_CPU(struct clock_event_device, local_events); + +static inline u32 vmi_counter(u32 flags) +{ + /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding + * cycle counter. 
*/ + return flags & VMI_ALARM_COUNTER_MASK; +} + +/* paravirt_ops.get_wallclock = vmi_get_wallclock */ +unsigned long vmi_get_wallclock(void) +{ + unsigned long long wallclock; + wallclock = vmi_timer_ops.get_wallclock(); // nsec + (void)do_div(wallclock, 1000000000); // sec + + return wallclock; +} + +/* paravirt_ops.set_wallclock = vmi_set_wallclock */ +int vmi_set_wallclock(unsigned long now) +{ + return 0; +} + +/* paravirt_ops.get_scheduled_cycles = vmi_get_sched_cycles */ +unsigned long long vmi_get_sched_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); +} + +/* paravirt_ops.get_cpu_khz = vmi_cpu_khz */ +unsigned long vmi_cpu_khz(void) +{ + unsigned long long khz; + khz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(khz, 1000); + return khz; +} + +static inline unsigned int vmi_get_timer_vector(void) +{ +#ifdef CONFIG_X86_IO_APIC + return FIRST_DEVICE_VECTOR; +#else + return FIRST_EXTERNAL_VECTOR; +#endif +} + +/** vmi clockchip */ +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int startup_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, vmi_get_timer_vector()); + + return (val & APIC_SEND_PENDING); +} + +static void mask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val | APIC_LVT_MASKED); +} + +static void unmask_timer_irq(unsigned int irq) +{ + unsigned long val = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED); +} + +static void ack_timer_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip vmi_chip __read_mostly = { + .name = "VMI-LOCAL", + .startup = startup_timer_irq, + .mask = mask_timer_irq, + .unmask = unmask_timer_irq, + .ack = ack_timer_irq +}; +#endif + +/** vmi clockevent */ +#define VMI_ALARM_WIRED_IRQ0 0x00000000 +#define VMI_ALARM_WIRED_LVTT 0x00010000 +static int vmi_wiring = VMI_ALARM_WIRED_IRQ0; + +static inline int vmi_get_alarm_wiring(void) +{ + return vmi_wiring; +} + +static void vmi_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + cycle_t now, cycles_per_hz; + BUG_ON(!irqs_disabled()); + + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + break; + case CLOCK_EVT_MODE_PERIODIC: + cycles_per_hz = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_hz, HZ); + now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC)); + vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + switch (evt->mode) { + case CLOCK_EVT_MODE_ONESHOT: + vmi_timer_ops.cancel_alarm(VMI_ONESHOT); + break; + case CLOCK_EVT_MODE_PERIODIC: + vmi_timer_ops.cancel_alarm(VMI_PERIODIC); + break; + default: + break; + } + break; + default: + break; + } +} + +static int vmi_timer_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + /* Unfortunately, set_next_event interface only passes relative + * expiry, but we want absolute expiry. It'd be better if were + * were passed an aboslute expiry, since a bunch of time may + * have been stolen between the time the delta is computed and + * when we set the alarm below. 
*/ + cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); + + BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); + return 0; +} + +static struct clock_event_device vmi_clockevent = { + .name = "vmi-timer", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 22, + .set_mode = vmi_timer_set_mode, + .set_next_event = vmi_timer_next_event, + .rating = 1000, + .irq = 0, +}; + +static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(local_events); + evt->event_handler(evt); + return IRQ_HANDLED; +} + +static struct irqaction vmi_clock_action = { + .name = "vmi-timer", + .handler = vmi_timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .mask = CPU_MASK_ALL, +}; + +static void __devinit vmi_time_init_clockevent(void) +{ + cycle_t cycles_per_msec; + struct clock_event_device *evt; + + int cpu = smp_processor_id(); + evt = &__get_cpu_var(local_events); + + /* Use cycles_per_msec since div_sc params are 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + memcpy(evt, &vmi_clockevent, sizeof(*evt)); + /* Must pick .shift such that .mult fits in 32-bits. Choosing + * .shift to be 22 allows 2^(32-22) cycles per nano-seconds + * before overflow. */ + evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift); + /* Upper bound is clockevent's use of ulong for cycle deltas. */ + evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt); + evt->min_delta_ns = clockevent_delta2ns(1, evt); + evt->cpumask = cpumask_of_cpu(cpu); + + printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + evt->name, evt->mult, evt->shift); + clockevents_register_device(evt); +} + +void __init vmi_time_init(void) +{ + /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ + outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + + vmi_time_init_clockevent(); + setup_irq(0, &vmi_clock_action); +} + +#ifdef CONFIG_X86_LOCAL_APIC +void __devinit vmi_time_bsp_init(void) +{ + /* + * On APIC systems, we want local timers to fire on each cpu. We do + * this by programming LVTT to deliver timer events to the IRQ handler + * for IRQ-0, since we can't re-use the APIC local timer handler + * without interfering with that code. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + local_irq_disable(); +#ifdef CONFIG_X86_SMP + /* + * XXX handle_percpu_irq only defined for SMP; we need to switch over + * to using it, since this is a local interrupt, which each CPU must + * handle individually without locking out or dropping simultaneous + * local timers on other CPUs. We also don't want to trigger the + * quirk workaround code for interrupts which gets invoked from + * handle_percpu_irq via eoi, so we use our own IRQ chip. 
+ */ + set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt"); +#else + set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt"); +#endif + vmi_wiring = VMI_ALARM_WIRED_LVTT; + apic_write(APIC_LVTT, vmi_get_timer_vector()); + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); +} + +void __devinit vmi_time_ap_init(void) +{ + vmi_time_init_clockevent(); + apic_write(APIC_LVTT, vmi_get_timer_vector()); +} +#endif + +/** vmi clocksource */ + +static cycle_t read_real_cycles(void) +{ + return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); +} + +static struct clocksource clocksource_vmi = { + .name = "vmi-timer", + .rating = 450, + .read = read_real_cycles, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init init_vmi_clocksource(void) +{ + cycle_t cycles_per_msec; + + if (!vmi_timer_ops.get_cycle_frequency) + return 0; + /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */ + cycles_per_msec = vmi_timer_ops.get_cycle_frequency(); + (void)do_div(cycles_per_msec, 1000); + + /* Note that clocksource.{mult, shift} converts in the opposite direction + * as clockevents. */ + clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, + clocksource_vmi.shift); + + printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec); + return clocksource_register(&clocksource_vmi); + +} +module_init(init_vmi_clocksource); ==================================================================--- a/arch/i386/kernel/vmitime.c +++ /dev/null @@ -1,482 +0,0 @@ -/* - * VMI paravirtual timer support routines. - * - * Copyright (C) 2005, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to dhecht@vmware.com - * - */ - -/* - * Portions of this code from arch/i386/kernel/timers/timer_tsc.c. - * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> -#include <linux/rcupdate.h> -#include <linux/clocksource.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/div64.h> -#include <asm/timer.h> -#include <asm/desc.h> - -#include <asm/vmi.h> -#include <asm/vmi_time.h> - -#include <mach_timer.h> -#include <io_ports.h> - -#ifdef CONFIG_X86_LOCAL_APIC -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT -#else -#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0 -#endif - -/* Cached VMI operations */ -struct vmi_timer_ops vmi_timer_ops; - -#ifdef CONFIG_NO_IDLE_HZ - -/* /proc/sys/kernel/hz_timer state. 
*/ -int sysctl_hz_timer; - -/* Some stats */ -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs); -static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies); -static DEFINE_PER_CPU(unsigned long, idle_start_jiffies); - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ. */ -static int alarm_hz = CONFIG_VMI_ALARM_HZ; - -/* Cache of the value get_cycle_frequency / HZ. */ -static signed long long cycles_per_jiffy; - -/* Cache of the value get_cycle_frequency / alarm_hz. */ -static signed long long cycles_per_alarm; - -/* The number of cycles accounted for by the 'jiffies'/'xtime' count. - * Protected by xtime_lock. */ -static unsigned long long real_cycles_accounted_system; - -/* The number of cycles accounted for by update_process_times(), per cpu. */ -static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu); - -/* The number of stolen cycles accounted, per cpu. */ -static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu); - -/* Clock source. */ -static cycle_t read_real_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL); -} - -static cycle_t read_available_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE); -} - -#if 0 -static cycle_t read_stolen_cycles(void) -{ - return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN); -} -#endif /* 0 */ - -static struct clocksource clocksource_vmi = { - .name = "vmi-timer", - .rating = 450, - .read = read_real_cycles, - .mask = CLOCKSOURCE_MASK(64), - .mult = 0, /* to be set */ - .shift = 22, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - - -/* Timer interrupt handler. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id); - -static struct irqaction vmi_timer_irq = { - .handler = vmi_timer_interrupt, - .flags = IRQF_DISABLED, - .mask = CPU_MASK_NONE, - .name = "VMI-alarm", -}; - -/* Alarm rate */ -static int __init vmi_timer_alarm_rate_setup(char* str) -{ - int alarm_rate; - if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) { - alarm_hz = alarm_rate; - printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz); - } - return 1; -} -__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup); - - -/* Initialization */ -static void vmi_get_wallclock_ts(struct timespec *ts) -{ - unsigned long long wallclock; - wallclock = vmi_timer_ops.get_wallclock(); // nsec units - ts->tv_nsec = do_div(wallclock, 1000000000); - ts->tv_sec = wallclock; -} - -unsigned long vmi_get_wallclock(void) -{ - struct timespec ts; - vmi_get_wallclock_ts(&ts); - return ts.tv_sec; -} - -int vmi_set_wallclock(unsigned long now) -{ - return -1; -} - -unsigned long long vmi_get_sched_cycles(void) -{ - return read_available_cycles(); -} - -unsigned long vmi_cpu_khz(void) -{ - unsigned long long khz; - - khz = vmi_timer_ops.get_cycle_frequency(); - (void)do_div(khz, 1000); - return khz; -} - -void __init vmi_time_init(void) -{ - unsigned long long cycles_per_sec, cycles_per_msec; - unsigned long flags; - - local_irq_save(flags); - setup_irq(0, &vmi_timer_irq); -#ifdef CONFIG_X86_LOCAL_APIC - set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt); -#endif - - real_cycles_accounted_system = read_real_cycles(); - per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles(); - - cycles_per_sec = vmi_timer_ops.get_cycle_frequency(); - cycles_per_jiffy = cycles_per_sec; - (void)do_div(cycles_per_jiffy, HZ); - cycles_per_alarm = cycles_per_sec; - (void)do_div(cycles_per_alarm, alarm_hz); - cycles_per_msec = 
cycles_per_sec; - (void)do_div(cycles_per_msec, 1000); - - printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;" - "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy, - cycles_per_alarm); - - clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec, - clocksource_vmi.shift); - if (clocksource_register(&clocksource_vmi)) - printk(KERN_WARNING "Error registering VMITIME clocksource."); - - /* Disable PIT. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ - - /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu - * reduce the latency calling update_process_times. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - - local_irq_restore(flags); -} - -#ifdef CONFIG_X86_LOCAL_APIC - -void __init vmi_timer_setup_boot_alarm(void) -{ - local_irq_disable(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm, - cycles_per_alarm); - local_irq_enable(); -} - -/* Initialize the time accounting variables for an AP on an SMP system. - * Also, set the local alarm for the AP. */ -void __devinit vmi_timer_setup_secondary_alarm(void) -{ - int cpu = smp_processor_id(); - - /* Route the interrupt to the correct vector. */ - apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR); - - per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles(); - - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); -} - -#endif - -/* Update system wide (real) time accounting (e.g. jiffies, xtime). */ -static void vmi_account_real_cycles(unsigned long long cur_real_cycles) -{ - long long cycles_not_accounted; - - write_seqlock(&xtime_lock); - - cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system; - while (cycles_not_accounted >= cycles_per_jiffy) { - /* systems wide jiffies. */ - do_timer(1); - - cycles_not_accounted -= cycles_per_jiffy; - real_cycles_accounted_system += cycles_per_jiffy; - } - - write_sequnlock(&xtime_lock); -} - -/* Update per-cpu process times. */ -static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - /* Account time to the current process. This includes - * calling into the scheduler to decrement the timeslice - * and possibly reschedule.*/ - update_process_times(user_mode(regs)); - /* XXX handle /proc/profile multiplier. */ - profile_tick(CPU_PROFILING); - - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } -} - -#ifdef CONFIG_NO_IDLE_HZ -/* Update per-cpu idle times. Used when a no-hz halt is ended. 
*/ -static void vmi_account_no_hz_idle_cycles(int cpu, - unsigned long long cur_process_times_cycles) -{ - long long cycles_not_accounted; - unsigned long no_idle_hz_jiffies = 0; - - cycles_not_accounted = cur_process_times_cycles - - per_cpu(process_times_cycles_accounted_cpu, cpu); - - while (cycles_not_accounted >= cycles_per_jiffy) { - no_idle_hz_jiffies++; - cycles_not_accounted -= cycles_per_jiffy; - per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* Account time to the idle process. */ - account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies)); -} -#endif - -/* Update per-cpu stolen time. */ -static void vmi_account_stolen_cycles(int cpu, - unsigned long long cur_real_cycles, - unsigned long long cur_avail_cycles) -{ - long long stolen_cycles_not_accounted; - unsigned long stolen_jiffies = 0; - - if (cur_real_cycles < cur_avail_cycles) - return; - - stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles - - per_cpu(stolen_cycles_accounted_cpu, cpu); - - while (stolen_cycles_not_accounted >= cycles_per_jiffy) { - stolen_jiffies++; - stolen_cycles_not_accounted -= cycles_per_jiffy; - per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy; - } - /* HACK: pass NULL to force time onto cpustat->steal. */ - account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies)); -} - -/* Body of either IRQ0 interrupt handler (UP no local-APIC) or - * local-APIC LVTT interrupt handler (UP & local-APIC or SMP). */ -static void vmi_local_timer_interrupt(int cpu) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu process times. */ - vmi_account_process_times_cycles(get_irq_regs(), cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); -} - -#ifdef CONFIG_NO_IDLE_HZ - -/* Must be called only from idle loop, with interrupts disabled. */ -int vmi_stop_hz_timer(void) -{ - /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */ - - unsigned long seq, next; - unsigned long long real_cycles_expiry; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - if (sysctl_hz_timer != 0) - return 0; - - cpu_set(cpu, nohz_cpu_mask); - smp_mb(); - - if (rcu_needs_cpu(cpu) || local_softirq_pending() || - (next = next_timer_interrupt(), - time_before_eq(next, jiffies + HZ/CONFIG_VMI_ALARM_HZ))) { - cpu_clear(cpu, nohz_cpu_mask); - return 0; - } - - /* Convert jiffies to the real cycle counter. */ - do { - seq = read_seqbegin(&xtime_lock); - real_cycles_expiry = real_cycles_accounted_system + - (long)(next - jiffies) * cycles_per_jiffy; - } while (read_seqretry(&xtime_lock, seq)); - - /* This cpu is going idle. Disable the periodic alarm. */ - vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE); - per_cpu(idle_start_jiffies, cpu) = jiffies; - /* Set the real time alarm to expire at the next event. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL, - real_cycles_expiry, 0); - return 1; -} - -static void vmi_reenable_hz_timer(int cpu) -{ - /* For /proc/vmi/info idle_hz stat. 
*/ - per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu); - per_cpu(vmi_idle_no_hz_irqs, cpu)++; - - /* Don't bother explicitly cancelling the one-shot alarm -- at - * worse we will receive a spurious timer interrupt. */ - vmi_timer_ops.set_alarm( - VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE, - per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm, - cycles_per_alarm); - /* Indicate this cpu is no longer nohz idle. */ - cpu_clear(cpu, nohz_cpu_mask); -} - -/* Called from interrupt handlers when (local) HZ timer is disabled. */ -void vmi_account_time_restart_hz_timer(void) -{ - unsigned long long cur_real_cycles, cur_process_times_cycles; - int cpu = smp_processor_id(); - - BUG_ON(!irqs_disabled()); - /* Account the time during which the HZ timer was disabled. */ - cur_real_cycles = read_real_cycles(); - cur_process_times_cycles = read_available_cycles(); - /* Update system wide (real) time state (xtime, jiffies). */ - vmi_account_real_cycles(cur_real_cycles); - /* Update per-cpu idle times. */ - vmi_account_no_hz_idle_cycles(cpu, cur_process_times_cycles); - /* Update time stolen from this cpu by the hypervisor. */ - vmi_account_stolen_cycles(cpu, cur_real_cycles, cur_process_times_cycles); - /* Reenable the hz timer. */ - vmi_reenable_hz_timer(cpu); -} - -#endif /* CONFIG_NO_IDLE_HZ */ - -/* UP (and no local-APIC) VMI-timer alarm interrupt handler. - * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after - * APIC setup and setup_boot_vmi_alarm() is called. */ -static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id) -{ - vmi_local_timer_interrupt(smp_processor_id()); - return IRQ_HANDLED; -} - -#ifdef CONFIG_X86_LOCAL_APIC - -/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector. - * Also used in UP when CONFIG_X86_LOCAL_APIC. - * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */ -void smp_apic_vmi_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat,cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - vmi_local_timer_interrupt(cpu); - irq_exit(); - set_irq_regs(old_regs); -} - -#endif /* CONFIG_X86_LOCAL_APIC */ --
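The clockevent and clocksource registrations added at the top of this patch both compute a fixed-point mult for a shift of 22, and the arithmetic is easy to get backwards: div_sc() gives the clock event device a multiplier that turns nanoseconds into cycles, while clocksource_khz2mult() gives the clocksource one that turns cycles into nanoseconds, which is what the comment about the two converting in opposite directions means. The fragment below is a stand-alone, user-space sketch of that fixed-point idea; the helper names are illustrative, not the kernel's.

#include <stdio.h>
#include <stdint.h>

#define SHIFT 22

/* clocksource direction: mult ~= (ns per cycle) << SHIFT */
static uint32_t khz2mult(uint32_t khz)
{
	return (uint32_t)(((uint64_t)1000000 << SHIFT) / khz);
}

/* clockevent direction: mult ~= (cycles per ns) << SHIFT */
static uint32_t cycles_per_msec2mult(uint32_t cycles_per_msec)
{
	return (uint32_t)(((uint64_t)cycles_per_msec << SHIFT) / 1000000);
}

int main(void)
{
	uint32_t khz = 2500000;			/* pretend a 2.5 GHz cycle counter */
	uint32_t cs_mult = khz2mult(khz);
	uint32_t ce_mult = cycles_per_msec2mult(khz);	/* cycles/msec == khz numerically */

	uint64_t cycles = 2500;
	uint64_t ns = (cycles * cs_mult) >> SHIFT;	/* ~1000 ns at 2.5 GHz */

	uint64_t delta_ns = 125000;
	uint64_t delta_cyc = (delta_ns * ce_mult) >> SHIFT;	/* ~312500 cycles */

	printf("%llu cycles -> %llu ns; %llu ns -> %llu cycles\n",
	       (unsigned long long)cycles, (unsigned long long)ns,
	       (unsigned long long)delta_ns, (unsigned long long)delta_cyc);
	return 0;
}

Choosing shift = 22 keeps both multipliers comfortably inside 32 bits for realistic cycle frequencies, which is the constraint the changelog comment about "2^(32-22) cycles per nanosecond" is pointing at.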
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 02/28] Account for module percpu space separately from kernel percpu
Rather than using a single constant PERCPU_ENOUGH_ROOM, compute it as the sum of kernel_percpu + PERCPU_MODULE_RESERVE. This is now common to all architectures; if an architecture wants to set PERCPU_ENOUGH_ROOM to something special, then it may do so (ia64 is the only one which does). Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Andi Kleen <ak@suse.de> --- include/asm-alpha/percpu.h | 14 -------------- include/asm-sparc64/percpu.h | 10 ---------- include/asm-x86_64/percpu.h | 10 ---------- include/linux/percpu.h | 9 ++++++++- kernel/module.c | 2 +- 5 files changed, 9 insertions(+), 36 deletions(-) ==================================================================--- a/include/asm-alpha/percpu.h +++ b/include/asm-alpha/percpu.h @@ -1,19 +1,5 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H - -/* - * Increase the per cpu area for Alpha so that - * modules using percpu area can load. - */ -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) #include <asm-generic/percpu.h> ==================================================================--- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -4,16 +4,6 @@ #include <linux/compiler.h> #ifdef CONFIG_SMP - -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) extern void setup_per_cpu_areas(void); ==================================================================--- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -10,16 +10,6 @@ #ifdef CONFIG_SMP #include <asm/pda.h> - -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) #define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset) #define __my_cpu_offset() read_pda(data_offset) ==================================================================--- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -11,8 +11,15 @@ /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */ #ifndef PERCPU_ENOUGH_ROOM -#define PERCPU_ENOUGH_ROOM 32768 +#ifdef CONFIG_MODULES +#define PERCPU_MODULE_RESERVE 8192 +#else +#define PERCPU_MODULE_RESERVE 0 #endif + +#define PERCPU_ENOUGH_ROOM \ + (__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE) +#endif /* PERCPU_ENOUGH_ROOM */ /* * Must be an lvalue. Since @var must be a simple identifier, ==================================================================--- a/kernel/module.c +++ b/kernel/module.c @@ -430,7 +430,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); /* Free room. */ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { --
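A small worked example may help with the bookkeeping in the kernel/module.c hunk above: the first pcpu_size[] entry records the statically used per-cpu area as a negative size (negative meaning "already allocated"), and the second entry is whatever PERCPU_ENOUGH_ROOM leaves over for modules. The numbers below are made up purely for illustration.

#include <stdio.h>

int main(void)
{
	/* hypothetical sizes, not taken from a real kernel build */
	long static_percpu = 12 * 1024;	/* __per_cpu_end - __per_cpu_start */
	long module_reserve = 8192;	/* PERCPU_MODULE_RESERVE with CONFIG_MODULES */

	long room = static_percpu + module_reserve;	/* PERCPU_ENOUGH_ROOM */

	/* percpu_modinit()-style bookkeeping */
	long pcpu_size0 = -static_percpu;	/* in use by the kernel image */
	long pcpu_size1 = room + pcpu_size0;	/* free room left for modules */

	printf("room=%ld used=%ld free-for-modules=%ld\n",
	       room, -pcpu_size0, pcpu_size1);
	return 0;
}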
Fixes two problems with the GDT when compiling for uniprocessor: - There's no percpu segment, so trying to load its selector into %fs fails. Use a null selector instead. - The real gdt needs to be loaded at some point. Do it in cpu_init(). Signed-off-by: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> --- arch/i386/kernel/cpu/common.c | 13 +++++++++++++ arch/i386/kernel/smpboot.c | 12 ------------ include/asm-i386/processor.h | 1 + include/asm-i386/segment.h | 4 ++++ 4 files changed, 18 insertions(+), 12 deletions(-) ==================================================================--- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -644,6 +644,18 @@ struct pt_regs * __devinit idle_regs(str return regs; } +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct Xgt_desc_struct gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -674,6 +688,7 @@ void __cpuinit cpu_init(void) } load_idt(&idt_descr); + switch_to_new_gdt(); /* * Set up and load the per-CPU TSS and LDT ==================================================================--- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1176,18 +1176,6 @@ void __init native_smp_prepare_cpus(unsi smp_boot_cpus(max_cpus); } -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -static inline void switch_to_new_gdt(void) -{ - struct Xgt_desc_struct gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -} - void __init native_smp_prepare_boot_cpu(void) { unsigned int cpu = smp_processor_id(); ==================================================================--- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -777,6 +777,7 @@ extern int sysenter_setup(void); extern int sysenter_setup(void); extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(void); extern void cpu_init(void); #endif /* __ASM_I386_PROCESSOR_H */ ==================================================================--- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -75,7 +75,11 @@ #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) #define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#ifdef CONFIG_SMP #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +#else +#define __KERNEL_PERCPU 0 +#endif #define GDT_ENTRY_DOUBLEFAULT_TSS 31 --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 15/28] In compat mode, the return value here was uninitialized.
From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ==================================================================--- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -254,7 +254,7 @@ int arch_setup_additional_pages(struct l { struct mm_struct *mm = current->mm; unsigned long addr; - int ret; + int ret = 0; bool compat; down_write(&mm->mmap_sem); --
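The shape of the bug fixed above is worth spelling out: when one branch falls through to the common return without assigning ret, the caller sees whatever happened to be on the stack. Initializing ret to 0 makes the fall-through (compat) path report success. A minimal sketch with hypothetical names, not the actual sysenter code:

#include <stdio.h>

static int do_mapping(void)
{
	return -12;		/* stand-in for an allocation failure (-ENOMEM) */
}

/* With a bare "int ret;" the compat path below would return an
 * indeterminate value; initializing it means "nothing to map" is
 * reported as success. */
static int setup_additional_pages(int compat)
{
	int ret = 0;

	if (!compat)
		ret = do_mapping();	/* only this branch sets an error code */

	/* compat: the page lives at a fixed address, nothing to do */
	return ret;
}

int main(void)
{
	printf("compat: %d, non-compat: %d\n",
	       setup_additional_pages(1), setup_additional_pages(0));
	return 0;
}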
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 20/28] Now that the VDSO can be relocated, we can support it in VMI configurations.
From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> --- arch/i386/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ==================================================================--- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -220,7 +220,7 @@ config PARAVIRT config VMI bool "VMI Paravirt-ops support" - depends on PARAVIRT && !COMPAT_VDSO + depends on PARAVIRT help VMI provides a paravirtualized interface to the VMware ESX server (it could be used by other hypervisors in theory too, but is not --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 18/28] Copying of the pgd range must happen under the pgd_lock
Copying of the pgd range must happen under the pgd_lock. This got broken by the paravirt changes in the -mm tree. Badness can result if you copy the pgd before being added to the list when splitting or rejoining large pages. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> Acked-by: William Lee Irwin III <wli@holomorphy.com> --- arch/i386/mm/pgtable.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) ==================================================================--- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -241,18 +241,16 @@ void pgd_ctor(void *pgd, struct kmem_cac /* !PAE, no pagetable sharing */ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + + /* must happen under lock */ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - - spin_lock_irqsave(&pgd_lock, flags); - - /* must happen under lock */ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, __pa(swapper_pg_dir) >> PAGE_SHIFT, USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } --
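The race described above generalizes: a constructor that copies from a master table and then publishes the new table on a global list must do both under the lock that writers of the master hold while walking that list, otherwise an update landing between the copy and the insertion is lost. A non-kernel sketch of the corrected ordering, with a pthread mutex standing in for pgd_lock:

#include <pthread.h>
#include <string.h>

#define ENTRIES 16

struct table {
	unsigned long e[ENTRIES];
	struct table *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct table *table_list;
static unsigned long master[ENTRIES];

/* Mirrors pgd_ctor after the fix: the copy from the master table and the
 * list insertion happen inside one critical section, so a new table is
 * never published with stale contents and never invisible while stale. */
static void table_ctor(struct table *t)
{
	pthread_mutex_lock(&table_lock);
	memcpy(t->e, master, sizeof(t->e));
	t->next = table_list;
	table_list = t;
	pthread_mutex_unlock(&table_lock);
}

/* An updater (think: splitting or rejoining a large page) changes the
 * master entry and propagates it to every published table under the
 * same lock. */
static void update_entry(int idx, unsigned long val)
{
	struct table *t;

	pthread_mutex_lock(&table_lock);
	master[idx] = val;
	for (t = table_list; t; t = t->next)
		t->e[idx] = val;
	pthread_mutex_unlock(&table_lock);
}

With the old ordering (copy outside the lock, then lock and add), update_entry() could run in the window between the copy and the insertion, leaving the freshly published table with a stale entry, which is the badness the changelog warns about.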
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 17/28] x86: cleanup arch/i386/kernel/cpu/mcheck/p4.c
No, just no. You do not use goto to skip a code block. You do not return an obvious variable from a singly-inlined function and give the function a return value. You don't put unexplained comments about kmalloc in code which doesn't do dynamic allocation. And you don't leave stray warnings around for no good reason. Also, when possible, it is better to use block scoped variables because gcc can sometime generate better code. From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> --- arch/i386/kernel/cpu/mcheck/p4.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) ==================================================================--- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -124,12 +124,9 @@ static void intel_init_thermal(struct cp /* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) { u32 h; - - if (mce_num_extended_msrs == 0) - goto done; rdmsr (MSR_IA32_MCG_EAX, r->eax, h); rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); @@ -141,12 +138,6 @@ static inline int intel_get_extended_msr rdmsr (MSR_IA32_MCG_ESP, r->esp, h); rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; } static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) @@ -155,7 +146,6 @@ static fastcall void intel_machine_check u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - struct intel_mce_extended_msrs dbg; rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? */ @@ -164,7 +154,9 @@ static fastcall void intel_machine_check printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - if (intel_get_extended_msrs(&dbg)) { + if (mce_num_extended_msrs > 0) { + struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", smp_processor_id(), dbg.eip, dbg.eflags); printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", --
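The same transformation in miniature, with generic names: the helper loses its smuggled return value and its goto, and the caller checks the capability itself with the buffer scoped to the block that actually uses it.

#include <stdio.h>

struct ext_regs { unsigned int eip, eflags; };

static int num_extended;		/* discovered at init time elsewhere */

/* After the cleanup the helper has exactly one job: read the registers. */
static void get_extended_regs(struct ext_regs *r)
{
	r->eip = 0xc0100000;
	r->eflags = 0x00000002;
}

static void machine_check(void)
{
	if (num_extended > 0) {
		/* block-scoped: the buffer only exists when it is used */
		struct ext_regs dbg;

		get_extended_regs(&dbg);
		printf("EIP: %08x EFLAGS: %08x\n", dbg.eip, dbg.eflags);
	}
}

int main(void)
{
	num_extended = 4;
	machine_check();
	return 0;
}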
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 25/28] From: Jeremy Fitzhardinge <jeremy@goop.org>
The other symbols used to delineate the alt-instructions sections have the form __foo/__foo_end. Rename parainstructions to match. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Andi Kleen <ak@suse.de> Cc: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- arch/i386/kernel/alternative.c | 2 +- arch/i386/kernel/vmi.c | 6 +++--- arch/i386/kernel/vmlinux.lds.S | 4 ++-- include/asm-i386/alternative.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) ==================================================================--- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -386,6 +386,6 @@ void __init alternative_instructions(voi alternatives_smp_switch(0); } #endif - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); } ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -74,8 +74,8 @@ static struct { } vmi_ops; /* XXX move this to alternative.h */ -extern struct paravirt_patch __start_parainstructions[], - __stop_parainstructions[]; +extern struct paravirt_patch __parainstructions[], + __parainstructions_end[]; /* Cached VMI operations */ struct vmi_timer_ops vmi_timer_ops; @@ -909,7 +909,7 @@ static inline int __init activate_vmi(vo * to do this before IRQs get reenabled. Fortunately, it is * idempotent. */ - apply_paravirt(__start_parainstructions, __stop_parainstructions); + apply_paravirt(__parainstructions, __parainstructions_end); vmi_bringup(); ==================================================================--- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -166,9 +166,9 @@ SECTIONS } . = ALIGN(4); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __start_parainstructions = .; + __parainstructions = .; *(.parainstructions) - __stop_parainstructions = .; + __parainstructions_end = .; } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ ==================================================================--- a/include/asm-i386/alternative.h +++ b/include/asm-i386/alternative.h @@ -124,8 +124,8 @@ apply_paravirt(struct paravirt_patch_sit apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) {} -#define __start_parainstructions NULL -#define __stop_parainstructions NULL +#define __parainstructions NULL +#define __parainstructions_end NULL #endif #endif /* _I386_ALTERNATIVE_H */ --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 27/28] paravirt: little compile fixes for vmi.c
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Zachary Amsden <zach@vmware.com> --- arch/i386/kernel/vmi.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -73,10 +73,6 @@ static struct { void (*set_lazy_mode)(int mode); } vmi_ops; -/* XXX move this to alternative.h */ -extern struct paravirt_patch __parainstructions[], - __parainstructions_end[]; - /* Cached VMI operations */ struct vmi_timer_ops vmi_timer_ops; @@ -548,9 +544,9 @@ vmi_startup_ipi_hook(int phys_apicid, un } #endif -static void vmi_set_lazy_mode(int mode) -{ - static DEFINE_PER_CPU(int, lazy_mode); +static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); if (!vmi_ops.set_lazy_mode) return; --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 24/28] paravirt: drop unused ptep_get_and_clear
In shadow mode hypervisors, ptep_get_and_clear achieves the desired purpose of keeping the shadows in sync by issuing a native_get_and_clear, followed by a call to pte_update, which indicates the PTE has been modified. Direct mode hypervisors (Xen) have no need for this anyway, and will trap the update using writable pagetables. This means no hypervisor makes use of ptep_get_and_clear; there is no reason to have it in the paravirt-ops structure. Change confusing terminology about raw vs. native functions into consistent use of native_pte_xxx for operations which do not invoke paravirt-ops. Signed-off-by: Zachary Amsden <zach@vmware.com> --- arch/i386/kernel/paravirt.c | 2 -- include/asm-i386/paravirt.h | 13 +------------ include/asm-i386/pgtable.h | 4 +--- 3 files changed, 2 insertions(+), 17 deletions(-) ==================================================================--- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -315,8 +315,6 @@ struct paravirt_ops paravirt_ops = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .ptep_get_and_clear = native_ptep_get_and_clear, - #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = kmap_atomic, #endif ==================================================================--- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -187,8 +187,6 @@ struct paravirt_ops void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - - pte_t (*ptep_get_and_clear)(pte_t *ptep); #ifdef CONFIG_HIGHPTE void *(*kmap_atomic_pte)(struct page *page, enum km_type type); @@ -859,12 +857,8 @@ static inline void pmd_clear(pmd_t *pmdp PVOP_VCALL1(pmd_clear, pmdp); } -static inline pte_t raw_ptep_get_and_clear(pte_t *p) -{ - unsigned long long val = PVOP_CALL1(unsigned long long, ptep_get_and_clear, p); - return (pte_t) { val, val >> 32 }; -} #else /* !CONFIG_X86_PAE */ + static inline pte_t __pte(unsigned long val) { return (pte_t) { PVOP_CALL1(unsigned long, make_pte, val) }; @@ -899,11 +893,6 @@ static inline void set_pmd(pmd_t *pmdp, static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval) { PVOP_VCALL2(set_pmd, pmdp, pmdval.pud.pgd.pgd); -} - -static inline pte_t raw_ptep_get_and_clear(pte_t *p) -{ - return (pte_t) { PVOP_CALL1(unsigned long, ptep_get_and_clear, p) }; } #endif /* CONFIG_X86_PAE */ ==================================================================--- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -265,8 +265,6 @@ static inline pte_t pte_mkhuge(pte_t pte */ #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) - -#define raw_ptep_get_and_clear(xp) native_ptep_get_and_clear(xp) #endif /* @@ -340,7 +338,7 @@ do { \ #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = raw_ptep_get_and_clear(ptep); + pte_t pte = native_ptep_get_and_clear(ptep); pte_update(mm, addr, ptep); return pte; } --
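The pattern the changelog describes, do the pagetable update natively and then tell the hypervisor the entry changed, can be sketched generically; a shadow-mode hypervisor only needs the notification to resynchronize its copy, which is why a separate paravirtualized get-and-clear buys nothing. The names below are illustrative, not the kernel's, and the exchange mirrors the fact that the native operation is atomic so hardware-set accessed/dirty bits are folded into the returned value rather than lost.

typedef unsigned long pte_val_t;

#define PTRS_PER_TABLE 1024

static pte_val_t shadow[PTRS_PER_TABLE];	/* the "hypervisor's" copy */

/* Notification hook: enough for a shadow-mode hypervisor to resync. */
static void pte_update(pte_val_t *table, int idx)
{
	shadow[idx] = table[idx];
}

/* Native update first, notification second -- the get-and-clear itself
 * needs no paravirt hook of its own. */
static pte_val_t ptep_get_and_clear(pte_val_t *table, int idx)
{
	pte_val_t old = __atomic_exchange_n(&table[idx], (pte_val_t)0,
					    __ATOMIC_SEQ_CST);

	pte_update(table, idx);
	return old;
}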
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 26/28] From: Andrew Morton <akpm@linux-foundation.org>
x86_64: arch/x86_64/kernel/../../i386/kernel/alternative.c: In function 'alternative_instructions': arch/x86_64/kernel/../../i386/kernel/alternative.c:374: error: '__parainstructions' undeclared (first use in this function) arch/x86_64/kernel/../../i386/kernel/alternative.c:374: error: (Each undeclared identifier is reported only once arch/x86_64/kernel/../../i386/kernel/alternative.c:374: error: for each function it appears in.) arch/x86_64/kernel/../../i386/kernel/alternative.c:374: error: '__parainstructions_end' undeclared (first use in this function) Cc: Andi Kleen <ak@suse.de> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> --- include/asm-x86_64/alternative.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) ==================================================================--- a/include/asm-x86_64/alternative.h~rename-the-parainstructions-symbols-to-be-consistent-with-the-others-fix +++ a/include/asm-x86_64/alternative.h @@ -141,8 +141,8 @@ void apply_paravirt(struct paravirt_patc static inline void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) {} -#define __start_parainstructions NULL -#define __stop_parainstructions NULL +#define __parainstructions NULL +#define __parainstructions_end NULL #endif #endif /* _X86_64_ALTERNATIVE_H */ _ --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 16/28] Remove a warning about unused variable in !CONFIG_ACPI compilation.
From: Zachary Amsden <zach@vmware.com> Signed-off-by: Zachary Amsden <zach@vmware.com> CC: Trivial <trivial@kernel.org> --- arch/i386/kernel/acpi/earlyquirk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) ==================================================================--- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -21,8 +21,8 @@ static int __init nvidia_hpet_check(stru static int __init check_bridge(int vendor, int device) { +#ifdef CONFIG_ACPI static int warned; -#ifdef CONFIG_ACPI /* According to Nvidia all timer overrides are bogus unless HPET is enabled. */ if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) { --
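The warning has a common shape: a local declared unconditionally but only used inside an #ifdef block. Moving the declaration into the block keeps both configurations warning-free. A stand-alone reproduction, with FEATURE standing in for CONFIG_ACPI:

#include <stdio.h>

#define NVIDIA_VENDOR_ID 0x10de

static int check_bridge(int vendor)
{
#ifdef FEATURE
	static int warned;	/* declared only in the configuration that uses it */

	if (vendor == NVIDIA_VENDOR_ID && !warned) {
		printf("ignoring bogus timer override\n");
		warned = 1;
	}
#else
	(void)vendor;		/* keep the unused-parameter warning quiet too */
#endif
	return 0;
}

int main(void)
{
	return check_bridge(NVIDIA_VENDOR_ID);
}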
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 06/28] Convert PDA into the percpu section
Currently x86 (similar to x84-64) has a special per-cpu structure called "i386_pda" which can be easily and efficiently referenced via the %fs register. An ELF section is more flexible than a structure, allowing any piece of code to use this area. Indeed, such a section already exists: the per-cpu area. So this patch: (1) Removes the PDA and uses per-cpu variables for each current member. (2) Replaces the __KERNEL_PDA segment with __KERNEL_PERCPU. (3) Creates a per-cpu mirror of __per_cpu_offset called this_cpu_off, which can be used to calculate addresses for this CPU's variables. (4) Simplifies startup, because %fs doesn't need to be loaded with a special segment at early boot; it can be deferred until the first percpu area is allocated (or never for UP). The result is less code and one less x86-specific concept. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Andi Kleen <ak@suse.de> --- arch/i386/kernel/asm-offsets.c | 5 - arch/i386/kernel/cpu/common.c | 17 ----- arch/i386/kernel/entry.S | 5 - arch/i386/kernel/head.S | 31 +-------- arch/i386/kernel/i386_ksyms.c | 2 arch/i386/kernel/irq.c | 3 arch/i386/kernel/process.c | 12 ++- arch/i386/kernel/smpboot.c | 34 ++++------ arch/i386/kernel/vmi.c | 6 - arch/i386/kernel/vmlinux.lds.S | 1 include/asm-i386/current.h | 5 - include/asm-i386/irq_regs.h | 12 ++- include/asm-i386/pda.h | 99 ------------------------------ include/asm-i386/percpu.h | 132 +++++++++++++++++++++++++++++++++++++--- include/asm-i386/processor.h | 2 include/asm-i386/segment.h | 6 - include/asm-i386/smp.h | 4 - include/asm-i386/unwind.h | 2 18 files changed, 180 insertions(+), 198 deletions(-) ==================================================================--- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,7 +15,6 @@ #include <asm/processor.h> #include <asm/thread_info.h> #include <asm/elf.h> -#include <asm/pda.h> #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -101,10 +100,6 @@ void foo(void) OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - BLANK(); - OFFSET(PDA_cpu, i386_pda, cpu_number); - OFFSET(PDA_pcurrent, i386_pda, pcurrent); - #ifdef CONFIG_PARAVIRT BLANK(); OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); ==================================================================--- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <mach_apic.h> #endif -#include <asm/pda.h> #include "cpu.h" @@ -47,12 +46,9 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, - [GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */ + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); - -DEFINE_PER_CPU(struct i386_pda, _cpu_pda); -EXPORT_PER_CPU_SYMBOL(_cpu_pda); static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; @@ -634,20 +630,13 @@ void __init early_cpu_init(void) #endif } -/* Make sure %gs is initialized properly in idle threads */ +/* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PDA; + regs->xfs = __KERNEL_PERCPU; return regs; } - -/* Initial PDA used by boot CPU */ -struct i386_pda boot_pda = { - ._pda = &boot_pda, - .cpu_number = 0, - 
.pcurrent = &init_task, -}; /* * cpu_init() initializes state that is per-CPU. Some data is already ==================================================================--- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -132,7 +132,7 @@ 1: movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; \ - movl $(__KERNEL_PDA), %edx; \ + movl $(__KERNEL_PERCPU), %edx; \ movl %edx, %fs #define RESTORE_INT_REGS \ @@ -556,7 +556,6 @@ END(syscall_badsys) #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - movl %fs:PDA_cpu, %ebx; \ PER_CPU(gdt_page, %ebx); \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ addl %esp, %eax; \ @@ -681,7 +680,7 @@ error_code: pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0*/ - movl $(__KERNEL_PDA), %ecx + movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK popl %ecx ==================================================================--- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -317,12 +317,12 @@ 2: movl %cr0,%eax movl %eax,%cr0 call check_x87 - call setup_pda lgdt early_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f 1: movl $(__KERNEL_DS),%eax # reload all the segment registers movl %eax,%ss # after changing gdt. + movl %eax,%fs # gets reset once there's real percpu movl $(__USER_DS),%eax # DS/ES contains default USER segment movl %eax,%ds @@ -331,9 +331,6 @@ 1: movl $(__KERNEL_DS),%eax # reload all xorl %eax,%eax # Clear GS and LDT movl %eax,%gs lldt %ax - - movl $(__KERNEL_PDA),%eax - mov %eax,%fs cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder @@ -341,7 +338,11 @@ 1: movl $(__KERNEL_DS),%eax # reload all movb ready, %cl movb $1, ready cmpb $0,%cl # the first CPU calls start_kernel - jne initialize_secondary # all other CPUs call initialize_secondary + je 1f + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + jmp initialize_secondary # all other CPUs call initialize_secondary +1: #endif /* CONFIG_SMP */ jmp start_kernel @@ -362,23 +363,6 @@ check_x87: ALIGN 1: movb $1,X86_HARD_MATH .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ - ret - -/* - * Point the GDT at this CPU's PDA. On boot this will be - * cpu_gdt_table and boot_pda; for secondary CPUs, these will be - * that CPU's GDT and PDA. - */ -ENTRY(setup_pda) - /* get the PDA pointer */ - movl start_pda, %eax - - /* slot the PDA address into the GDT */ - mov early_gdt_descr+2, %ecx - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ - shr $16, %eax - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ ret /* @@ -553,9 +537,6 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data -ENTRY(start_pda) - .long boot_pda - ENTRY(stack_start) .long init_thread_union+THREAD_SIZE .long __BOOT_DS ==================================================================--- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -28,5 +28,3 @@ EXPORT_SYMBOL(__read_lock_failed); #endif EXPORT_SYMBOL(csum_partial); - -EXPORT_SYMBOL(_proxy_pda); ==================================================================--- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -23,6 +23,9 @@ DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); /* * 'what should we do if we get a hw irq event on an illegal vector'. 
==================================================================--- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/personality.h> #include <linux/tick.h> +#include <linux/percpu.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -57,7 +58,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/pda.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -65,6 +65,12 @@ static int hlt_counter; unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); /* * Return saved PC of a blocked thread. @@ -342,7 +348,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xds = __USER_DS; regs.xes = __USER_DS; - regs.xfs = __KERNEL_PDA; + regs.xfs = __KERNEL_PERCPU; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -711,7 +717,7 @@ struct task_struct fastcall * __switch_t if (prev->gs | next->gs) loadsegment(gs, next->gs); - write_pda(pcurrent, next_p); + x86_write_percpu(current_task, next_p); return prev_p; } ==================================================================--- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -53,7 +53,6 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/nmi.h> -#include <asm/pda.h> #include <mach_apic.h> #include <mach_wakecpu.h> @@ -98,6 +97,9 @@ EXPORT_SYMBOL(x86_cpu_to_apicid); EXPORT_SYMBOL(x86_cpu_to_apicid); u8 apicid_2_node[MAX_APICID]; + +DEFINE_PER_CPU(unsigned long, this_cpu_off); +EXPORT_PER_CPU_SYMBOL(this_cpu_off); /* * Trampoline 80x86 program as an array. @@ -456,7 +458,6 @@ extern struct { void * esp; unsigned short ss; } stack_start; -extern struct i386_pda *start_pda; #ifdef CONFIG_NUMA @@ -784,20 +785,17 @@ static inline struct task_struct * alloc /* Initialize the CPU's GDT. This is either the boot CPU doing itself (still using the master per-cpu area), or a CPU doing it for a secondary which will soon come up. */ -static __cpuinit void init_gdt(int cpu, struct task_struct *idle) +static __cpuinit void init_gdt(int cpu) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - struct i386_pda *pda = &per_cpu(_cpu_pda, cpu); - - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ - - memset(pda, 0, sizeof(*pda)); - pda->_pda = pda; - pda->cpu_number = cpu; - pda->pcurrent = idle; + + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, + (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + __per_cpu_offset[cpu], 0xFFFFF, + 0x80 | DESCTYPE_S | 0x2, 0x8); + + per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; + per_cpu(cpu_number, cpu) = cpu; } /* Defined in head.S */ @@ -824,9 +822,9 @@ static int __cpuinit do_boot_cpu(int api if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - init_gdt(cpu, idle); + init_gdt(cpu); + per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); - start_pda = cpu_pda(cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! 
*/ @@ -1188,14 +1186,14 @@ static inline void switch_to_new_gdt(voi gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } void __init native_smp_prepare_boot_cpu(void) { unsigned int cpu = smp_processor_id(); - init_gdt(cpu, current); + init_gdt(cpu); switch_to_new_gdt(); cpu_set(cpu, cpu_online_map); ==================================================================--- a/arch/i386/kernel/vmi.c +++ b/arch/i386/kernel/vmi.c @@ -504,8 +504,6 @@ static void vmi_pmd_clear(pmd_t *pmd) #endif #ifdef CONFIG_SMP -extern void setup_pda(void); - static void __devinit vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) @@ -530,12 +528,10 @@ vmi_startup_ipi_hook(int phys_apicid, un ap.ds = __USER_DS; ap.es = __USER_DS; - ap.fs = __KERNEL_PDA; + ap.fs = __KERNEL_PERCPU; ap.gs = 0; ap.eflags = 0; - - setup_pda(); #ifdef CONFIG_X86_PAE /* efer should match BSP efer. */ ==================================================================--- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -26,7 +26,6 @@ OUTPUT_ARCH(i386) OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; -_proxy_pda = 0; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ ==================================================================--- a/include/asm-i386/current.h +++ b/include/asm-i386/current.h @@ -1,14 +1,15 @@ #ifndef _I386_CURRENT_H #define _I386_CURRENT_H -#include <asm/pda.h> #include <linux/compiler.h> +#include <asm/percpu.h> struct task_struct; +DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return read_pda(pcurrent); + return x86_read_percpu(current_task); } #define current get_current() ==================================================================--- a/include/asm-i386/irq_regs.h +++ b/include/asm-i386/irq_regs.h @@ -1,25 +1,27 @@ /* * Per-cpu current frame pointer - the location of the last exception frame on - * the stack, stored in the PDA. + * the stack, stored in the per-cpu area. * * Jeremy Fitzhardinge <jeremy@goop.org> */ #ifndef _ASM_I386_IRQ_REGS_H #define _ASM_I386_IRQ_REGS_H -#include <asm/pda.h> +#include <asm/percpu.h> + +DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return read_pda(irq_regs); + return x86_read_percpu(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) { struct pt_regs *old_regs; - old_regs = read_pda(irq_regs); - write_pda(irq_regs, new_regs); + old_regs = get_irq_regs(); + x86_write_percpu(irq_regs, new_regs); return old_regs; } ==================================================================--- a/include/asm-i386/pda.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - Per-processor Data Areas - Jeremy Fitzhardinge <jeremy@goop.org> 2006 - Based on asm-x86_64/pda.h by Andi Kleen. 
- */ -#ifndef _I386_PDA_H -#define _I386_PDA_H - -#include <linux/stddef.h> -#include <linux/types.h> -#include <asm/percpu.h> - -struct i386_pda -{ - struct i386_pda *_pda; /* pointer to self */ - - int cpu_number; - struct task_struct *pcurrent; /* current process */ - struct pt_regs *irq_regs; -}; - -DECLARE_PER_CPU(struct i386_pda, _cpu_pda); -#define cpu_pda(i) (&per_cpu(_cpu_pda, (i))) -#define pda_offset(field) offsetof(struct i386_pda, field) - -extern void __bad_pda_field(void); - -/* This variable is never instantiated. It is only used as a stand-in - for the real per-cpu PDA memory, so that gcc can understand what - memory operations the inline asms() below are performing. This - eliminates the need to make the asms volatile or have memory - clobbers, so gcc can readily analyse them. */ -extern struct i386_pda _proxy_pda; - -#define pda_to_op(op,field,val) \ - do { \ - typedef typeof(_proxy_pda.field) T__; \ - if (0) { T__ tmp__; tmp__ = (val); } \ - switch (sizeof(_proxy_pda.field)) { \ - case 1: \ - asm(op "b %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - case 2: \ - asm(op "w %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - case 4: \ - asm(op "l %1,%%fs:%c2" \ - : "+m" (_proxy_pda.field) \ - :"ri" ((T__)val), \ - "i"(pda_offset(field))); \ - break; \ - default: __bad_pda_field(); \ - } \ - } while (0) - -#define pda_from_op(op,field) \ - ({ \ - typeof(_proxy_pda.field) ret__; \ - switch (sizeof(_proxy_pda.field)) { \ - case 1: \ - asm(op "b %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - case 2: \ - asm(op "w %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - case 4: \ - asm(op "l %%fs:%c1,%0" \ - : "=r" (ret__) \ - : "i" (pda_offset(field)), \ - "m" (_proxy_pda.field)); \ - break; \ - default: __bad_pda_field(); \ - } \ - ret__; }) - -/* Return a pointer to a pda field */ -#define pda_addr(field) \ - ((typeof(_proxy_pda.field) *)((unsigned char *)read_pda(_pda) + \ - pda_offset(field))) - -#define read_pda(field) pda_from_op("mov",field) -#define write_pda(field,val) pda_to_op("mov",field,val) -#define add_pda(field,val) pda_to_op("add",field,val) -#define sub_pda(field,val) pda_to_op("sub",field,val) -#define or_pda(field,val) pda_to_op("or",field,val) - -#endif /* _I386_PDA_H */ ==================================================================--- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -1,9 +1,30 @@ #ifndef __ARCH_I386_PERCPU__ #define __ARCH_I386_PERCPU__ -#ifndef __ASSEMBLY__ -#include <asm-generic/percpu.h> -#else +#ifdef __ASSEMBLY__ + +/* + * PER_CPU finds an address of a per-cpu variable. + * + * Args: + * var - variable name + * reg - 32bit register + * + * The resulting address is stored in the "reg" argument. + * + * Example: + * PER_CPU(cpu_gdt_descr, %ebx) + */ +#ifdef CONFIG_SMP +#define PER_CPU(var, reg) \ + movl %fs:per_cpu__this_cpu_off, reg; \ + addl $per_cpu__##var, reg +#else /* ! SMP */ +#define PER_CPU(var, reg) \ + movl $per_cpu__##var, reg; +#endif /* SMP */ + +#else /* ...!ASSEMBLY */ /* * PER_CPU finds an address of a per-cpu variable. @@ -18,14 +39,107 @@ * PER_CPU(cpu_gdt_descr, %ebx) */ #ifdef CONFIG_SMP -#define PER_CPU(var, cpu) \ - movl __per_cpu_offset(,cpu,4), cpu; \ - addl $per_cpu__##var, cpu; -#else /* ! 
SMP */ -#define PER_CPU(var, cpu) \ - movl $per_cpu__##var, cpu; +/* Same as generic implementation except for optimized local access. */ +#define __GENERIC_PER_CPU + +/* This is used for other cpus to find our section. */ +extern unsigned long __per_cpu_offset[]; + +/* Separate out the type, so (int[3], foo) works. */ +#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name +#define DEFINE_PER_CPU(type, name) \ + __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name + +/* We can use this directly for local CPU (faster). */ +DECLARE_PER_CPU(unsigned long, this_cpu_off); + +/* var is in discarded region: offset to particular copy we want */ +#define per_cpu(var, cpu) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); })) + +#define __raw_get_cpu_var(var) (*({ \ + extern int simple_indentifier_##var(void); \ + RELOC_HIDE(&per_cpu__##var, x86_read_percpu(this_cpu_off)); \ +})) + +#define __get_cpu_var(var) __raw_get_cpu_var(var) + +/* A macro to avoid #include hell... */ +#define percpu_modcopy(pcpudst, src, size) \ +do { \ + unsigned int __i; \ + for_each_possible_cpu(__i) \ + memcpy((pcpudst)+__per_cpu_offset[__i], \ + (src), (size)); \ +} while (0) + +#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) +#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) + +/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */ +#define __percpu_seg "%%fs:" +#else /* !SMP */ +#include <asm-generic/percpu.h> +#define __percpu_seg "" #endif /* SMP */ +/* For arch-specific code, we can use direct single-insn ops (they + * don't give an lvalue though). */ +extern void __bad_percpu_size(void); + +#define percpu_to_op(op,var,val) \ + do { \ + typedef typeof(var) T__; \ + if (0) { T__ tmp__; tmp__ = (val); } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 2: \ + asm(op "w %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + case 4: \ + asm(op "l %1,"__percpu_seg"%0" \ + : "+m" (var) \ + :"ri" ((T__)val)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + } while (0) + +#define percpu_from_op(op,var) \ + ({ \ + typeof(var) ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_seg"%1,%0" \ + : "=r" (ret__) \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + ret__; }) + +#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var) +#define x86_write_percpu(var,val) percpu_to_op("mov", per_cpu__##var, val) +#define x86_add_percpu(var,val) percpu_to_op("add", per_cpu__##var, val) +#define x86_sub_percpu(var,val) percpu_to_op("sub", per_cpu__##var, val) +#define x86_or_percpu(var,val) percpu_to_op("or", per_cpu__##var, val) #endif /* !__ASSEMBLY__ */ #endif /* __ARCH_I386_PERCPU__ */ ==================================================================--- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -377,7 +377,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PDA, \ + .fs = __KERNEL_PERCPU, \ } /* ==================================================================--- a/include/asm-i386/segment.h +++ b/include/asm-i386/segment.h @@ -39,7 
+39,7 @@ * 25 - APM BIOS support * * 26 - ESPFIX small SS - * 27 - PDA [ per-cpu private data area ] + * 27 - per-cpu [ offset to per-cpu data area ] * 28 - unused * 29 - unused * 30 - unused @@ -74,8 +74,8 @@ #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) #define GDT_ENTRY_DOUBLEFAULT_TSS 31 ==================================================================--- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/cpumask.h> -#include <asm/pda.h> #endif #if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) @@ -112,7 +111,8 @@ do { } while (0) * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (read_pda(cpu_number)) +DECLARE_PER_CPU(int, cpu_number); +#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) extern cpumask_t cpu_callout_map; extern cpumask_t cpu_callin_map; ==================================================================--- a/include/asm-i386/unwind.h +++ b/include/asm-i386/unwind.h @@ -71,7 +71,7 @@ static inline void arch_unw_init_blocked info->regs.xss = __KERNEL_DS; info->regs.xds = __USER_DS; info->regs.xes = __USER_DS; - info->regs.xfs = __KERNEL_PDA; + info->regs.xfs = __KERNEL_PERCPU; } extern asmlinkage int arch_unwind_init_running(struct unwind_frame_info *, --
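For readers new to the scheme above, the whole mechanism reduces to "every CPU's variables live at the template address plus that CPU's offset", with %fs plus the this_cpu_off mirror supplying the offset for the running CPU. The stand-alone analogy below models it with plain arrays and an explicit cpu argument; the kernel performs the same address arithmetic through RELOC_HIDE() so the compiler cannot draw unwanted conclusions from it.

#include <stdio.h>
#include <string.h>

#define NR_CPUS 4

/* The "template": one master copy of each per-cpu variable, as laid out
 * in the .data.percpu section, plus each CPU's private copy. */
static struct {
	int cpu_number;
	long this_cpu_off;
} template_area, cpu_area[NR_CPUS];

static long per_cpu_offset[NR_CPUS];		/* like __per_cpu_offset[] */

/* per_cpu(var, cpu): template address of the variable plus that CPU's offset */
#define per_cpu_ptr(field, cpu) \
	((__typeof__(&template_area.field)) \
	 ((char *)&template_area.field + per_cpu_offset[cpu]))

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		per_cpu_offset[cpu] =
			(char *)&cpu_area[cpu] - (char *)&template_area;
		memcpy(&cpu_area[cpu], &template_area, sizeof(template_area));
		*per_cpu_ptr(cpu_number, cpu) = cpu;
		*per_cpu_ptr(this_cpu_off, cpu) = per_cpu_offset[cpu];
	}

	/* "Local" access: the kernel reads this_cpu_off through the %fs
	 * segment and adds it; here we simply pretend to be CPU 2. */
	int me = 2;
	printf("cpu %d sees cpu_number=%d offset=%ld\n",
	       me, *per_cpu_ptr(cpu_number, me), *per_cpu_ptr(this_cpu_off, me));
	return 0;
}

On SMP this offset addition is what the reworked PER_CPU() assembler macro and the %fs-relative x86_read_percpu()/x86_write_percpu() accessors do; in the !SMP case the variables are simply addressed directly, with no offset at all.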
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 01/28] revert account-for-module-percpu-space-separately-from-kernel-percpu
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- include/asm-i386/percpu.h | 10 ---------- 1 file changed, 10 deletions(-) ==================================================================--- a/include/asm-i386/percpu.h +++ b/include/asm-i386/percpu.h @@ -4,16 +4,6 @@ #ifndef __ASSEMBLY__ #include <asm-generic/percpu.h> #else - -#ifdef CONFIG_MODULES -# define PERCPU_MODULE_RESERVE 8192 -#else -# define PERCPU_MODULE_RESERVE 0 -#endif - -#define PERCPU_ENOUGH_ROOM \ - (ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES) + \ - PERCPU_MODULE_RESERVE) /* * PER_CPU finds an address of a per-cpu variable. --
Jeremy Fitzhardinge
2007-Apr-18 13:02 UTC
[PATCH 11/28] x86: incremental update for i386 and x86-64 check_bugs
i386 bugs.c shouldn't refer to identify_boot_cpu yet, since it doesn't get introduced until the identify_cpu patch. Remove spurious comments, headers and keywords from x86-64 bugs.[ch]. Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> --- arch/i386/kernel/cpu/bugs.c | 2 +- arch/x86_64/kernel/bugs.c | 9 +-------- include/asm-i386/bugs.h | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) ==================================================================--- a/arch/i386/kernel/cpu/bugs.c +++ b/arch/i386/kernel/cpu/bugs.c @@ -177,7 +177,7 @@ static void __init check_config(void) void __init check_bugs(void) { - identify_boot_cpu(); + identify_cpu(&boot_cpu_data); #ifndef CONFIG_SMP printk("CPU: "); print_cpu_info(&boot_cpu_data); ==================================================================--- a/arch/x86_64/kernel/bugs.c +++ b/arch/x86_64/kernel/bugs.c @@ -3,19 +3,12 @@ * * Copyright (C) 1994 Linus Torvalds * Copyright (C) 2000 SuSE - * - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); */ #include <linux/kernel.h> +#include <linux/init.h> #include <asm/alternative.h> #include <asm/processor.h> -#include <asm/i387.h> -#include <asm/msr.h> -#include <asm/pda.h> void __init check_bugs(void) { ==================================================================--- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -7,6 +7,6 @@ #ifndef _ASM_I386_BUG_H #define _ASM_I386_BUG_H -extern void __init check_bugs(void); +void check_bugs(void); #endif /* _ASM_I386_BUG_H */ --
Chuck Ebbert
2007-Apr-19 13:48 UTC
[PATCH 10/28] i386: map enough initial memory to create lowmem mappings
Jeremy Fitzhardinge wrote:> head.S creates the very initial pagetable for the kernel. This just > maps enough space for the kernel itself, and an allocation bitmap. > The amount of mapped memory is rounded up to 4Mbytes, and so this > typically ends up mapping 8Mbytes of memory. > > When booting, pagetable_init() needs to create mappings for all > lowmem, and the pagetables for these mappings are allocated from the > free pages around the kernel in low memory. If the number of > pagetable pages + kernel size exceeds head.S's initial mapping, it > will end up faulting on an unmapped page. This will only happen with > specific combinations of kernel size and memory size. > > This patch makes sure that head.S also maps enough space to fit the > kernel pagetables as well as the kernel itself. It ends up using an > additional two pages of unreclaimable memory. > > Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com> > Acked-by: "H. Peter Anvin" <hpa@zytor.com> > Cc: Andi Kleen <ak@suse.de> > Cc: Zachary Amsden <zach@vmware.com> > Cc: Chris Wright <chrisw@sous-sol.org> > Cc: "Eric W. Biederman" <ebiederm@xmission.com> > Cc: Linus Torvalds <torvalds@linux-foundation.org>,Is some version of this going in for 2.6.21, or is it not a real problem?
Andi Kleen
2007-Apr-23 10:55 UTC
[PATCH 10/28] i386: map enough initial memory to create lowmem mappings
On Monday 23 April 2007 19:45:41 H. Peter Anvin wrote:> Eric W. Biederman wrote: > > > > - I know of one system that had BIOS tables at 16MB I believe (and > > thus had a fairly low hole). > > > > Please name names, otherwise this is just rumouring. Seriously. We > have enough cargo-cult programming as it is.Unisys did this at some point in their large machines, but they fixed that in a later BIOS. Anyways, boot up should be robust against any holes if possible. -Andi