This is the kernel-side code for the 3-level event channel ABI, which corresponds to RFC V5 on the Xen side. Some notable changes: * More code is shared between the 2-level and 3-level ABIs. * cpu_evtchn_mask is allocated dynamically in the CPU hotplug bring-up path. Diffstat: arch/x86/xen/enlighten.c | 12 + drivers/xen/events.c | 930 +++++++++++++++++++++++++++------ drivers/xen/evtchn.c | 13 +- include/xen/events.h | 12 + include/xen/interface/event_channel.h | 46 +- include/xen/interface/xen.h | 13 +- 6 files changed, 847 insertions(+), 179 deletions(-)
This typedef slipped into the Linux header file; remove it.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 include/xen/interface/event_channel.h |    1 -
 1 file changed, 1 deletion(-)
diff --git a/include/xen/interface/event_channel.h
b/include/xen/interface/event_channel.h
index f494292..293c3f0 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -188,7 +188,6 @@ struct evtchn_reset {
 	/* IN parameters. */
 	domid_t dom;
 };
-typedef struct evtchn_reset evtchn_reset_t;
 
 struct evtchn_op {
 	uint32_t cmd; /* EVTCHNOP_* */
-- 
1.7.10.4
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |   24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 6b78378..90ac37a 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -1212,7 +1212,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 
 	spin_lock_irqsave(&debug_lock, flags);
 
-	printk("\nvcpu %d\n  ", cpu);
+	printk(KERN_DEBUG "\nvcpu %d\n  ", cpu);
 
 	for_each_online_cpu(i) {
 		int pending;
@@ -1220,27 +1220,27 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 		pending = (get_irq_regs() && i == cpu)
 			? xen_irqs_disabled(get_irq_regs())
 			: v->evtchn_upcall_mask;
-		printk("%d: masked=%d pending=%d event_sel
%0*"PRI_xen_ulong"\n  ", i,
+		printk(KERN_DEBUG "%d: masked=%d pending=%d event_sel
%0*"PRI_xen_ulong"\n  ", i,
 		       pending, v->evtchn_upcall_pending,
 		       (int)(sizeof(v->evtchn_pending_sel)*2),
 		       v->evtchn_pending_sel);
 	}
 	v = per_cpu(xen_vcpu, cpu);
 
-	printk("\npending:\n   ");
+	printk(KERN_DEBUG "\npending:\n   ");
 	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
-		printk("%0*"PRI_xen_ulong"%s",
+		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
 		       (int)sizeof(sh->evtchn_pending[0])*2,
 		       sh->evtchn_pending[i],
 		       i % 8 == 0 ? "\n   " : " ");
-	printk("\nglobal mask:\n   ");
+	printk(KERN_DEBUG "\nglobal mask:\n   ");
 	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
-		printk("%0*"PRI_xen_ulong"%s",
+		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
 		       (int)(sizeof(sh->evtchn_mask[0])*2),
 		       sh->evtchn_mask[i],
 		       i % 8 == 0 ? "\n   " : " ");
 
-	printk("\nglobally unmasked:\n   ");
+	printk(KERN_DEBUG "\nglobally unmasked:\n   ");
 	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
 		printk("%0*"PRI_xen_ulong"%s",
 		       (int)(sizeof(sh->evtchn_mask[0])*2),
@@ -1249,25 +1249,25 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
 	for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
-		printk("%0*"PRI_xen_ulong"%s",
(int)(sizeof(cpu_evtchn[0])*2),
+		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
(int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
 
-	printk("\nlocally unmasked:\n   ");
+	printk(KERN_DEBUG "\nlocally unmasked:\n   ");
 	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
 		xen_ulong_t pending = sh->evtchn_pending[i]
 			& ~sh->evtchn_mask[i]
 			& cpu_evtchn[i];
-		printk("%0*"PRI_xen_ulong"%s",
+		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
 		       (int)(sizeof(sh->evtchn_mask[0])*2),
 		       pending, i % 8 == 0 ? "\n   " : " ");
 	}
 
-	printk("\npending list:\n");
+	printk(KERN_DEBUG "\npending list:\n");
 	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
 		if (sync_test_bit(i, BM(sh->evtchn_pending))) {
 			int word_idx = i / BITS_PER_EVTCHN_WORD;
-			printk("  %d: event %d -> irq %d%s%s%s\n",
+			printk(KERN_DEBUG "  %d: event %d -> irq %d%s%s%s\n",
 			       cpu_from_evtchn(i), i,
 			       evtchn_to_irq[i],
 			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
-- 
1.7.10.4
Refer to the per-cpu selector as L1, to be consistent with the description in __xen_evtchn_do_upcall's comment. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 90ac37a..38e30aa 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1271,7 +1271,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) cpu_from_evtchn(i), i, evtchn_to_irq[i], sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", + ? "" : " l1-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) ? "" : " globally-masked", sync_test_bit(i, BM(cpu_evtchn)) -- 1.7.10.4
Stay in sync with Xen public headers:
* event_channel.h:
  * EVTCHNOP_query_extended_abis
  * EVTCHNOP_register_3level
* xen.h:
  * NR_EVENT_CHANNEL*
EVTCHNOP_query_extended_abis is pretty self-explanatory.
The other structure and macro definitions belong to the 3-level event channel ABI.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 include/xen/interface/event_channel.h |   45 +++++++++++++++++++++++++++++++++
 include/xen/interface/xen.h           |   13 +++++++++-
 2 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/include/xen/interface/event_channel.h
b/include/xen/interface/event_channel.h
index 293c3f0..155454e 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -189,6 +189,51 @@ struct evtchn_reset {
 	domid_t dom;
 };
 
+/*
+ * EVTCHNOP_query_extended_abis: Query the hypervisor for supported extended
+ * event channel ABIs.
+ */
+#define EVTCHNOP_query_extended_abis 11
+#define EVTCHN_EXTENDED_NONE 0
+#define _EVTCHN_EXTENDED_L3  1
+#define EVTCHN_EXTENDED_L3   (1UL << _EVTCHN_EXTENDED_L3)
+struct evtchn_query_extended_abis {
+	/* OUT parameters. */
+	uint64_t abis;
+};
+
+/*
+ * EVTCHNOP_register_3level: Register 3-level event channel.
+ */
+#define EVTCHNOP_register_3level 12
+/*
+ * 64 bits guests need 8 pages for evtchn_pending and evtchn_mask for 256k
+ * event channels while 32 bits ones only need 1 page for 32k event channels.
+ */
+#define EVTCHN_MAX_L3_PAGES  8
+/*
+ * A guest should register the bitmaps first, then register L2 selector for
+ * individual cpu.
+ */
+#define REGISTER_BITMAPS     1
+#define REGISTER_L2_SELECTOR 2
+struct evtchn_register_3level {
+	/* IN parameters. */
+	uint32_t cmd;
+	union {
+		struct {
+			uint32_t nr_pages;
+			GUEST_HANDLE(xen_pfn_t) evtchn_pending;
+			GUEST_HANDLE(xen_pfn_t) evtchn_mask;
+		} bitmaps;
+		struct {
+			uint32_t  cpu_id;
+			xen_pfn_t mfn;    /* mfn for L2 selector */
+			xen_pfn_t offset; /* offset of L2 selector */
+		} l2_selector;
+	} u;
+};
+
 struct evtchn_op {
 	uint32_t cmd; /* EVTCHNOP_* */
 	union {
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 53ec416..9b0248d 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -283,9 +283,20 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
 
 /*
  * Event channel endpoints per domain:
+ * 2-level for x86:
  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ * 3-level for x86:
+ *  32k if a long is 32 bits; 256k if a long is 64 bits.
+ * 2-level for ARM:
+ *  4096 for both 32 bits and 64 bits.
+ * 3-level for ARM:
+ *  256k for both 32 bits and 64 bits.
  */
-#define NR_EVENT_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
+#define NR_EVENT_CHANNELS_L2 (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(xen_ulong_t) * 8)
+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
+#define NR_EVENT_CHANNELS NR_EVENT_CHANNELS_L2 /* for compatibility */
+#endif
 
 struct vcpu_time_info {
 	/*
-- 
1.7.10.4
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |   10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 38e30aa..eca6488 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -369,6 +369,12 @@ static inline int test_evtchn(int port)
 	return sync_test_bit(port, BM(&s->evtchn_pending[0]));
 }
 
+static inline int test_and_set_mask(int port)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
+}
+
 
 /**
  * notify_remote_via_irq - send event to remote end of event channel via irq
@@ -1506,7 +1512,7 @@ int resend_irq_on_evtchn(unsigned int irq)
 	if (!VALID_EVTCHN(evtchn))
 		return 1;
 
-	masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask));
+	masked = test_and_set_mask(evtchn);
 	sync_set_bit(evtchn, BM(s->evtchn_pending));
 	if (!masked)
 		unmask_evtchn(evtchn);
@@ -1555,7 +1561,7 @@ static int retrigger_dynirq(struct irq_data *data)
 	if (VALID_EVTCHN(evtchn)) {
 		int masked;
 
-		masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask));
+		masked = test_and_set_mask(evtchn);
 		sync_set_bit(evtchn, BM(sh->evtchn_pending));
 		if (!masked)
 			unmask_evtchn(evtchn);
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 06/14] xen: replace raw bit ops with functions
There is already a function called set_evtchn() for that job.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |    6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eca6488..6e226c3 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -1507,13 +1507,12 @@ static int set_affinity_irq(struct irq_data *data, const
struct cpumask *dest,
 int resend_irq_on_evtchn(unsigned int irq)
 {
 	int masked, evtchn = evtchn_from_irq(irq);
-	struct shared_info *s = HYPERVISOR_shared_info;
 
 	if (!VALID_EVTCHN(evtchn))
 		return 1;
 
 	masked = test_and_set_mask(evtchn);
-	sync_set_bit(evtchn, BM(s->evtchn_pending));
+	set_evtchn(evtchn);
 	if (!masked)
 		unmask_evtchn(evtchn);
 
@@ -1555,14 +1554,13 @@ static void mask_ack_dynirq(struct irq_data *data)
 static int retrigger_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
-	struct shared_info *sh = HYPERVISOR_shared_info;
 	int ret = 0;
 
 	if (VALID_EVTCHN(evtchn)) {
 		int masked;
 
 		masked = test_and_set_mask(evtchn);
-		sync_set_bit(evtchn, BM(sh->evtchn_pending));
+		set_evtchn(evtchn);
 		if (!masked)
 			unmask_evtchn(evtchn);
 		ret = 1;
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 07/14] xen: generalized event channel operations
Use global pointers in common operations to allow for better code sharing
between the 2-level and 3-level event channel ABIs.
Function pointers are used to deal with functions which are not suitable for
sharing.
Also update drivers/xen/evtchn.c to use the exported variable
xen_nr_event_channels instead of the NR_EVENT_CHANNELS macro.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |  199 +++++++++++++++++++++++++++++++-------------------
 drivers/xen/evtchn.c |   13 ++--
 include/xen/events.h |    3 +
 3 files changed, 135 insertions(+), 80 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 6e226c3..217efb2 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -56,6 +56,27 @@
 #include <xen/interface/sched.h>
 #include <asm/hw_irq.h>
 
+/* extended event channel ABI in use, default is EVTCHN_EXTENDED_NONE */
+uint64_t xen_evtchn_extended = EVTCHN_EXTENDED_NONE;
+EXPORT_SYMBOL_GPL(xen_evtchn_extended);
+/* number of event channels */
+unsigned int xen_nr_event_channels;
+EXPORT_SYMBOL_GPL(xen_nr_event_channels);
+
+struct evtchn_ops {
+	void (*unmask)(int port);
+	irqreturn_t (*debug_interrupt)(int irq, void *dev_id);
+	void (*do_upcall)(void);
+};
+
+static const struct evtchn_ops *eops;
+
+/* The following pointers point to pending bitmap and mask bitmap. */
+static xen_ulong_t *evtchn_pending;
+static xen_ulong_t *evtchn_mask;
+/* The following per-cpu var points to selector(s). */
+static DEFINE_PER_CPU(xen_ulong_t *[1], evtchn_sel);
+
 /*
  * This lock protects updates to the following mapping and reference-count
  * arrays. The lock does not need to be acquired to read the mapping tables.
@@ -135,7 +156,7 @@ static bool (*pirq_needs_eoi)(unsigned irq);
 /* Find the first set bit in a evtchn mask */
 #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
 
-static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD],
+static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD],
 		      cpu_evtchn_mask);
 
 /* Xen will never allocate port zero for any purpose. */
@@ -310,12 +331,11 @@ static bool pirq_needs_eoi_flag(unsigned irq)
 }
 
 static inline xen_ulong_t active_evtchns(unsigned int cpu,
-					 struct shared_info *sh,
 					 unsigned int idx)
 {
-	return sh->evtchn_pending[idx] &
+	return evtchn_pending[idx] &
 		per_cpu(cpu_evtchn_mask, cpu)[idx] &
-		~sh->evtchn_mask[idx];
+		~evtchn_mask[idx];
 }
 
 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
@@ -353,26 +373,22 @@ static void init_evtchn_cpu_bindings(void)
 
 static inline void clear_evtchn(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
-	sync_clear_bit(port, BM(&s->evtchn_pending[0]));
+	sync_clear_bit(port, BM(&evtchn_pending[0]));
 }
 
 static inline void set_evtchn(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
-	sync_set_bit(port, BM(&s->evtchn_pending[0]));
+	sync_set_bit(port, BM(&evtchn_pending[0]));
 }
 
 static inline int test_evtchn(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
-	return sync_test_bit(port, BM(&s->evtchn_pending[0]));
+	return sync_test_bit(port, BM(&evtchn_pending[0]));
 }
 
 static inline int test_and_set_mask(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
-	return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
+	return sync_test_and_set_bit(port, BM(&evtchn_mask[0]));
 }
 
 
@@ -395,24 +411,40 @@ EXPORT_SYMBOL_GPL(notify_remote_via_irq);
 
 static void mask_evtchn(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
-	sync_set_bit(port, BM(&s->evtchn_mask[0]));
+	sync_set_bit(port, BM(&evtchn_mask[0]));
+}
+
+static inline void __unmask_local_port_l2(int port)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	int cpu = smp_processor_id();
+
+	sync_clear_bit(port, BM(&evtchn_mask[0]));
+
+	/*
+	 * The following is basically the equivalent of
+	 * ''hw_resend_irq''. Just like a real IO-APIC we
''lose
+	 * the interrupt edge'' if the channel is masked.
+	 */
+	if (sync_test_bit(port, BM(&evtchn_pending[0])) &&
+	    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
+				   BM(per_cpu(evtchn_sel, cpu)[0])))
+		vcpu_info->evtchn_upcall_pending = 1;
 }
 
 static void unmask_evtchn(int port)
 {
-	struct shared_info *s = HYPERVISOR_shared_info;
 	unsigned int cpu = get_cpu();
-	int do_hypercall = 0, evtchn_pending = 0;
+	int do_hypercall = 0, _evtchn_pending = 0;
 
 	BUG_ON(!irqs_disabled());
 
 	if (unlikely((cpu != cpu_from_evtchn(port))))
 		do_hypercall = 1;
 	else
-		evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0]));
+		_evtchn_pending = sync_test_bit(port, BM(&evtchn_pending[0]));
 
-	if (unlikely(evtchn_pending && xen_hvm_domain()))
+	if (unlikely(_evtchn_pending && xen_hvm_domain()))
 		do_hypercall = 1;
 
 	/* Slow path (hypercall) if this is a non-local port or if this is
@@ -421,21 +453,8 @@ static void unmask_evtchn(int port)
 	if (do_hypercall) {
 		struct evtchn_unmask unmask = { .port = port };
 		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
-	} else {
-		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
-
-		sync_clear_bit(port, BM(&s->evtchn_mask[0]));
-
-		/*
-		 * The following is basically the equivalent of
-		 * ''hw_resend_irq''. Just like a real IO-APIC we
''lose
-		 * the interrupt edge'' if the channel is masked.
-		 */
-		if (evtchn_pending &&
-		    !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD,
-					   BM(&vcpu_info->evtchn_pending_sel)))
-			vcpu_info->evtchn_upcall_pending = 1;
-	}
+	} else
+		eops->unmask(port);
 
 	put_cpu();
 }
@@ -938,7 +957,7 @@ static int find_virq(unsigned int virq, unsigned int cpu)
 	int port, rc = -ENOENT;
 
 	memset(&status, 0, sizeof(status));
-	for (port = 0; port <= NR_EVENT_CHANNELS; port++) {
+	for (port = 0; port <= xen_nr_event_channels; port++) {
 		status.dom = DOMID_SELF;
 		status.port = port;
 		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
@@ -1163,7 +1182,7 @@ int evtchn_get(unsigned int evtchn)
 	struct irq_info *info;
 	int err = -ENOENT;
 
-	if (evtchn >= NR_EVENT_CHANNELS)
+	if (evtchn >= xen_nr_event_channels)
 		return -EINVAL;
 
 	mutex_lock(&irq_mapping_update_lock);
@@ -1208,13 +1227,12 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector
vector)
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 {
-	struct shared_info *sh = HYPERVISOR_shared_info;
-	int cpu = smp_processor_id();
-	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
-	int i;
-	unsigned long flags;
+	irqreturn_t rc;
 	static DEFINE_SPINLOCK(debug_lock);
+	unsigned long flags;
+	int cpu = smp_processor_id();
 	struct vcpu_info *v;
+	int i;
 
 	spin_lock_irqsave(&debug_lock, flags);
 
@@ -1228,65 +1246,80 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 			: v->evtchn_upcall_mask;
 		printk(KERN_DEBUG "%d: masked=%d pending=%d event_sel
%0*"PRI_xen_ulong"\n  ", i,
 		       pending, v->evtchn_upcall_pending,
-		       (int)(sizeof(v->evtchn_pending_sel)*2),
-		       v->evtchn_pending_sel);
+		       (int)(sizeof(*per_cpu(evtchn_sel, cpu)[0])*2),
+		       *per_cpu(evtchn_sel, cpu)[0]);
 	}
+
+	rc = eops->debug_interrupt(irq, dev_id);
+
+	spin_unlock_irqrestore(&debug_lock, flags);
+	return rc;
+}
+
+static irqreturn_t xen_debug_interrupt_l2(int irq, void *dev_id)
+{
+	int cpu = smp_processor_id();
+	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	int i;
+	unsigned long nr_elems = NR_EVENT_CHANNELS_L2 / BITS_PER_EVTCHN_WORD;
+	struct vcpu_info *v;
+
 	v = per_cpu(xen_vcpu, cpu);
 
 	printk(KERN_DEBUG "\npending:\n   ");
-	for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+	for (i = nr_elems; i >= 0; i--)
 		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
-		       (int)sizeof(sh->evtchn_pending[0])*2,
-		       sh->evtchn_pending[i],
+		       (int)sizeof(evtchn_pending[0])*2,
+		       evtchn_pending[i],
 		       i % 8 == 0 ? "\n   " : " ");
 	printk(KERN_DEBUG "\nglobal mask:\n   ");
-	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+	for (i = nr_elems; i >= 0; i--)
 		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
-		       (int)(sizeof(sh->evtchn_mask[0])*2),
-		       sh->evtchn_mask[i],
+		       (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_mask[i],
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk(KERN_DEBUG "\nglobally unmasked:\n   ");
-	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+	for (i = nr_elems; i >= 0; i--)
 		printk("%0*"PRI_xen_ulong"%s",
-		       (int)(sizeof(sh->evtchn_mask[0])*2),
-		       sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+		       (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_pending[i] & ~evtchn_mask[i],
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
-		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
(int)(sizeof(cpu_evtchn[0])*2),
+	for (i = (NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--)
+		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
+		       (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk(KERN_DEBUG "\nlocally unmasked:\n   ");
-	for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
-		xen_ulong_t pending = sh->evtchn_pending[i]
-			& ~sh->evtchn_mask[i]
+	for (i = nr_elems-1; i >= 0; i--) {
+		xen_ulong_t pending = evtchn_pending[i]
+			& ~evtchn_mask[i]
 			& cpu_evtchn[i];
 		printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s",
-		       (int)(sizeof(sh->evtchn_mask[0])*2),
+		       (int)(sizeof(evtchn_mask[0])*2),
 		       pending, i % 8 == 0 ? "\n   " : " ");
 	}
 
 	printk(KERN_DEBUG "\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
-		if (sync_test_bit(i, BM(sh->evtchn_pending))) {
+	for (i = 0; i < NR_EVENT_CHANNELS_L2; i++) {
+		if (sync_test_bit(i, BM(evtchn_pending))) {
 			int word_idx = i / BITS_PER_EVTCHN_WORD;
 			printk(KERN_DEBUG "  %d: event %d -> irq %d%s%s%s\n",
 			       cpu_from_evtchn(i), i,
 			       evtchn_to_irq[i],
-			       sync_test_bit(word_idx, BM(&v->evtchn_pending_sel))
+			       sync_test_bit(word_idx,
+				             BM(per_cpu(evtchn_sel, cpu)[0]))
 					     ? "" : " l1-clear",
-			       !sync_test_bit(i, BM(sh->evtchn_mask))
+			       !sync_test_bit(i, BM(evtchn_mask))
 					     ? "" : " globally-masked",
 			       sync_test_bit(i, BM(cpu_evtchn))
 					     ? "" : " locally-masked");
 		}
 	}
 
-	spin_unlock_irqrestore(&debug_lock, flags);
-
 	return IRQ_HANDLED;
 }
 
@@ -1308,13 +1341,12 @@ static DEFINE_PER_CPU(unsigned int, current_bit_idx);
  * a bitset of words which contain pending event bits.  The second
  * level is a bitset of pending events themselves.
  */
-static void __xen_evtchn_do_upcall(void)
+static void __xen_evtchn_do_upcall_l2(void)
 {
 	int start_word_idx, start_bit_idx;
 	int word_idx, bit_idx;
 	int i;
 	int cpu = get_cpu();
-	struct shared_info *s = HYPERVISOR_shared_info;
 	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
 	unsigned count;
 
@@ -1331,7 +1363,7 @@ static void __xen_evtchn_do_upcall(void)
 		 * selector flag. xchg_xen_ulong must contain an
 		 * appropriate barrier.
 		 */
-		pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0);
+		pending_words = xchg_xen_ulong(per_cpu(evtchn_sel, cpu)[0], 0);
 
 		start_word_idx = __this_cpu_read(current_word_idx);
 		start_bit_idx = __this_cpu_read(current_bit_idx);
@@ -1354,7 +1386,7 @@ static void __xen_evtchn_do_upcall(void)
 			}
 			word_idx = EVTCHN_FIRST_BIT(words);
 
-			pending_bits = active_evtchns(cpu, s, word_idx);
+			pending_bits = active_evtchns(cpu, word_idx);
 			bit_idx = 0; /* usually scan entire word from start */
 			if (word_idx == start_word_idx) {
 				/* We scan the starting word in two parts */
@@ -1425,7 +1457,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 	exit_idle();
 #endif
 
-	__xen_evtchn_do_upcall();
+	eops->do_upcall();
 
 	irq_exit();
 	set_irq_regs(old_regs);
@@ -1433,7 +1465,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 
 void xen_hvm_evtchn_do_upcall(void)
 {
-	__xen_evtchn_do_upcall();
+	eops->do_upcall();
 }
 EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
 
@@ -1729,14 +1761,14 @@ void xen_irq_resume(void)
 	init_evtchn_cpu_bindings();
 
 	/* New event-channel space is not ''live'' yet. */
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < xen_nr_event_channels; evtchn++)
 		mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
 	list_for_each_entry(info, &xen_irq_list_head, list)
 		info->evtchn = 0; /* zap event-channel binding */
 
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < xen_nr_event_channels; evtchn++)
 		evtchn_to_irq[evtchn] = -1;
 
 	for_each_possible_cpu(cpu) {
@@ -1829,20 +1861,39 @@ void xen_callback_vector(void)
 void xen_callback_vector(void) {}
 #endif
 
+const struct evtchn_ops evtchn_l2_ops = {
+	.unmask = __unmask_local_port_l2,
+	.debug_interrupt = xen_debug_interrupt_l2,
+	.do_upcall = __xen_evtchn_do_upcall_l2
+};
+
 void __init xen_init_IRQ(void)
 {
 	int i;
+	int cpu;
+	struct shared_info *s = HYPERVISOR_shared_info;
+
+	evtchn_pending = s->evtchn_pending;
+	evtchn_mask = s->evtchn_mask;
+	for_each_possible_cpu(cpu) {
+		struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu);
+		per_cpu(evtchn_sel, cpu)[0] = &vcpu_info->evtchn_pending_sel;
+	}
+
+	xen_evtchn_extended = EVTCHN_EXTENDED_NONE;
+	xen_nr_event_channels = NR_EVENT_CHANNELS_L2;
+	eops = &evtchn_l2_ops;
 
-	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+	evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq),
 				    GFP_KERNEL);
 	BUG_ON(!evtchn_to_irq);
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+	for (i = 0; i < xen_nr_event_channels; i++)
 		evtchn_to_irq[i] = -1;
 
 	init_evtchn_cpu_bindings();
 
 	/* No event channels are ''live'' right now. */
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+	for (i = 0; i < xen_nr_event_channels; i++)
 		mask_evtchn(i);
 
 	pirq_needs_eoi = pirq_needs_eoi_flag;
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index b2db77e..ac7a96e 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -232,7 +232,7 @@ static ssize_t evtchn_write(struct file *file, const char
__user *buf,
 	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
 		unsigned port = kbuf[i];
 
-		if (port < NR_EVENT_CHANNELS &&
+		if (port < xen_nr_event_channels &&
 		    get_port_user(port) == u &&
 		    !get_port_enabled(port)) {
 			set_port_enabled(port, true);
@@ -374,7 +374,7 @@ static long evtchn_ioctl(struct file *file,
 			break;
 
 		rc = -EINVAL;
-		if (unbind.port >= NR_EVENT_CHANNELS)
+		if (unbind.port >= xen_nr_event_channels)
 			break;
 
 		spin_lock_irq(&port_user_lock);
@@ -402,7 +402,7 @@ static long evtchn_ioctl(struct file *file,
 		if (copy_from_user(¬ify, uarg, sizeof(notify)))
 			break;
 
-		if (notify.port >= NR_EVENT_CHANNELS) {
+		if (notify.port >= xen_nr_event_channels) {
 			rc = -EINVAL;
 		} else if (get_port_user(notify.port) != u) {
 			rc = -ENOTCONN;
@@ -492,7 +492,7 @@ static int evtchn_release(struct inode *inode, struct file
*filp)
 
 	free_page((unsigned long)u->ring);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < xen_nr_event_channels; i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -501,7 +501,7 @@ static int evtchn_release(struct inode *inode, struct file
*filp)
 
 	spin_unlock_irq(&port_user_lock);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < xen_nr_event_channels; i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -538,7 +538,8 @@ static int __init evtchn_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+	port_user = kcalloc(xen_nr_event_channels,
+			    sizeof(*port_user), GFP_KERNEL);
 	if (port_user == NULL)
 		return -ENOMEM;
 
diff --git a/include/xen/events.h b/include/xen/events.h
index c6bfe01..24cf421 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -111,4 +111,7 @@ int xen_test_irq_shared(int irq);
 
 /* initialize Xen IRQ subsystem */
 void xen_init_IRQ(void);
+extern unsigned int xen_nr_event_channels;
+extern uint64_t xen_evtchn_extended;
+
 #endif	/* _XEN_EVENTS_H */
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 08/14] xen: dynamically allocate cpu_evtchn_mask
The size of cpu_evtchn_mask can change, so use dynamic allocation to cope with
this. To save space, cpu_evtchn_mask is not allocated for offline CPUs. It is
allocated in the CPU_UP_PREPARE hotplug notifier, before the CPU comes online.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |   57 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 217efb2..ee35ff9 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/irqnr.h>
 #include <linux/pci.h>
+#include <linux/cpu.h>
 
 #ifdef CONFIG_X86
 #include <asm/desc.h>
@@ -156,8 +157,7 @@ static bool (*pirq_needs_eoi)(unsigned irq);
 /* Find the first set bit in a evtchn mask */
 #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD)
 
-static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD],
-		      cpu_evtchn_mask);
+static DEFINE_PER_CPU(xen_ulong_t *, cpu_evtchn_mask);
 
 /* Xen will never allocate port zero for any purpose. */
 #define VALID_EVTCHN(chn)	((chn) != 0)
@@ -356,6 +356,9 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned
int cpu)
 static void init_evtchn_cpu_bindings(void)
 {
 	int i;
+	unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD;
+	unsigned int nr_bytes = nr * sizeof(xen_ulong_t);
+
 #ifdef CONFIG_SMP
 	struct irq_info *info;
 
@@ -366,9 +369,9 @@ static void init_evtchn_cpu_bindings(void)
 	}
 #endif
 
-	for_each_possible_cpu(i)
+	for_each_online_cpu(i)
 		memset(per_cpu(cpu_evtchn_mask, i),
-		       (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i)));
+		       (i == 0) ? ~0 : 0, nr_bytes);
 }
 
 static inline void clear_evtchn(int port)
@@ -1867,6 +1870,41 @@ const struct evtchn_ops evtchn_l2_ops = {
 	.do_upcall = __xen_evtchn_do_upcall_l2
 };
 
+static int __cpuinit xen_events_notifier_cb(struct notifier_block *self,
+					    unsigned long action,
+					    void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int rc = NOTIFY_OK;
+	void *p;
+	unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD;
+	unsigned int nr_bytes = nr * sizeof(xen_ulong_t);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (!per_cpu(cpu_evtchn_mask, cpu)) {
+			p = kzalloc_node(sizeof(xen_ulong_t) * nr,
+					 GFP_KERNEL, cpu_to_node(cpu));
+			if (!p)
+				rc = NOTIFY_BAD;
+			else {
+				per_cpu(cpu_evtchn_mask, cpu) = p;
+				memset(per_cpu(cpu_evtchn_mask, cpu),
+				       (cpu == 0) ? ~0 : 0, nr_bytes);
+				rc = NOTIFY_OK;
+			}
+		}
+		break;
+	default:
+		break;
+	}
+	return rc;
+}
+
+static struct notifier_block xen_events_notifier __cpuinitdata = {
+	.notifier_call = xen_events_notifier_cb,
+};
+
 void __init xen_init_IRQ(void)
 {
 	int i;
@@ -1890,6 +1928,17 @@ void __init xen_init_IRQ(void)
 	for (i = 0; i < xen_nr_event_channels; i++)
 		evtchn_to_irq[i] = -1;
 
+	for_each_online_cpu(cpu) {
+		void *p;
+		unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD;
+
+		p = kzalloc_node(sizeof(xen_ulong_t) * nr,
+				 GFP_KERNEL, cpu_to_node(cpu));
+		BUG_ON(!p);
+		per_cpu(cpu_evtchn_mask, cpu) = p;
+	}
+	register_cpu_notifier(&xen_events_notifier);
+
 	init_evtchn_cpu_bindings();
 
 	/* No event channels are ''live'' right now. */
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 09/14] xen: implement 3-level event channel routines
Implement several routines for the 3-level event channel ABI. Some routines are
shared between the 2-level and 3-level ABIs.
For N-level (now only 2 and 3) event channel ABIs, the active events are
processed in a top-down approach, i.e. L1 -> L2 -> .. -> L(n-1) -> bitmap.
The selectors are processed recursively; the event bitmap is processed by a
dedicated function called process_port.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |  376 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 293 insertions(+), 83 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index ee35ff9..fe1831b 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -76,7 +76,12 @@ static const struct evtchn_ops *eops;
 static xen_ulong_t *evtchn_pending;
 static xen_ulong_t *evtchn_mask;
 /* The following per-cpu var points to selector(s). */
-static DEFINE_PER_CPU(xen_ulong_t *[1], evtchn_sel);
+static DEFINE_PER_CPU(xen_ulong_t *[2], evtchn_sel);
+/*
+ * 2nd level selector for 3-level event channel, ''8'' stands
for 8 bits
+ * per byte.
+ */
+static DEFINE_PER_CPU(xen_ulong_t [sizeof(xen_ulong_t) * 8], evtchn_sel_l2);
 
 /*
  * This lock protects updates to the following mapping and reference-count
@@ -150,6 +155,11 @@ static bool (*pirq_needs_eoi)(unsigned irq);
  */
 #define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8)
 /*
+ * If xen_ulong_t is 8 byte, it''s 64 bits wide, 2^6 == 64, otherwise
+ * it is 32 bits, 2^5 == 32
+ */
+#define EVTCHN_WORD_BITORDER (sizeof(xen_ulong_t) == 8 ? 6 : 5)
+/*
  * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t
  * array. Primarily to avoid long lines (hence the terse name).
  */
@@ -435,6 +445,29 @@ static inline void __unmask_local_port_l2(int port)
 		vcpu_info->evtchn_upcall_pending = 1;
 }
 
+/*
+ * Unmask @port on the local CPU for the 3-level ABI.
+ *
+ * Clears the port's bit in evtchn_mask. If the port is pending, the
+ * matching bits are set in this CPU's L2 selector (evtchn_sel[1]) and
+ * L1 selector (evtchn_sel[0]); evtchn_upcall_pending is raised only
+ * when both selector bits were previously clear.
+ */
+static inline void __unmask_local_port_l3(int port)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	int cpu = smp_processor_id();
+	/* One L1 bit covers BITS_PER_EVTCHN_WORD^2 ports ... */
+	unsigned int l1bit = port >> (EVTCHN_WORD_BITORDER << 1);
+	/* ... and one L2 bit covers BITS_PER_EVTCHN_WORD ports. */
+	unsigned int l2bit = port >> EVTCHN_WORD_BITORDER;
+
+	sync_clear_bit(port, BM(&evtchn_mask[0]));
+
+	/*
+	 * The following is basically the equivalent of
+	 * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
+	 * the interrupt edge' if the channel is masked.
+	 */
+	if (sync_test_bit(port, BM(&evtchn_pending[0])) &&
+	    !sync_test_and_set_bit(l2bit,
+				   BM(per_cpu(evtchn_sel, cpu)[1])) &&
+	    !sync_test_and_set_bit(l1bit,
+				   BM(per_cpu(evtchn_sel, cpu)[0])))
+		vcpu_info->evtchn_upcall_pending = 1;
+
+}
+
 static void unmask_evtchn(int port)
 {
 	unsigned int cpu = get_cpu();
@@ -1326,119 +1359,254 @@ static irqreturn_t xen_debug_interrupt_l2(int irq,
void *dev_id)
 	return IRQ_HANDLED;
 }
 
+/*
+ * Debug interrupt handler for the 3-level event channel ABI.
+ *
+ * Dumps, word by word, the global pending and mask bitmaps, this CPU's
+ * cpu_evtchn_mask and the combinations of the three, then lists every
+ * pending port together with the state that may keep it from being
+ * delivered:
+ *   " l1-clear"        - the covering bit in the L1 selector is not set
+ *   " l2-clear"        - the covering bit in the L2 selector is not set
+ *   " globally-masked" - the port's bit in evtchn_mask is set
+ *   " locally-masked"  - the port's bit in this CPU's mask is not set
+ *
+ * @irq and @dev_id are unused. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t xen_debug_interrupt_l3(int irq, void *dev_id)
+{
+	int cpu = smp_processor_id();
+	xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	unsigned long nr_elems = NR_EVENT_CHANNELS_L3 / BITS_PER_EVTCHN_WORD;
+	int i;
+
+	printk(KERN_DEBUG "\npending (only show words which have bits set to 1):\n   ");
+	for (i = nr_elems-1; i >= 0; i--)
+		if (evtchn_pending[i] != 0UL) {
+			printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n",
+			       i,
+			       (int)(sizeof(evtchn_pending[0])*2),
+			       evtchn_pending[i]);
+		}
+
+	printk(KERN_DEBUG "\nglobal mask (only show words which have bits set to 0):\n   ");
+	for (i = nr_elems-1; i >= 0; i--)
+		/*
+		 * Compare against an all-ones xen_ulong_t: ~0UL is too
+		 * narrow when xen_ulong_t is wider than unsigned long.
+		 */
+		if (evtchn_mask[i] != ~(xen_ulong_t)0) {
+			printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n",
+			       i,
+			       (int)(sizeof(evtchn_mask[0])*2),
+			       evtchn_mask[i]);
+		}
+
+	printk(KERN_DEBUG "\nglobally unmasked (only show result words which have bits set to 1):\n   ");
+	for (i = nr_elems-1; i >= 0; i--)
+		if ((evtchn_pending[i] & ~evtchn_mask[i]) != 0UL) {
+			printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n",
+			       i,
+			       (int)(sizeof(evtchn_mask[0])*2),
+			       evtchn_pending[i] & ~evtchn_mask[i]);
+		}
+
+	printk(KERN_DEBUG "\nlocal cpu%d mask (only show words which have bits set to 1):\n   ", cpu);
+	for (i = nr_elems-1; i >= 0; i--)
+		if (cpu_evtchn[i] != 0UL) {
+			printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n",
+			       i,
+			       (int)(sizeof(cpu_evtchn[0])*2),
+			       cpu_evtchn[i]);
+		}
+
+	printk(KERN_DEBUG "\nlocally unmasked (only show result words which have bits set to 1):\n   ");
+	for (i = nr_elems-1; i >= 0; i--) {
+		xen_ulong_t pending = evtchn_pending[i]
+			& ~evtchn_mask[i]
+			& cpu_evtchn[i];
+		if (pending != 0UL) {
+			printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n",
+			       i,
+			       (int)(sizeof(evtchn_mask[0])*2),
+			       pending);
+		}
+	}
+
+	printk(KERN_DEBUG "\npending list:\n");
+	for (i = 0; i < NR_EVENT_CHANNELS_L3; i++) {
+		if (sync_test_bit(i, BM(evtchn_pending))) {
+			/* Selector bits that cover port i at L1 and L2. */
+			int word_idx = i / (BITS_PER_EVTCHN_WORD * BITS_PER_EVTCHN_WORD);
+			int word_idx_l2 = i / BITS_PER_EVTCHN_WORD;
+			printk(KERN_DEBUG "  %d: event %d -> irq %d%s%s%s%s\n",
+			       cpu_from_evtchn(i), i,
+			       evtchn_to_irq[i],
+			       /* annotate only when the selector bit is NOT set */
+			       sync_test_bit(word_idx, BM(per_cpu(evtchn_sel, cpu)[0]))
+			       ? "" : " l1-clear",
+			       sync_test_bit(word_idx_l2, BM(per_cpu(evtchn_sel, cpu)[1]))
+			       ? "" : " l2-clear",
+			       /* annotate only when the global mask bit IS set */
+			       !sync_test_bit(i, BM(evtchn_mask))
+			       ? "" : " globally-masked",
+			       sync_test_bit(i, BM(cpu_evtchn))
+			       ? "" : " locally-masked");
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
-static DEFINE_PER_CPU(unsigned int, current_word_idx);
-static DEFINE_PER_CPU(unsigned int, current_bit_idx);
+static DEFINE_PER_CPU(unsigned int[3], current_idx);
 
 /*
  * Mask out the i least significant bits of w
  */
 #define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i))
 
+/*
+ * Dispatch the pending events found in a single word of the event bitmap.
+ *
+ * @cpu: CPU whose active events are scanned
+ * @base: port number of bit 0 of the word; base >> EVTCHN_WORD_BITORDER
+ *        is the word index handed to active_evtchns()
+ * @idx: on entry the first bit position to consider; updated as bits
+ *       are handled
+ * @idx_array: the caller's array of per-level indices, used to record
+ *             the resume position in the per-cpu current_idx
+ *
+ * The loop ends when no bit at or above *idx remains set, or when *idx
+ * returns to 0 after being advanced past the last bit of the word.
+ */
+static __always_inline void process_port(int cpu,
+					 unsigned int base,
+					 unsigned int *idx,
+					 unsigned int *idx_array)
+{
+	xen_ulong_t pending_bits, bits;
+	int port, irq;
+	struct irq_desc *desc;
+
+	/* The word is sampled once; the loop below works on this snapshot. */
+	pending_bits = active_evtchns(cpu, base >> EVTCHN_WORD_BITORDER);
+
+	do {
+		bits = MASK_LSBS(pending_bits, *idx);
+
+		/* If we masked out all events, move on. */
+		if (bits == 0)
+			break;
+
+		*idx = EVTCHN_FIRST_BIT(bits);
+
+		/* Process port. */
+		port = base + *idx;
+		irq = evtchn_to_irq[port];
+
+		if (irq != -1) {
+			desc = irq_to_desc(irq);
+			if (desc)
+				generic_handle_irq_desc(irq, desc);
+		}
+
+		*idx = (*idx + 1) % BITS_PER_EVTCHN_WORD;
+
+		/*
+		 * Next caller starts at last processed + 1.
+		 *
+		 * This routine is shared by the 2/3-level ABIs, so all
+		 * three current_idx elements are written. A level's
+		 * index is advanced (modulo the word size) only when the
+		 * index of the level below it is 0; otherwise it is
+		 * stored unchanged. In the 2-level case the caller sets
+		 * idx_array[2] to ~0U, so current_idx[1] is simply
+		 * idx_array[1].
+		 */
+		__this_cpu_write(current_idx[0],
+				 idx_array[1] ? idx_array[0] :
+				 (idx_array[0]+1) % BITS_PER_EVTCHN_WORD);
+		__this_cpu_write(current_idx[1],
+				 idx_array[2] ? idx_array[1] :
+				 (idx_array[1]+1) % BITS_PER_EVTCHN_WORD);
+		__this_cpu_write(current_idx[2], idx_array[2]);
+	} while (*idx != 0);
+}
+
 /*
- * Search the CPUs pending events bitmasks.  For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
+ * This function processes active event channels top-down, L1 -> L2 ->
+ * .. -> L(n-1) -> bitmap. The selectors are processed recursively;
+ * the event bitmap is processed by process_port().
  *
- * Xen uses a two-level bitmap to speed searching.  The first level is
- * a bitset of words which contain pending event bits.  The second
- * level is a bitset of pending events themselves.
+ * @cpu: current cpu id
+ * @base: accumulated offsets along selector processing
+ * @start_idx: array used to resume index
+ * @idx: array of current processing index
+ * @sel_idx: selector word index
+ * @level: current processing level, from 0 to highest_level
+ * @highest_level: highest recursion level
+ *
+ * If level == highest_level, we reach the event bitmap.  level
+ * variable starts from 0, so highest_level for 2-level ABI is 1,
+ * while for 3-level ABI it is 2.
  */
-static void __xen_evtchn_do_upcall_l2(void)
+static void process(int cpu,
+		    unsigned int base,
+		    unsigned int *start_idx,
+		    unsigned int *idx,
+		    unsigned int sel_idx,
+		    unsigned short level,
+		    unsigned short highest_level)
 {
-	int start_word_idx, start_bit_idx;
-	int word_idx, bit_idx;
 	int i;
-	int cpu = get_cpu();
-	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
-	unsigned count;
+	xen_ulong_t pending_words;
 
-	do {
-		xen_ulong_t pending_words;
+	if (level == highest_level) {
+		process_port(cpu, base, &idx[level], idx);
+		return;
+	}
 
-		vcpu_info->evtchn_upcall_pending = 0;
+	pending_words =
+		xchg_xen_ulong(&per_cpu(evtchn_sel, cpu)[level][sel_idx], 0);
 
-		if (__this_cpu_inc_return(xed_nesting_count) - 1)
-			goto out;
+	/* This loop is used to process selectors. */
+	for (i = 0; pending_words != 0; i++) {
+		xen_ulong_t words;
+		unsigned int saved_base;
+
+		words = MASK_LSBS(pending_words, idx[level]);
 
 		/*
-		 * Master flag must be cleared /before/ clearing
-		 * selector flag. xchg_xen_ulong must contain an
-		 * appropriate barrier.
+		 * If we masked out all events, wrap to beginning.
 		 */
-		pending_words = xchg_xen_ulong(per_cpu(evtchn_sel, cpu)[0], 0);
-
-		start_word_idx = __this_cpu_read(current_word_idx);
-		start_bit_idx = __this_cpu_read(current_bit_idx);
-
-		word_idx = start_word_idx;
+		if (words == 0) {
+			idx[level] = 0;
+			start_idx[level+1] = 0;
+			continue;
+		}
 
-		for (i = 0; pending_words != 0; i++) {
-			xen_ulong_t pending_bits;
-			xen_ulong_t words;
+		idx[level] = EVTCHN_FIRST_BIT(words);
+
+		idx[level+1] = 0; /* usually scan entire word from start */
+		if (idx[level] == start_idx[level]) {
+			/* We scan the starting word in two parts */
+			if (i == 0)
+				/* 1st time: start in the middle */
+				idx[level+1] = start_idx[level+1];
+			else
+				/* 2nd time: mask bits done already */
+				idx[level+1] &= (1UL << start_idx[level+1]) - 1;
+		}
 
-			words = MASK_LSBS(pending_words, word_idx);
+		saved_base = base;
+		base += (idx[level] <<
+			 (EVTCHN_WORD_BITORDER * (highest_level-level)));
 
-			/*
-			 * If we masked out all events, wrap to beginning.
-			 */
-			if (words == 0) {
-				word_idx = 0;
-				bit_idx = 0;
-				continue;
-			}
-			word_idx = EVTCHN_FIRST_BIT(words);
-
-			pending_bits = active_evtchns(cpu, word_idx);
-			bit_idx = 0; /* usually scan entire word from start */
-			if (word_idx == start_word_idx) {
-				/* We scan the starting word in two parts */
-				if (i == 0)
-					/* 1st time: start in the middle */
-					bit_idx = start_bit_idx;
-				else
-					/* 2nd time: mask bits done already */
-					bit_idx &= (1UL << start_bit_idx) - 1;
-			}
+		process(cpu, base, start_idx, idx, idx[level],
+			level+1, highest_level);
 
-			do {
-				xen_ulong_t bits;
-				int port, irq;
-				struct irq_desc *desc;
+		base = saved_base;
 
-				bits = MASK_LSBS(pending_bits, bit_idx);
+		/* Scan start_l1i twice; all others once. */
+		if ((idx[level] != start_idx[level]) || (i != 0))
+			pending_words &= ~(1UL << idx[level]);
 
-				/* If we masked out all events, move on. */
-				if (bits == 0)
-					break;
+		idx[level] = (idx[level] + 1) % BITS_PER_EVTCHN_WORD;
+	}
+}
 
-				bit_idx = EVTCHN_FIRST_BIT(bits);
 
-				/* Process port. */
-				port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx;
-				irq = evtchn_to_irq[port];
+/* This routine is shared between 2/3-level ABI */
+static void ___xen_evtchn_do_upcall(unsigned int *start_idx,
+				    unsigned int *idx,
+				    unsigned short highest_level)
+{
+	int cpu = get_cpu();
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	unsigned count;
 
-				if (irq != -1) {
-					desc = irq_to_desc(irq);
-					if (desc)
-						generic_handle_irq_desc(irq, desc);
-				}
+	do {
+		vcpu_info->evtchn_upcall_pending = 0;
 
-				bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
+			goto out;
 
-				/* Next caller starts at last processed + 1 */
-				__this_cpu_write(current_word_idx,
-						 bit_idx ? word_idx :
-						 (word_idx+1) % BITS_PER_EVTCHN_WORD);
-				__this_cpu_write(current_bit_idx, bit_idx);
-			} while (bit_idx != 0);
+		start_idx[0] = __this_cpu_read(current_idx[0]);
+		start_idx[1] = __this_cpu_read(current_idx[1]);
+		start_idx[2] = __this_cpu_read(current_idx[2]);
 
-			/* Scan start_l1i twice; all others once. */
-			if ((word_idx != start_word_idx) || (i != 0))
-				pending_words &= ~(1UL << word_idx);
+		idx[0] = start_idx[0];
 
-			word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD;
-		}
+		process(cpu, 0 /* base */, start_idx, idx,
+			0 /* selector index */,
+			0 /* starting from L1 (1-1=0) */,
+			highest_level);
 
 		BUG_ON(!irqs_disabled());
 
@@ -1451,6 +1619,42 @@ out:
 	put_cpu();
 }
 
+/*
+ * Search the CPUs pending events bitmasks.  For each one found, map
+ * the event number to an irq, and feed it into do_IRQ() for
+ * handling.
+ *
+ * Xen uses a two-level bitmap to speed searching.  The first level is
+ * a bitset of words which contain pending event bits.  The second
+ * level is a bitset of pending events themselves.
+ */
+static void __xen_evtchn_do_upcall_l2(void)
+{
+	/*
+	 * process_port() expects three index slots. With only two
+	 * levels, idx[2] does not index anything; process_port() reads
+	 * it solely to decide whether to advance current_idx[1], and the
+	 * non-zero ~0U makes it store idx[1] unchanged.
+	 */
+	unsigned int start_idx[3] = { 0, 0, ~0U };
+	unsigned int idx[3] = { 0, 0, ~0U };
+
+	___xen_evtchn_do_upcall(start_idx, idx, 1);
+}
+
+/*
+ * 3-level counterpart of __xen_evtchn_do_upcall_l2(): runs the shared
+ * upcall routine with highest_level == 2.
+ */
+static void __xen_evtchn_do_upcall_l3(void)
+{
+	/*
+	 * One index slot per level (L1 selector, L2 selector, event
+	 * bitmap). All three start at 0; ___xen_evtchn_do_upcall()
+	 * loads start_idx[] from the per-cpu current_idx before
+	 * scanning.
+	 */
+	unsigned int start_idx[3] = { 0, 0, 0 };
+	unsigned int idx[3] = { 0, 0, 0 };
+
+	___xen_evtchn_do_upcall(start_idx, idx, 2);
+
+}
+
 void xen_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1870,6 +2074,12 @@ const struct evtchn_ops evtchn_l2_ops = {
 	.do_upcall = __xen_evtchn_do_upcall_l2
 };
 
+/* Operations table selected when the 3-level event channel ABI is in use. */
+const struct evtchn_ops evtchn_l3_ops = {
+	.do_upcall = __xen_evtchn_do_upcall_l3,
+	.debug_interrupt = xen_debug_interrupt_l3,
+	.unmask = __unmask_local_port_l3,
+};
+
 static int __cpuinit xen_events_notifier_cb(struct notifier_block *self,
 					    unsigned long action,
 					    void *hcpu)
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 10/14] xen: document 2/3-level event channel ABI
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index fe1831b..ee33421 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -57,6 +57,47 @@ #include <xen/interface/sched.h> #include <asm/hw_irq.h> +/* + * The 2-level (default) event channel ABI: + * + * This is the default ABI, it is guaranteed to be supported. The name + * comes from its 2-level lookup path. + * + * The first level is a per-cpu selector in struct vcpu_info. The size + * of L1 selector is sizeof(xen_ulong_t), in which each bit represents + * a xen_ulong_t word in the event bitmap (second level). + * + * The second level is a shared bitmap of events, embedded in shared + * info page. + * + * The lookup path is as followed. We first look at each bit of the L1 + * selector. A non-zero bit in L1 selector indicates one or more bits + * in the corresponding word in L2 bitmap is / are set. In this case + * we pick up the word in bitmap, process each non-zero bit in the + * word and process the event. + * + * + * The 3-level event channel ABI: + * + * This ABI is more or less the same as the 2-level ABI. In this ABI: + * + * The first level is a per-cpu selector in struct vcpu_info. In fact, + * we reuse the same selector in 2-level ABI. + * + * The second level is a per-cpu bitmap of xen_ulong_t words, whose + * size is the same as the second level bitmap in 2-level ABI. However + * we cannot reuse the same bitmap in shared info page because this + * bitmap is per-cpu. + * + * The third level is a shared bitmap of events, which is allocated at + * boot time by Linux kernel. + * + * The lookup path is as followed. The first two levels lookup is the + * same as the 2-level ABI, but after picking up the non-zero bit in + * L2 selector, we still need to go down one level furthur for the + * actual event bit. 
+ */ + /* extended event channel ABI in use, default is EVTCHN_EXTENDED_NONE */ uint64_t xen_evtchn_extended = EVTCHN_EXTENDED_NONE; EXPORT_SYMBOL_GPL(xen_evtchn_extended); -- 1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 11/14] xen: introduce xen_event_channel_query_extended_abis
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |   37 +++++++++++++++++++++++++++++++++++++
 include/xen/events.h |    3 +++
 2 files changed, 40 insertions(+)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index ee33421..270821d 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -2109,6 +2109,43 @@ void xen_callback_vector(void)
 void xen_callback_vector(void) {}
 #endif
 
+/*
+ * Query the hypervisor for the extended event channel ABIs this guest
+ * can use.
+ *
+ * Returns EVTCHN_EXTENDED_NONE when
+ *  1) the hypervisor doesn't support extended ABIs (the
+ *     EVTCHNOP_query_extended_abis hypercall fails), or
+ *  2) the hypervisor supports extended ABIs but this guest cannot use
+ *     them;
+ * otherwise returns an or'ed bitmap of the enabled ABIs.
+ */
+uint64_t xen_event_channel_query_extended_abis(void)
+{
+	struct evtchn_query_extended_abis query;
+	int rc;
+
+	memset(&query, 0, sizeof(query));
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_query_extended_abis, &query);
+
+	if (rc < 0) {
+		/* Terminate the line so the next log message is not glued on. */
+		printk(KERN_INFO
+		       "Hypervisor does not support extended event channel ABIs.\n");
+		return EVTCHN_EXTENDED_NONE;
+	}
+
+	printk(KERN_INFO "Hypervisor supports extended event channel ABIs.\n");
+
+	printk(KERN_INFO
+	       "Extended event channel ABIs enabled for this guest:\n");
+	if (query.abis == EVTCHN_EXTENDED_NONE /* 0 */)
+		printk(KERN_INFO "  None (disabled by host administrator)\n");
+	else {
+		if (query.abis & EVTCHN_EXTENDED_L3)
+			printk(KERN_INFO "  3-level event channel ABI\n");
+	}
+
+	return query.abis;
+}
+
 const struct evtchn_ops evtchn_l2_ops = {
 	.unmask = __unmask_local_port_l2,
 	.debug_interrupt = xen_debug_interrupt_l2,
diff --git a/include/xen/events.h b/include/xen/events.h
index 24cf421..49d54ac 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -114,4 +114,7 @@ void xen_init_IRQ(void);
 extern unsigned int xen_nr_event_channels;
 extern uint64_t xen_evtchn_extended;
 
+/* Query hypervisor for supported / enabled extended event channel ABIs. */
+uint64_t xen_event_channel_query_extended_abis(void);
+
 #endif	/* _XEN_EVENTS_H */
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 12/14] xen: introduce xen_event_channel_register_3level
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |  172 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 161 insertions(+), 11 deletions(-)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 270821d..6bb9a47 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -201,6 +201,16 @@ static bool (*pirq_needs_eoi)(unsigned irq);
  */
 #define EVTCHN_WORD_BITORDER (sizeof(xen_ulong_t) == 8 ? 6 : 5)
 /*
+ * If we use 3-level event channel and the event word size is 64 bits, we have
+ * 256k event channels in total, for 32 bits, we have 32k event channels in
+ * total. A page (4K) can represent 4096 * 8 = 32k event channels. So we can
+ * calculate pages needed for 3-level event channels is 1 page for 32 bits and
+ * 8 pages for 64 bits.
+ */
+#define BITMAP_PG_ORDER (BITS_PER_EVTCHN_WORD == 64 ? 3 : 0)
+#define BITMAP_NR_PAGES (BITMAP_PG_ORDER == 3 ? 8 : 1)
+
+/*
  * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t
  * array. Primarily to avoid long lines (hence the terse name).
  */
@@ -2146,6 +2156,115 @@ uint64_t xen_event_channel_query_extended_abis(void)
 	return query.abis;
 }
 
+/*
+ * Register the 3-level pending and mask bitmaps with the hypervisor.
+ *
+ * On first use the bitmaps are allocated as zeroed pages
+ * (BITMAP_PG_ORDER); on the restore path the existing pages are
+ * registered again.
+ *
+ * Returns 0 on success, -EINVAL if called in an unsupported state,
+ * -ENOMEM if the bitmaps cannot be allocated, or the error returned by
+ * EVTCHNOP_register_3level. On allocation or hypercall failure the
+ * bitmap pages are freed and evtchn_pending / evtchn_mask are reset to
+ * NULL.
+ */
+static int xen_event_channel_register_3level_bitmaps(void)
+{
+	struct evtchn_register_3level reg;
+	int i;
+	int rc;
+	xen_ulong_t _evtchn_pending[EVTCHN_MAX_L3_PAGES];
+	xen_ulong_t _evtchn_mask[EVTCHN_MAX_L3_PAGES];
+
+	/*
+	 * can only register 3-level ABI in following states:
+	 * a) no extended ABIs in use
+	 * b) come from restore path which already has ABI set and
+	 *    pages allocated (both bitmaps must be present)
+	 */
+	if (!(xen_evtchn_extended == EVTCHN_EXTENDED_NONE ||
+	      (xen_evtchn_extended == EVTCHN_EXTENDED_L3 &&
+	       evtchn_pending && evtchn_mask)))
+		return -EINVAL;
+
+	/*
+	 * If we come from restore path, we don't need to allocate
+	 * pages.
+	 */
+	if (!evtchn_pending && !evtchn_mask) {
+		/* Get zeroed pages */
+		evtchn_pending =
+			(xen_ulong_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+							BITMAP_PG_ORDER);
+		evtchn_mask =
+			(xen_ulong_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+							BITMAP_PG_ORDER);
+		if (!evtchn_pending || !evtchn_mask) {
+			rc = -ENOMEM;
+			goto err_free;
+		}
+	}
+
+	memset(&reg, 0, sizeof(reg));
+
+	/* Hand the hypervisor the frame number of every bitmap page. */
+	for (i = 0; i < BITMAP_NR_PAGES; i++) {
+		unsigned long offset = PAGE_SIZE * i;
+		_evtchn_pending[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_pending+offset));
+		_evtchn_mask[i] =
+			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_mask+offset));
+	}
+
+	reg.cmd = REGISTER_BITMAPS;
+	reg.u.bitmaps.nr_pages = BITMAP_NR_PAGES;
+	reg.u.bitmaps.evtchn_pending = _evtchn_pending;
+	reg.u.bitmaps.evtchn_mask = _evtchn_mask;
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_3level, &reg);
+	if (rc)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	/* free_pages() ignores a zero address, so a half-done allocation is fine. */
+	free_pages((unsigned long)evtchn_pending, BITMAP_PG_ORDER);
+	free_pages((unsigned long)evtchn_mask, BITMAP_PG_ORDER);
+	evtchn_pending = NULL;
+	evtchn_mask = NULL;
+	return rc;
+}
+
+/*
+ * Register @cpu's second-level selector (evtchn_sel_l2) with the
+ * hypervisor.
+ *
+ * The selector is described to Xen by the machine frame that holds it
+ * and its offset inside that page. On success evtchn_sel[1] of @cpu is
+ * pointed at the selector.
+ *
+ * Returns 0 on success or when the hypervisor reports -EBUSY (already
+ * registered); any other hypercall error is returned unchanged.
+ */
+int xen_event_channel_register_3level_l2selector(int cpu)
+{
+	struct evtchn_register_3level reg;
+	int rc;
+
+	memset(&reg, 0, sizeof(reg));
+
+	reg.cmd = REGISTER_L2_SELECTOR;
+
+	reg.u.l2_selector.cpu_id = cpu;
+	reg.u.l2_selector.mfn =
+		arbitrary_virt_to_mfn(&per_cpu(evtchn_sel_l2, cpu));
+	reg.u.l2_selector.offset =
+		offset_in_page(&per_cpu(evtchn_sel_l2, cpu));
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_3level, &reg);
+
+	if (rc == -EBUSY) /* already registered, this can happen in hotplug */
+		return 0;
+
+	if (!rc)
+		per_cpu(evtchn_sel, cpu)[1] = per_cpu(evtchn_sel_l2, cpu);
+
+	return rc;
+}
+
+/*
+ * Register the 3-level ABI: first the shared bitmaps, then the L2
+ * selector of CPU 0. Returns 0 on success, or the error of the first
+ * step that fails.
+ */
+static int xen_event_channel_register_3level(void)
+{
+	int err = xen_event_channel_register_3level_bitmaps();
+
+	if (err)
+		return err;
+
+	return xen_event_channel_register_3level_l2selector(0);
+}
+
 const struct evtchn_ops evtchn_l2_ops = {
 	.unmask = __unmask_local_port_l2,
 	.debug_interrupt = xen_debug_interrupt_l2,
@@ -2158,6 +2277,47 @@ const struct evtchn_ops evtchn_l3_ops = {
 	.do_upcall = __xen_evtchn_do_upcall_l3
 };
 
+/*
+ * Switch the event channel bookkeeping to @abi.
+ *
+ * EVTCHN_EXTENDED_NONE: point evtchn_pending / evtchn_mask at the
+ * bitmaps in the shared info page and select the 2-level ops.
+ * EVTCHN_EXTENDED_L3: keep the bitmaps that are already set, wire up
+ * both selector levels for every possible CPU and select the 3-level
+ * ops.
+ *
+ * Any other value is a bug.
+ */
+void xen_set_event_channel_extended(uint64_t abi)
+{
+	struct shared_info *s = HYPERVISOR_shared_info;
+	int cpu;
+
+	switch (abi) {
+	case EVTCHN_EXTENDED_NONE:
+		evtchn_pending = s->evtchn_pending;
+		evtchn_mask = s->evtchn_mask;
+		for_each_possible_cpu(cpu) {
+			struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu);
+			per_cpu(evtchn_sel, cpu)[0] =
+				&vcpu_info->evtchn_pending_sel;
+		}
+		xen_evtchn_extended = EVTCHN_EXTENDED_NONE;
+		xen_nr_event_channels = NR_EVENT_CHANNELS_L2;
+		eops = &evtchn_l2_ops;
+		printk(KERN_INFO "Using 2-level event channel ABI.\n");
+		break;
+	case EVTCHN_EXTENDED_L3:
+		/* evtchn_pending/mask already set */
+		for_each_possible_cpu(cpu) {
+			struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu);
+			per_cpu(evtchn_sel, cpu)[0] =
+				&vcpu_info->evtchn_pending_sel;
+			per_cpu(evtchn_sel, cpu)[1] =
+				per_cpu(evtchn_sel_l2, cpu);
+		}
+		xen_evtchn_extended = EVTCHN_EXTENDED_L3;
+		xen_nr_event_channels = NR_EVENT_CHANNELS_L3;
+		eops = &evtchn_l3_ops;
+		printk(KERN_INFO "Using 3-level event channel ABI.\n");
+		break;
+	default:
+		printk(KERN_EMERG
+		       "Trying to set unsupported event channel ABI %llx\n",
+		       abi);
+		BUG();
+	}
+}
+
 static int __cpuinit xen_events_notifier_cb(struct notifier_block *self,
 					    unsigned long action,
 					    void *hcpu)
@@ -2197,18 +2357,8 @@ void __init xen_init_IRQ(void)
 {
 	int i;
 	int cpu;
-	struct shared_info *s = HYPERVISOR_shared_info;
-
-	evtchn_pending = s->evtchn_pending;
-	evtchn_mask = s->evtchn_mask;
-	for_each_possible_cpu(cpu) {
-		struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu);
-		per_cpu(evtchn_sel, cpu)[0] = &vcpu_info->evtchn_pending_sel;
-	}
 
-	xen_evtchn_extended = EVTCHN_EXTENDED_NONE;
-	xen_nr_event_channels = NR_EVENT_CHANNELS_L2;
-	eops = &evtchn_l2_ops;
+	xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE);
 
 	evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq),
 				    GFP_KERNEL);
-- 
1.7.10.4
Wei Liu
2013-Mar-19  15:22 UTC
[RFC PATCH V5 13/14] xen: introduce xen_event_channel_register_extended
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |   26 ++++++++++++++++++++++++++
 include/xen/events.h |    6 ++++++
 2 files changed, 32 insertions(+)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 6bb9a47..6f21f27 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -2265,6 +2265,32 @@ static int xen_event_channel_register_3level(void)
 	return rc;
 }
 
+/*
+ * Register the extended event channel ABI @abi with the hypervisor.
+ *
+ * Only EVTCHN_EXTENDED_L3 is handled; any other value is a bug.
+ * Returns 0 on success, otherwise the error from the 3-level
+ * registration.
+ */
+int xen_event_channel_register_extended(uint64_t abi)
+{
+	int ret;
+
+	if (abi != EVTCHN_EXTENDED_L3) {
+		printk(KERN_EMERG
+		       "Trying to register unsupported event channel ABI %llx\n",
+		       abi);
+		BUG();
+		return -EINVAL;
+	}
+
+	ret = xen_event_channel_register_3level();
+	if (ret)
+		printk(KERN_INFO
+		       "Register 3-level event channel failed: %d\n",
+		       ret);
+	else
+		printk(KERN_INFO
+		       "Register 3-level event channel succeed.\n");
+
+	return ret;
+}
+
+
 const struct evtchn_ops evtchn_l2_ops = {
 	.unmask = __unmask_local_port_l2,
 	.debug_interrupt = xen_debug_interrupt_l2,
diff --git a/include/xen/events.h b/include/xen/events.h
index 49d54ac..a6a6024 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -117,4 +117,10 @@ extern uint64_t xen_evtchn_extended;
 /* Query hypervisor for supported / enabled extended event channel ABIs. */
 uint64_t xen_event_channel_query_extended_abis(void);
 
+/* Set extended event channel to "abi". */
+void xen_set_event_channel_extended(uint64_t abi);
+
+/* Register extended event channel. */
+int xen_event_channel_register_extended(uint64_t abi);
+
 #endif	/* _XEN_EVENTS_H */
-- 
1.7.10.4
CPU hotplug is supported.
Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 arch/x86/xen/enlighten.c |   12 ++++++++++++
 drivers/xen/events.c     |   22 +++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3556678..18edf66 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -213,6 +213,18 @@ void xen_vcpu_restore(void)
 		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
 			BUG();
 	}
+
+	/*
+	 * If we use any extended event channel ABI, should try to
+	 * re-setup it in restore path. Currently only 3-level ABI is
+	 * implemented, so simplify the code a bit.
+	 */
+	if (xen_evtchn_extended & EVTCHN_EXTENDED_L3) {
+		int rc;
+		rc = xen_event_channel_register_extended(EVTCHN_EXTENDED_L3);
+		if (rc)
+			xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE);
+	}
 }
 
 static void __init xen_banner(void)
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 6f21f27..b7e5bc1 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -2368,6 +2368,11 @@ static int __cpuinit xen_events_notifier_cb(struct
notifier_block *self,
 				rc = NOTIFY_OK;
 			}
 		}
+		if (rc == NOTIFY_OK &&
+		    xen_evtchn_extended & EVTCHN_EXTENDED_L3) {
+			rc = xen_event_channel_register_3level_l2selector(cpu);
+			rc = (rc == 0 ? NOTIFY_OK : NOTIFY_BAD);
+		}
 		break;
 	default:
 		break;
@@ -2383,8 +2388,23 @@ void __init xen_init_IRQ(void)
 {
 	int i;
 	int cpu;
+	uint64_t evtchn_ext_abis;
+	int rc, fallback_to_default_evtchn = 0;
+
+	evtchn_ext_abis = xen_event_channel_query_extended_abis();
+
+	if (evtchn_ext_abis == EVTCHN_EXTENDED_NONE)
+		fallback_to_default_evtchn = 1;
+	else if (evtchn_ext_abis & EVTCHN_EXTENDED_L3) {
+		rc = xen_event_channel_register_extended(EVTCHN_EXTENDED_L3);
+		if (rc == 0)
+			xen_set_event_channel_extended(EVTCHN_EXTENDED_L3);
+		else
+			fallback_to_default_evtchn = 1;
+	}
 
-	xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE);
+	if (fallback_to_default_evtchn)
+		xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE);
 
 	evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq),
 				    GFP_KERNEL);
-- 
1.7.10.4