thr3ads.net - Xen devel - Implement 3-level event channel routines in Linux. [Dec 2012]

If this information is useful, please help other people find it:
Share via:

Wei Liu

2012-Dec-31 18:38 UTC

Implement 3-level event channel routines in Linux.

This patch series implements 3-level event channel routines in Linux kernel.

My thought is that 3-level event channel is only useful for Dom0 or driver
domain, so it is not enabled by default. Enable it with evtchn_level=3 in
kernel command line.

HVM is not supported at the moment. As it is not very likely it will need this.
And I haven''t found a right place to issue the hypercall.

My understaning is that PVH has more or less the same initialization process as
PV, so the current implementation should work for PVH as well. Please correct
me if I''m wrong.

Wei Liu

2012-Dec-31 18:38 UTC

head link

[RFC PATCH 1/3] Xen: generalized event channel operations.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c |  110 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 76 insertions(+), 34 deletions(-)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 7595581..835101f 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -51,6 +51,23 @@
 #include <xen/interface/hvm/hvm_op.h>
 #include <xen/interface/hvm/params.h>
 
+/* N-level event channel, starting from 2 */
+static unsigned int evtchn_level = 2;
+
+struct evtchn_ops {
+	unsigned long (*active_evtchns)(unsigned int,
+					struct shared_info*, unsigned int);
+	void (*clear_evtchn)(int);
+	void (*set_evtchn)(int);
+	int (*test_evtchn)(int);
+	void (*mask_evtchn)(int);
+	void (*unmask_evtchn)(int);
+	int (*is_masked)(int);
+	void (*xen_evtchn_do_upcall)(void);
+	irqreturn_t (*xen_debug_interrupt)(int, void*);
+};
+static struct evtchn_ops *eops;
+
 /*
  * This lock protects updates to the following mapping and reference-count
  * arrays. The lock does not need to be acquired to read the mapping tables.
@@ -285,9 +302,9 @@ static bool pirq_needs_eoi_flag(unsigned irq)
 	return info->u.pirq.flags & PIRQ_NEEDS_EOI;
 }
 
-static inline unsigned long active_evtchns(unsigned int cpu,
-					   struct shared_info *sh,
-					   unsigned int idx)
+static inline unsigned long __active_evtchns_l2(unsigned int cpu,
+						struct shared_info *sh,
+						unsigned int idx)
 {
 	return sh->evtchn_pending[idx] &
 		per_cpu(cpu_evtchn_mask, cpu)[idx] &
@@ -309,6 +326,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned
int cpu)
 	info_for_irq(irq)->cpu = cpu;
 }
 
+
 static void init_evtchn_cpu_bindings(void)
 {
 	int i;
@@ -327,25 +345,24 @@ static void init_evtchn_cpu_bindings(void)
 		       (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i)));
 }
 
-static inline void clear_evtchn(int port)
+static inline void __clear_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_clear_bit(port, &s->evtchn_pending[0]);
 }
 
-static inline void set_evtchn(int port)
+static inline void __set_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_pending[0]);
 }
 
-static inline int test_evtchn(int port)
+static inline int __test_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	return sync_test_bit(port, &s->evtchn_pending[0]);
 }
 
-
 /**
  * notify_remote_via_irq - send event to remote end of event channel via irq
  * @irq: irq of event channel to send event to
@@ -363,13 +380,13 @@ void notify_remote_via_irq(int irq)
 }
 EXPORT_SYMBOL_GPL(notify_remote_via_irq);
 
-static void mask_evtchn(int port)
+static void __mask_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_mask[0]);
 }
 
-static void unmask_evtchn(int port)
+static void __unmask_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	unsigned int cpu = get_cpu();
@@ -521,7 +538,7 @@ static void eoi_pirq(struct irq_data *data)
 	irq_move_irq(data);
 
 	if (VALID_EVTCHN(evtchn))
-		clear_evtchn(evtchn);
+		eops->clear_evtchn(evtchn);
 
 	if (pirq_needs_eoi(data->irq)) {
 		rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
@@ -567,7 +584,7 @@ static unsigned int __startup_pirq(unsigned int irq)
 	info->evtchn = evtchn;
 
 out:
-	unmask_evtchn(evtchn);
+	eops->unmask_evtchn(evtchn);
 	eoi_pirq(irq_get_irq_data(irq));
 
 	return 0;
@@ -590,7 +607,7 @@ static void shutdown_pirq(struct irq_data *data)
 	if (!VALID_EVTCHN(evtchn))
 		return;
 
-	mask_evtchn(evtchn);
+	eops->mask_evtchn(evtchn);
 
 	close.port = evtchn;
 	if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
@@ -1164,7 +1181,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector
vector)
 	notify_remote_via_irq(irq);
 }
 
-irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 {
 	struct shared_info *sh = HYPERVISOR_shared_info;
 	int cpu = smp_processor_id();
@@ -1245,6 +1262,11 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+	return eops->xen_debug_interrupt(irq, dev_id);
+}
+
 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
 static DEFINE_PER_CPU(unsigned int, current_word_idx);
 static DEFINE_PER_CPU(unsigned int, current_bit_idx);
@@ -1263,7 +1285,7 @@ static DEFINE_PER_CPU(unsigned int, current_bit_idx);
  * a bitset of words which contain pending event bits.  The second
  * level is a bitset of pending events themselves.
  */
-static void __xen_evtchn_do_upcall(void)
+static void __xen_evtchn_do_upcall_l2(void)
 {
 	int start_word_idx, start_bit_idx;
 	int word_idx, bit_idx;
@@ -1308,7 +1330,7 @@ static void __xen_evtchn_do_upcall(void)
 			}
 			word_idx = __ffs(words);
 
-			pending_bits = active_evtchns(cpu, s, word_idx);
+			pending_bits = eops->active_evtchns(cpu, s, word_idx);
 			bit_idx = 0; /* usually scan entire word from start */
 			if (word_idx == start_word_idx) {
 				/* We scan the starting word in two parts */
@@ -1377,7 +1399,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
 
-	__xen_evtchn_do_upcall();
+	eops->xen_evtchn_do_upcall();
 
 	irq_exit();
 	set_irq_regs(old_regs);
@@ -1385,7 +1407,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
 
 void xen_hvm_evtchn_do_upcall(void)
 {
-	__xen_evtchn_do_upcall();
+	eops->xen_evtchn_do_upcall();
 }
 EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
 
@@ -1459,15 +1481,14 @@ static int set_affinity_irq(struct irq_data *data, const
struct cpumask *dest,
 int resend_irq_on_evtchn(unsigned int irq)
 {
 	int masked, evtchn = evtchn_from_irq(irq);
-	struct shared_info *s = HYPERVISOR_shared_info;
 
 	if (!VALID_EVTCHN(evtchn))
 		return 1;
 
-	masked = sync_test_and_set_bit(evtchn, s->evtchn_mask);
-	sync_set_bit(evtchn, s->evtchn_pending);
+	masked = eops->is_masked(evtchn);
+	eops->set_evtchn(evtchn);
 	if (!masked)
-		unmask_evtchn(evtchn);
+		eops->unmask_evtchn(evtchn);
 
 	return 1;
 }
@@ -1477,7 +1498,7 @@ static void enable_dynirq(struct irq_data *data)
 	int evtchn = evtchn_from_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn))
-		unmask_evtchn(evtchn);
+		eops->unmask_evtchn(evtchn);
 }
 
 static void disable_dynirq(struct irq_data *data)
@@ -1485,7 +1506,7 @@ static void disable_dynirq(struct irq_data *data)
 	int evtchn = evtchn_from_irq(data->irq);
 
 	if (VALID_EVTCHN(evtchn))
-		mask_evtchn(evtchn);
+		eops->mask_evtchn(evtchn);
 }
 
 static void ack_dynirq(struct irq_data *data)
@@ -1495,7 +1516,7 @@ static void ack_dynirq(struct irq_data *data)
 	irq_move_irq(data);
 
 	if (VALID_EVTCHN(evtchn))
-		clear_evtchn(evtchn);
+		eops->clear_evtchn(evtchn);
 }
 
 static void mask_ack_dynirq(struct irq_data *data)
@@ -1504,19 +1525,24 @@ static void mask_ack_dynirq(struct irq_data *data)
 	ack_dynirq(data);
 }
 
+static inline int __is_masked_l2(int chn)
+{
+	struct shared_info *sh = HYPERVISOR_shared_info;
+	return sync_test_and_set_bit(chn, sh->evtchn_mask);
+}
+
 static int retrigger_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
-	struct shared_info *sh = HYPERVISOR_shared_info;
 	int ret = 0;
 
 	if (VALID_EVTCHN(evtchn)) {
 		int masked;
 
-		masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
-		sync_set_bit(evtchn, sh->evtchn_pending);
+		masked = eops->is_masked(evtchn);
+		eops->set_evtchn(evtchn);
 		if (!masked)
-			unmask_evtchn(evtchn);
+			eops->unmask_evtchn(evtchn);
 		ret = 1;
 	}
 
@@ -1616,7 +1642,7 @@ void xen_clear_irq_pending(int irq)
 	int evtchn = evtchn_from_irq(irq);
 
 	if (VALID_EVTCHN(evtchn))
-		clear_evtchn(evtchn);
+		eops->clear_evtchn(evtchn);
 }
 EXPORT_SYMBOL(xen_clear_irq_pending);
 void xen_set_irq_pending(int irq)
@@ -1624,7 +1650,7 @@ void xen_set_irq_pending(int irq)
 	int evtchn = evtchn_from_irq(irq);
 
 	if (VALID_EVTCHN(evtchn))
-		set_evtchn(evtchn);
+		eops->set_evtchn(evtchn);
 }
 
 bool xen_test_irq_pending(int irq)
@@ -1633,7 +1659,7 @@ bool xen_test_irq_pending(int irq)
 	bool ret = false;
 
 	if (VALID_EVTCHN(evtchn))
-		ret = test_evtchn(evtchn);
+		ret = eops->test_evtchn(evtchn);
 
 	return ret;
 }
@@ -1684,7 +1710,7 @@ void xen_irq_resume(void)
 
 	/* New event-channel space is not ''live'' yet. */
 	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
-		mask_evtchn(evtchn);
+		eops->mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
 	list_for_each_entry(info, &xen_irq_list_head, list)
@@ -1783,12 +1809,28 @@ void xen_callback_vector(void)
 void xen_callback_vector(void) {}
 #endif
 
+static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
+	.active_evtchns = __active_evtchns_l2,
+	.clear_evtchn = __clear_evtchn_l2,
+	.set_evtchn = __set_evtchn_l2,
+	.test_evtchn = __test_evtchn_l2,
+	.mask_evtchn = __mask_evtchn_l2,
+	.unmask_evtchn = __unmask_evtchn_l2,
+	.is_masked = __is_masked_l2,
+	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l2,
+	.xen_debug_interrupt = __xen_debug_interrupt_l2,
+};
+
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
 
+	evtchn_level = 2;
+	eops = &evtchn_ops_l2;
+
+	/* Setup 2-level event channel */
 	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
-				    GFP_KERNEL);
+				GFP_KERNEL);
 	BUG_ON(!evtchn_to_irq);
 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
 		evtchn_to_irq[i] = -1;
@@ -1797,7 +1839,7 @@ void __init xen_init_IRQ(void)
 
 	/* No event channels are ''live'' right now. */
 	for (i = 0; i < NR_EVENT_CHANNELS; i++)
-		mask_evtchn(i);
+		eops->mask_evtchn(i);
 
 	pirq_needs_eoi = pirq_needs_eoi_flag;
 
-- 
1.7.10.4

Wei Liu

2012-Dec-31 18:38 UTC

head link

[RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 drivers/xen/events.c        |   44 +++++++++++++++++++++++++++++--------------
 drivers/xen/evtchn.c        |   16 +++++++++-------
 include/xen/events.h        |    3 +++
 include/xen/interface/xen.h |   17 ++++++++++++++++-
 4 files changed, 58 insertions(+), 22 deletions(-)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 835101f..f60ba76 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -52,7 +52,8 @@
 #include <xen/interface/hvm/params.h>
 
 /* N-level event channel, starting from 2 */
-static unsigned int evtchn_level = 2;
+unsigned int evtchn_level = 2;
+EXPORT_SYMBOL_GPL(evtchn_level);
 
 struct evtchn_ops {
 	unsigned long (*active_evtchns)(unsigned int,
@@ -130,8 +131,7 @@ static int *evtchn_to_irq;
 static unsigned long *pirq_eoi_map;
 static bool (*pirq_needs_eoi)(unsigned irq);
 
-static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG],
-		      cpu_evtchn_mask);
+static DEFINE_PER_CPU(unsigned long *, cpu_evtchn_mask);
 
 /* Xen will never allocate port zero for any purpose. */
 #define VALID_EVTCHN(chn)	((chn) != 0)
@@ -913,7 +913,7 @@ static int find_virq(unsigned int virq, unsigned int cpu)
 	int port, rc = -ENOENT;
 
 	memset(&status, 0, sizeof(status));
-	for (port = 0; port <= NR_EVENT_CHANNELS; port++) {
+	for (port = 0; port <= NR_EVENT_CHANNELS(evtchn_level); port++) {
 		status.dom = DOMID_SELF;
 		status.port = port;
 		rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status);
@@ -1138,7 +1138,7 @@ int evtchn_get(unsigned int evtchn)
 	struct irq_info *info;
 	int err = -ENOENT;
 
-	if (evtchn >= NR_EVENT_CHANNELS)
+	if (evtchn >= NR_EVENT_CHANNELS(evtchn_level))
 		return -EINVAL;
 
 	mutex_lock(&irq_mapping_update_lock);
@@ -1227,7 +1227,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void
*dev_id)
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+	for (i = (NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG)-1; i >= 0; i--)
 		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
@@ -1242,7 +1242,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void
*dev_id)
 	}
 
 	printk("\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
 			int word_idx = i / BITS_PER_LONG;
 			printk("  %d: event %d -> irq %d%s%s%s\n",
@@ -1709,14 +1709,14 @@ void xen_irq_resume(void)
 	init_evtchn_cpu_bindings();
 
 	/* New event-channel space is not ''live'' yet. */
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS(evtchn_level); evtchn++)
 		eops->mask_evtchn(evtchn);
 
 	/* No IRQ <-> event-channel mappings. */
 	list_for_each_entry(info, &xen_irq_list_head, list)
 		info->evtchn = 0; /* zap event-channel binding */
 
-	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+	for (evtchn = 0; evtchn < NR_EVENT_CHANNELS(evtchn_level); evtchn++)
 		evtchn_to_irq[evtchn] = -1;
 
 	for_each_possible_cpu(cpu) {
@@ -1824,21 +1824,37 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
+	int cpu;
 
-	evtchn_level = 2;
+	/* Setup 2-level event channel */
 	eops = &evtchn_ops_l2;
+	evtchn_level = 2;
 
-	/* Setup 2-level event channel */
-	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
+				sizeof(*evtchn_to_irq),
 				GFP_KERNEL);
 	BUG_ON(!evtchn_to_irq);
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+
+	for_each_possible_cpu(cpu) {
+		void *p;
+		unsigned int nr = NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG;
+		p = kzalloc_node(sizeof(unsigned long) * nr,
+				 GFP_KERNEL,
+				 cpu_to_node(cpu));
+		if (!p)
+			p = kzalloc(sizeof(unsigned long) * nr,
+				    GFP_KERNEL);
+		BUG_ON(!p);
+		per_cpu(cpu_evtchn_mask, cpu) = p;
+	}
+
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++)
 		evtchn_to_irq[i] = -1;
 
 	init_evtchn_cpu_bindings();
 
 	/* No event channels are ''live'' right now. */
-	for (i = 0; i < NR_EVENT_CHANNELS; i++)
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++)
 		eops->mask_evtchn(i);
 
 	pirq_needs_eoi = pirq_needs_eoi_flag;
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index b1f60a0..cb45ecf 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -232,7 +232,7 @@ static ssize_t evtchn_write(struct file *file, const char
__user *buf,
 	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) {
 		unsigned port = kbuf[i];
 
-		if (port < NR_EVENT_CHANNELS &&
+		if (port < NR_EVENT_CHANNELS(evtchn_level) &&
 		    get_port_user(port) == u &&
 		    !get_port_enabled(port)) {
 			set_port_enabled(port, true);
@@ -364,7 +364,7 @@ static long evtchn_ioctl(struct file *file,
 			break;
 
 		rc = -EINVAL;
-		if (unbind.port >= NR_EVENT_CHANNELS)
+		if (unbind.port >= NR_EVENT_CHANNELS(evtchn_level))
 			break;
 
 		spin_lock_irq(&port_user_lock);
@@ -392,7 +392,7 @@ static long evtchn_ioctl(struct file *file,
 		if (copy_from_user(&notify, uarg, sizeof(notify)))
 			break;
 
-		if (notify.port >= NR_EVENT_CHANNELS) {
+		if (notify.port >= NR_EVENT_CHANNELS(evtchn_level)) {
 			rc = -EINVAL;
 		} else if (get_port_user(notify.port) != u) {
 			rc = -ENOTCONN;
@@ -482,7 +482,7 @@ static int evtchn_release(struct inode *inode, struct file
*filp)
 
 	free_page((unsigned long)u->ring);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -491,7 +491,7 @@ static int evtchn_release(struct inode *inode, struct file
*filp)
 
 	spin_unlock_irq(&port_user_lock);
 
-	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
 		if (get_port_user(i) != u)
 			continue;
 
@@ -528,7 +528,8 @@ static int __init evtchn_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL);
+	port_user = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
+			    sizeof(*port_user), GFP_KERNEL);
 	if (port_user == NULL)
 		return -ENOMEM;
 
@@ -541,7 +542,8 @@ static int __init evtchn_init(void)
 		return err;
 	}
 
-	printk(KERN_INFO "Event-channel device installed.\n");
+	printk(KERN_INFO "Event-channel device installed."
+	       " Event-channel level: %d\n", evtchn_level);
 
 	return 0;
 }
diff --git a/include/xen/events.h b/include/xen/events.h
index 04399b2..bc10f22 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -109,4 +109,7 @@ int xen_irq_from_gsi(unsigned gsi);
 /* Determine whether to ignore this IRQ if it is passed to a guest. */
 int xen_test_irq_shared(int irq);
 
+/* N-level event channels */
+extern unsigned int evtchn_level;
+
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index a890804..c66e1ff 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -283,9 +283,24 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
 
 /*
  * Event channel endpoints per domain:
+ * 2-level:
  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ * 3-level:
+ *  32k if a long is 32 bits; 256k if a long is 64 bits.
  */
-#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) *
64)
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long))
+#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
+	switch (x) {					\
+	case 2:						\
+		__v = NR_EVENT_CHANNELS_L2; break;	\
+	case 3:						\
+		__v = NR_EVENT_CHANNELS_L3; break;	\
+	default:					\
+		BUG();					\
+	}						\
+	__v; })
+
 
 struct vcpu_time_info {
 	/*
-- 
1.7.10.4

Wei Liu

2012-Dec-31 18:38 UTC

head link

[RFC PATCH 3/3] Xen: implement 3-level event channel routines.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 arch/x86/xen/enlighten.c              |    7 +
 drivers/xen/events.c                  |  419 +++++++++++++++++++++++++++++++--
 include/xen/events.h                  |    2 +
 include/xen/interface/event_channel.h |   24 ++
 include/xen/interface/xen.h           |    2 +-
 5 files changed, 437 insertions(+), 17 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bc893e7..f471881 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -43,6 +43,7 @@
 #include <xen/hvm.h>
 #include <xen/hvc-console.h>
 #include <xen/acpi.h>
+#include <xen/events.h>
 
 #include <asm/paravirt.h>
 #include <asm/apic.h>
@@ -195,6 +196,9 @@ void xen_vcpu_restore(void)
 		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
 			BUG();
 	}
+
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
 }
 
 static void __init xen_banner(void)
@@ -1028,6 +1032,9 @@ void xen_setup_vcpu_info_placement(void)
 	for_each_possible_cpu(cpu)
 		xen_vcpu_setup(cpu);
 
+	if (evtchn_level_param == 3)
+		xen_event_channel_setup_3level();
+
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
 	if (have_vcpu_info_placement) {
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index f60ba76..adb94e9 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -52,9 +52,15 @@
 #include <xen/interface/hvm/params.h>
 
 /* N-level event channel, starting from 2 */
+unsigned int evtchn_level_param = -1;
 unsigned int evtchn_level = 2;
 EXPORT_SYMBOL_GPL(evtchn_level);
 
+/* 3-level event channel */
+DEFINE_PER_CPU(unsigned long [sizeof(unsigned long)*8], evtchn_sel_l2);
+unsigned long evtchn_pending[NR_EVENT_CHANNELS_L3/BITS_PER_LONG]
__page_aligned_bss;
+unsigned long evtchn_mask[NR_EVENT_CHANNELS_L3/BITS_PER_LONG]
__page_aligned_bss;
+
 struct evtchn_ops {
 	unsigned long (*active_evtchns)(unsigned int,
 					struct shared_info*, unsigned int);
@@ -142,6 +148,29 @@ static struct irq_chip xen_pirq_chip;
 static void enable_dynirq(struct irq_data *data);
 static void disable_dynirq(struct irq_data *data);
 
+static int __init parse_evtchn_level(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp(arg, "3") == 0)
+		evtchn_level_param = 3;
+
+	return 0;
+}
+early_param("evtchn_level", parse_evtchn_level);
+
+static inline int __is_masked_l2(int chn)
+{
+	struct shared_info *sh = HYPERVISOR_shared_info;
+	return sync_test_and_set_bit(chn, sh->evtchn_mask);
+}
+
+static inline int __is_masked_l3(int chn)
+{
+	return sync_test_and_set_bit(chn, evtchn_mask);
+}
+
 /* Get info for IRQ */
 static struct irq_info *info_for_irq(unsigned irq)
 {
@@ -311,6 +340,15 @@ static inline unsigned long __active_evtchns_l2(unsigned
int cpu,
 		~sh->evtchn_mask[idx];
 }
 
+static inline unsigned long __active_evtchns_l3(unsigned int cpu,
+						struct shared_info *sh,
+						unsigned int idx)
+{
+	return evtchn_pending[idx] &
+		per_cpu(cpu_evtchn_mask, cpu)[idx] &
+		~evtchn_mask[idx];
+}
+
 static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 {
 	int irq = evtchn_to_irq[chn];
@@ -351,18 +389,33 @@ static inline void __clear_evtchn_l2(int port)
 	sync_clear_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __clear_evtchn_l3(int port)
+{
+	sync_clear_bit(port, &evtchn_pending[0]);
+}
+
 static inline void __set_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	sync_set_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline void __set_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_pending[0]);
+}
+
 static inline int __test_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
 	return sync_test_bit(port, &s->evtchn_pending[0]);
 }
 
+static inline int __test_evtchn_l3(int port)
+{
+	return sync_test_bit(port, &evtchn_pending[0]);
+}
+
 /**
  * notify_remote_via_irq - send event to remote end of event channel via irq
  * @irq: irq of event channel to send event to
@@ -386,6 +439,11 @@ static void __mask_evtchn_l2(int port)
 	sync_set_bit(port, &s->evtchn_mask[0]);
 }
 
+static void __mask_evtchn_l3(int port)
+{
+	sync_set_bit(port, &evtchn_mask[0]);
+}
+
 static void __unmask_evtchn_l2(int port)
 {
 	struct shared_info *s = HYPERVISOR_shared_info;
@@ -416,6 +474,36 @@ static void __unmask_evtchn_l2(int port)
 	put_cpu();
 }
 
+static void __unmask_evtchn_l3(int port)
+{
+	unsigned int cpu = get_cpu();
+	int l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	int l2cb = BITS_PER_LONG;
+
+	if (unlikely(cpu != cpu_from_evtchn(port))) {
+		struct evtchn_unmask unmask = { .port = port };
+		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+	} else {
+		struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+
+		sync_clear_bit(port, &evtchn_mask[0]);
+
+		/*
+		 * The following is basically the equivalent of
+		 * ''hw_resend_irq''. Just like a real IO-APIC we
''lose
+		 * the interrupt edge'' if the channel is masked.
+		 */
+		if (sync_test_bit(port, &evtchn_pending[0]) &&
+		    !sync_test_and_set_bit(port / l2cb,
+					   &per_cpu(evtchn_sel_l2, cpu)[0]) &&
+		    !sync_test_and_set_bit(port / l1cb,
+					   &vcpu_info->evtchn_pending_sel))
+			vcpu_info->evtchn_upcall_pending = 1;
+	}
+
+	put_cpu();
+}
+
 static void xen_irq_init(unsigned irq)
 {
 	struct irq_info *info;
@@ -1181,6 +1269,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector
vector)
 	notify_remote_via_irq(irq);
 }
 
+static DEFINE_SPINLOCK(debug_lock);
 static irqreturn_t __xen_debug_interrupt_l2(int irq, void *dev_id)
 {
 	struct shared_info *sh = HYPERVISOR_shared_info;
@@ -1188,7 +1277,6 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void
*dev_id)
 	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
 	int i;
 	unsigned long flags;
-	static DEFINE_SPINLOCK(debug_lock);
 	struct vcpu_info *v;
 
 	spin_lock_irqsave(&debug_lock, flags);
@@ -1196,13 +1284,13 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq,
void *dev_id)
 	printk("\nvcpu %d\n  ", cpu);
 
 	for_each_online_cpu(i) {
-		int pending;
+		int masked;
 		v = per_cpu(xen_vcpu, i);
-		pending = (get_irq_regs() && i == cpu)
+		masked = (get_irq_regs() && i == cpu)
 			? xen_irqs_disabled(get_irq_regs())
 			: v->evtchn_upcall_mask;
 		printk("%d: masked=%d pending=%d event_sel %0*lx\n  ", i,
-		       pending, v->evtchn_upcall_pending,
+		       masked, v->evtchn_upcall_pending,
 		       (int)(sizeof(v->evtchn_pending_sel)*2),
 		       v->evtchn_pending_sel);
 	}
@@ -1227,7 +1315,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void
*dev_id)
 		       i % 8 == 0 ? "\n   " : " ");
 
 	printk("\nlocal cpu%d mask:\n   ", cpu);
-	for (i = (NR_EVENT_CHANNELS(evtchn_level)/BITS_PER_LONG)-1; i >= 0; i--)
+	for (i = (NR_EVENT_CHANNELS(2)/BITS_PER_LONG)-1; i >= 0; i--)
 		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
 		       cpu_evtchn[i],
 		       i % 8 == 0 ? "\n   " : " ");
@@ -1242,7 +1330,7 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq, void
*dev_id)
 	}
 
 	printk("\npending list:\n");
-	for (i = 0; i < NR_EVENT_CHANNELS(evtchn_level); i++) {
+	for (i = 0; i < NR_EVENT_CHANNELS(2); i++) {
 		if (sync_test_bit(i, sh->evtchn_pending)) {
 			int word_idx = i / BITS_PER_LONG;
 			printk("  %d: event %d -> irq %d%s%s%s\n",
@@ -1262,15 +1350,110 @@ static irqreturn_t __xen_debug_interrupt_l2(int irq,
void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static irqreturn_t __xen_debug_interrupt_l3(int irq, void *dev_id)
+{
+	int cpu = smp_processor_id();
+	unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu);
+	int i, j;
+	unsigned long flags;
+	struct vcpu_info *v;
+
+	spin_lock_irqsave(&debug_lock, flags);
+
+	printk("\nvcpu %d\n  ", cpu);
+
+	for_each_online_cpu(i) {
+		int masked;
+
+		v = per_cpu(xen_vcpu, i);
+		masked = (get_irq_regs() && i == cpu)
+			? xen_irqs_disabled(get_irq_regs())
+			: v->evtchn_upcall_mask;
+		printk("%d: masked=%d pending=%d event_sel_l1 %0*lx\n  ", i,
+		       masked, v->evtchn_upcall_pending,
+		       (int)(sizeof(v->evtchn_pending_sel)*2),
+		       v->evtchn_pending_sel);
+
+		printk("\nevtchn_sel_l2:\n   ");
+		for (j = (sizeof(unsigned long)*8)-1; j >= 0; j--)
+			printk("%0*lx%s",
+			       (int)(sizeof(evtchn_sel_l2[0])*2),
+			       per_cpu(evtchn_sel_l2, i)[j],
+			       j % 8 == 0 ? "\n   " : " ");
+	}
+
+	v = per_cpu(xen_vcpu, cpu);
+
+	printk("\npending:\n   ");
+	for (i = ARRAY_SIZE(evtchn_pending)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_pending[0])*2),
+		       evtchn_pending[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobal mask:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nglobally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       evtchn_pending[i] & ~evtchn_mask[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocal cpu%d mask:\n   ", cpu);
+	for (i = (NR_EVENT_CHANNELS(3)/BITS_PER_LONG)-1; i >= 0; i--)
+		printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+		       cpu_evtchn[i],
+		       i % 8 == 0 ? "\n   " : " ");
+
+	printk("\nlocally unmasked:\n   ");
+	for (i = ARRAY_SIZE(evtchn_mask)-1; i >= 0; i--) {
+		unsigned long pending = evtchn_pending[i]
+			& ~evtchn_mask[i]
+			& cpu_evtchn[i];
+		printk("%0*lx%s", (int)(sizeof(evtchn_mask[0])*2),
+		       pending, i % 8 == 0 ? "\n   " : " ");
+	}
+
+	printk("\npending list:\n");
+	for (i = 0; i < NR_EVENT_CHANNELS(3); i++) {
+		if (sync_test_bit(i, evtchn_pending)) {
+			int word_idx_l1 = i / (BITS_PER_LONG * BITS_PER_LONG);
+			int word_idx_l2 = i / BITS_PER_LONG;
+			printk("  %d: event %d -> irq %d%s%s%s%s\n",
+			       cpu_from_evtchn(i), i,
+			       evtchn_to_irq[i],
+			       sync_test_bit(word_idx_l1, &v->evtchn_pending_sel)
+					     ? "" : " l1-clear",
+			       sync_test_bit(word_idx_l2, per_cpu(evtchn_sel_l2, cpu))
+					     ? "" : " l2-clear",
+			       !sync_test_bit(i, evtchn_mask)
+					     ? "" : " globally-masked",
+			       sync_test_bit(i, cpu_evtchn)
+					     ? "" : " locally-masked");
+		}
+	}
+
+	spin_unlock_irqrestore(&debug_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
 {
 	return eops->xen_debug_interrupt(irq, dev_id);
 }
 
 static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
+/* 2-level event channel does not use current_word_idx_l2 */
 static DEFINE_PER_CPU(unsigned int, current_word_idx);
+static DEFINE_PER_CPU(unsigned int, current_word_idx_l2);
 static DEFINE_PER_CPU(unsigned int, current_bit_idx);
 
+
 /*
  * Mask out the i least significant bits of w
  */
@@ -1303,7 +1486,8 @@ static void __xen_evtchn_do_upcall_l2(void)
 		if (__this_cpu_inc_return(xed_nesting_count) - 1)
 			goto out;
 
-#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
 		/* Clear master flag /before/ clearing selector flag. */
 		wmb();
 #endif
@@ -1392,6 +1576,155 @@ out:
 	put_cpu();
 }
 
+void __xen_evtchn_do_upcall_l3(void)
+{
+	struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
+	unsigned count;
+	int start_word_idx_l1, start_word_idx_l2, start_bit_idx;
+	int word_idx_l1, word_idx_l2, bit_idx;
+	int i, j;
+	unsigned long l1cb, l2cb;
+	int cpu = get_cpu();
+
+	l1cb = BITS_PER_LONG * BITS_PER_LONG;
+	l2cb = BITS_PER_LONG;
+
+	do {
+		unsigned long pending_words_l1;
+
+		vcpu_info->evtchn_upcall_pending = 0;
+
+		if (__this_cpu_inc_return(xed_nesting_count) - 1)
+			goto out;
+#ifndef CONFIG_X86
+		/* No need for a barrier -- XCHG is a barrier on x86. */
+		/* Clear master flag /before/ clearing selector flag. */
+		wmb();
+#endif
+		/* here we get l1 pending selector */
+		pending_words_l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+
+		start_word_idx_l1 = __this_cpu_read(current_word_idx);
+		start_word_idx_l2 = __this_cpu_read(current_word_idx_l2);
+		start_bit_idx = __this_cpu_read(current_bit_idx);
+
+		word_idx_l1 = start_word_idx_l1;
+
+		/* loop through l1, try to pick up l2 */
+		for (i = 0; pending_words_l1 != 0; i++) {
+			unsigned long words_l1;
+			unsigned long pending_words_l2;
+			unsigned long pwl2idx;
+
+			words_l1 = MASK_LSBS(pending_words_l1, word_idx_l1);
+
+			if (words_l1 == 0) {
+				word_idx_l1 = 0;
+				start_word_idx_l2 = 0;
+				continue;
+			}
+
+			word_idx_l1 = __ffs(words_l1);
+
+			pwl2idx = word_idx_l1 * BITS_PER_LONG;
+
+			pending_words_l2 +				xchg(&per_cpu(evtchn_sel_l2, cpu)[pwl2idx],
+				     0);
+
+			word_idx_l2 = 0;
+			if (word_idx_l1 == start_word_idx_l1) {
+				if (i == 0)
+					word_idx_l2 = start_word_idx_l2;
+				else
+					word_idx_l2 &= (1UL << start_word_idx_l2) - 1;
+			}
+
+			for (j = 0; pending_words_l2 != 0; j++) {
+				unsigned long pending_bits;
+				unsigned long words_l2;
+				unsigned long idx;
+
+				words_l2 = MASK_LSBS(pending_words_l2,
+						     word_idx_l2);
+
+				if (words_l2 == 0) {
+					word_idx_l2 = 0;
+					bit_idx = 0;
+					continue;
+				}
+
+				word_idx_l2 = __ffs(words_l2);
+
+				idx = word_idx_l1*BITS_PER_LONG+word_idx_l2;
+				pending_bits +					eops->active_evtchns(cpu, NULL, idx);
+
+				bit_idx = 0;
+				if (word_idx_l2 == start_word_idx_l2) {
+					if (j == 0)
+						bit_idx = start_bit_idx;
+					else
+						bit_idx &= (1UL<<start_bit_idx)-1;
+				}
+
+				/* process port */
+				do {
+					unsigned long bits;
+					int port, irq;
+					struct irq_desc *desc;
+
+					bits = MASK_LSBS(pending_bits, bit_idx);
+
+					if (bits == 0)
+						break;
+
+					bit_idx = __ffs(bits);
+
+					port = word_idx_l1 * l1cb +
+						word_idx_l2 * l2cb +
+						bit_idx;
+
+					irq = evtchn_to_irq[port];
+
+					if (irq != -1) {
+						desc = irq_to_desc(irq);
+						if (desc)
+							generic_handle_irq_desc(irq, desc);
+					}
+
+					bit_idx = (bit_idx + 1) % BITS_PER_LONG;
+
+					__this_cpu_write(current_bit_idx, bit_idx);
+					__this_cpu_write(current_word_idx_l2,
+							 bit_idx ? word_idx_l2 :
+							 (word_idx_l2+1) % BITS_PER_LONG);
+					__this_cpu_write(current_word_idx_l2,
+							 word_idx_l2 ? word_idx_l1 :
+							 (word_idx_l1+1) % BITS_PER_LONG);
+				} while (bit_idx != 0);
+
+				if ((word_idx_l2 != start_word_idx_l2) || (j != 0))
+					pending_words_l2 &= ~(1UL << word_idx_l2);
+
+				word_idx_l2 = (word_idx_l2) % BITS_PER_LONG;
+			}
+
+			if ((word_idx_l1 != start_word_idx_l1) || (i != 0))
+				pending_words_l1 &= ~(1UL << word_idx_l1);
+
+			word_idx_l1 = (word_idx_l1) % BITS_PER_LONG;
+		}
+
+		BUG_ON(!irqs_disabled());
+		count = __this_cpu_read(xed_nesting_count);
+		__this_cpu_write(xed_nesting_count, 0);
+	} while (count != 1 || vcpu_info->evtchn_upcall_pending);
+
+out:
+	put_cpu();
+}
+
 void xen_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
@@ -1525,12 +1858,6 @@ static void mask_ack_dynirq(struct irq_data *data)
 	ack_dynirq(data);
 }
 
-static inline int __is_masked_l2(int chn)
-{
-	struct shared_info *sh = HYPERVISOR_shared_info;
-	return sync_test_and_set_bit(chn, sh->evtchn_mask);
-}
-
 static int retrigger_dynirq(struct irq_data *data)
 {
 	int evtchn = evtchn_from_irq(data->irq);
@@ -1821,14 +2148,74 @@ static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
 	.xen_debug_interrupt = __xen_debug_interrupt_l2,
 };
 
+static struct evtchn_ops evtchn_ops_l3 __read_mostly = {
+	.active_evtchns = __active_evtchns_l3,
+	.clear_evtchn = __clear_evtchn_l3,
+	.set_evtchn = __set_evtchn_l3,
+	.test_evtchn = __test_evtchn_l3,
+	.mask_evtchn = __mask_evtchn_l3,
+	.unmask_evtchn = __unmask_evtchn_l3,
+	.is_masked = __is_masked_l3,
+	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l3,
+	.xen_debug_interrupt = __xen_debug_interrupt_l3,
+};
+
+int xen_event_channel_setup_3level(void)
+{
+	evtchn_register_nlevel_t reg;
+	int i, nr_pages, cpu;
+	unsigned long mfns[nr_cpu_ids];
+	unsigned long offsets[nr_cpu_ids];
+	int rc = -EINVAL;
+
+	memset(&reg, 0, sizeof(reg));
+
+	reg.level = 3;
+	nr_pages = (sizeof(unsigned long) == 4 ? 1 : 8);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long offset = PAGE_SIZE * i;
+		reg.u.l3.evtchn_pending[i] +			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_pending+offset));
+		reg.u.l3.evtchn_mask[i] +			arbitrary_virt_to_mfn(
+				(void *)((unsigned long)evtchn_mask+offset));
+	}
+
+	reg.u.l3.l2sel_mfn = mfns;
+	reg.u.l3.l2sel_offset = offsets;
+	reg.u.l3.nr_vcpus = nr_cpu_ids;
+
+	for_each_possible_cpu(cpu) {
+		reg.u.l3.l2sel_mfn[cpu] +			arbitrary_virt_to_mfn(&per_cpu(evtchn_sel_l2,
cpu));
+		reg.u.l3.l2sel_offset[cpu] +			offset_in_page(&per_cpu(evtchn_sel_l2,
cpu));
+	}
+
+	rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_nlevel, &reg);
+
+	if (rc == 0)
+		evtchn_level = 3;
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(xen_event_channel_setup_3level);
+
 void __init xen_init_IRQ(void)
 {
 	int i, rc;
 	int cpu;
 
-	/* Setup 2-level event channel */
-	eops = &evtchn_ops_l2;
-	evtchn_level = 2;
+	switch (evtchn_level) {
+	case 2:
+		eops = &evtchn_ops_l2; break;
+	case 3:
+		eops = &evtchn_ops_l3; break;
+	default:
+		BUG();
+	}
 
 	evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS(evtchn_level),
 				sizeof(*evtchn_to_irq),
diff --git a/include/xen/events.h b/include/xen/events.h
index bc10f22..87696fc 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -111,5 +111,7 @@ int xen_test_irq_shared(int irq);
 
 /* N-level event channels */
 extern unsigned int evtchn_level;
+extern unsigned int evtchn_level_param;
+int xen_event_channel_setup_3level(void);
 
 #endif	/* _XEN_EVENTS_H */
diff --git a/include/xen/interface/event_channel.h
b/include/xen/interface/event_channel.h
index f494292..f764d21 100644
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -190,6 +190,30 @@ struct evtchn_reset {
 };
 typedef struct evtchn_reset evtchn_reset_t;
 
+/*
+ * EVTCHNOP_register_nlevel: Register N level event channels.
+ * NOTES:
+ *   1. currently only 3-level is supported.
+ *   2. should fall back to basic 2-level if this call fails.
+ */
+#define EVTCHNOP_register_nlevel 11
+#define MAX_L3_PAGES 8		/* 8 pages for 64 bits */
+struct evtchn_register_3level {
+	unsigned long evtchn_pending[MAX_L3_PAGES];
+	unsigned long evtchn_mask[MAX_L3_PAGES];
+	unsigned long *l2sel_mfn;
+	unsigned long *l2sel_offset;
+	unsigned int nr_vcpus;
+};
+
+struct evtchn_register_nlevel {
+	uint32_t level;
+	union {
+		struct evtchn_register_3level l3;
+	} u;
+};
+typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
+
 struct evtchn_op {
 	uint32_t cmd; /* EVTCHNOP_* */
 	union {
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index c66e1ff..7cb9d8f 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -289,7 +289,7 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
  *  32k if a long is 32 bits; 256k if a long is 64 bits.
  */
 #define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) *
64)
-#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned long))
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
 #define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
 	switch (x) {					\
 	case 2:						\
-- 
1.7.10.4

David Vrabel

2013-Jan-02 14:13 UTC

head link

Re: [RFC PATCH 1/3] Xen: generalized event channel operations.

On 31/12/12 18:38, Wei Liu wrote:> 
Changeset description is too brief.  Why is this change needed?
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/xen/events.c |  110
++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 76 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/xen/events.c b/drivers/xen/events.c
> index 7595581..835101f 100644
> --- a/drivers/xen/events.c
> +++ b/drivers/xen/events.c
> @@ -51,6 +51,23 @@
>  #include <xen/interface/hvm/hvm_op.h>
>  #include <xen/interface/hvm/params.h>
>  
> +/* N-level event channel, starting from 2 */
> +static unsigned int evtchn_level = 2;
> +
> +struct evtchn_ops {
> +	unsigned long (*active_evtchns)(unsigned int,
> +					struct shared_info*, unsigned int);
> +	void (*clear_evtchn)(int);
> +	void (*set_evtchn)(int);
> +	int (*test_evtchn)(int);
> +	void (*mask_evtchn)(int);
> +	void (*unmask_evtchn)(int);
> +	int (*is_masked)(int);
> +	void (*xen_evtchn_do_upcall)(void);
> +	irqreturn_t (*xen_debug_interrupt)(int, void*);
> +};
> +static struct evtchn_ops *eops;
Suggest not using a pointer here to avoid the indirection.
> +
>  /*
>   * This lock protects updates to the following mapping and reference-count
>   * arrays. The lock does not need to be acquired to read the mapping
tables.
> @@ -285,9 +302,9 @@ static bool pirq_needs_eoi_flag(unsigned irq)
[...]> @@ -1783,12 +1809,28 @@ void xen_callback_vector(void)
>  void xen_callback_vector(void) {}
>  #endif
>  
> +static struct evtchn_ops evtchn_ops_l2 __read_mostly = {
const
> +	.active_evtchns = __active_evtchns_l2,
> +	.clear_evtchn = __clear_evtchn_l2,
> +	.set_evtchn = __set_evtchn_l2,
> +	.test_evtchn = __test_evtchn_l2,
> +	.mask_evtchn = __mask_evtchn_l2,
> +	.unmask_evtchn = __unmask_evtchn_l2,
> +	.is_masked = __is_masked_l2,
> +	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l2,
> +	.xen_debug_interrupt = __xen_debug_interrupt_l2,
> +};
David

David Vrabel

2013-Jan-02 14:20 UTC

head link

Re: [RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.

On 31/12/12 18:38, Wei Liu wrote:> 
Again, the changeset description is too brief.
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  drivers/xen/events.c        |   44
+++++++++++++++++++++++++++++--------------
>  drivers/xen/evtchn.c        |   16 +++++++++-------
>  include/xen/events.h        |    3 +++
>  include/xen/interface/xen.h |   17 ++++++++++++++++-
>  4 files changed, 58 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/xen/events.c b/drivers/xen/events.c
> index 835101f..f60ba76 100644
> --- a/drivers/xen/events.c
> +++ b/drivers/xen/events.c
> @@ -52,7 +52,8 @@
>  #include <xen/interface/hvm/params.h>
>  
>  /* N-level event channel, starting from 2 */
> -static unsigned int evtchn_level = 2;
> +unsigned int evtchn_level = 2;
> +EXPORT_SYMBOL_GPL(evtchn_level);
I presume this is exported so the NR_EVENT_CHANNELS() macro works.  I
think it would be better to provide and export an evtchn_nr_channels()
function instead.  Perhaps make it part of the event channel ops structure?
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h
> @@ -283,9 +283,24 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
>  
>  /*
>   * Event channel endpoints per domain:
> + * 2-level:
>   *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + *  32k if a long is 32 bits; 256k if a long is 64 bits.
>   */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) *
64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned
long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(unsigned
long))
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;	\
> +	switch (x) {					\
> +	case 2:						\
> +		__v = NR_EVENT_CHANNELS_L2; break;	\
> +	case 3:						\
> +		__v = NR_EVENT_CHANNELS_L3; break;	\
> +	default:					\
> +		BUG();					\
> +	}						\
> +	__v; })
> +
>  
>  struct vcpu_time_info {
>  	/*
This should be split into a separate patch that resync''s the Linux copy
of the header with the Xen one.

David

David Vrabel

2013-Jan-02 14:57 UTC

head link

Re: [RFC PATCH 3/3] Xen: implement 3-level event channel routines.

On 31/12/12 18:38, Wei Liu wrote:> 
Changeset description?
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
> ---
>  arch/x86/xen/enlighten.c              |    7 +
>  drivers/xen/events.c                  |  419
+++++++++++++++++++++++++++++++--
>  include/xen/events.h                  |    2 +
>  include/xen/interface/event_channel.h |   24 ++
>  include/xen/interface/xen.h           |    2 +-
>  5 files changed, 437 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index bc893e7..f471881 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -43,6 +43,7 @@
>  #include <xen/hvm.h>
>  #include <xen/hvc-console.h>
>  #include <xen/acpi.h>
> +#include <xen/events.h>
>  
>  #include <asm/paravirt.h>
>  #include <asm/apic.h>
> @@ -195,6 +196,9 @@ void xen_vcpu_restore(void)
>  		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
>  			BUG();
>  	}
> +
> +	if (evtchn_level_param == 3)
> +		xen_event_channel_setup_3level();
Why is this here?
>  }
>  
>  static void __init xen_banner(void)
> @@ -1028,6 +1032,9 @@ void xen_setup_vcpu_info_placement(void)
>  	for_each_possible_cpu(cpu)
>  		xen_vcpu_setup(cpu);
>  
> +	if (evtchn_level_param == 3)
> +		xen_event_channel_setup_3level();
> +
Why is this here instead of xen_init_IRQ()?
>  	/* xen_vcpu_setup managed to place the vcpu_info within the
>  	   percpu area for all cpus, so make use of it */
>  	if (have_vcpu_info_placement) {
> diff --git a/drivers/xen/events.c b/drivers/xen/events.c
> index f60ba76..adb94e9 100644
> --- a/drivers/xen/events.c
> +++ b/drivers/xen/events.c
[...]> +
> +/* 2-level event channel does not use current_word_idx_l2 */
>  static DEFINE_PER_CPU(unsigned int, current_word_idx);
> +static DEFINE_PER_CPU(unsigned int, current_word_idx_l2);
>  static DEFINE_PER_CPU(unsigned int, current_bit_idx);
I suggest renaming these to current_word_idx_l3 and current_word_idx_l2.

The use of these variable really needs documentation, particularly why
they''re used. I presume (but not really sure) that they''re to
ensure the
average event latency is constant independent of which channel it is.
> +
>  /*
>   * Mask out the i least significant bits of w
>   */
> @@ -1303,7 +1486,8 @@ static void __xen_evtchn_do_upcall_l2(void)
>  		if (__this_cpu_inc_return(xed_nesting_count) - 1)
>  			goto out;
>  
> -#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86.
*/
> +#ifndef CONFIG_X86
> +		/* No need for a barrier -- XCHG is a barrier on x86. */
>  		/* Clear master flag /before/ clearing selector flag. */
>  		wmb();
>  #endif
> @@ -1392,6 +1576,155 @@ out:
>  	put_cpu();
>  }
>  
> +void __xen_evtchn_do_upcall_l3(void)
This is one of my least favourite functions...  A comment describing the
algorithm used here would be nice.
> @@ -1821,14 +2148,74 @@ static struct evtchn_ops evtchn_ops_l2
__read_mostly = {
>  	.xen_debug_interrupt = __xen_debug_interrupt_l2,
>  };
>  
> +static struct evtchn_ops evtchn_ops_l3 __read_mostly = {
const
> +	.active_evtchns = __active_evtchns_l3,
> +	.clear_evtchn = __clear_evtchn_l3,
> +	.set_evtchn = __set_evtchn_l3,
> +	.test_evtchn = __test_evtchn_l3,
> +	.mask_evtchn = __mask_evtchn_l3,
> +	.unmask_evtchn = __unmask_evtchn_l3,
> +	.is_masked = __is_masked_l3,
> +	.xen_evtchn_do_upcall = __xen_evtchn_do_upcall_l3,
> +	.xen_debug_interrupt = __xen_debug_interrupt_l3,
> +};
> +
> +int xen_event_channel_setup_3level(void)
> +{
> +	evtchn_register_nlevel_t reg;
> +	int i, nr_pages, cpu;
> +	unsigned long mfns[nr_cpu_ids];
> +	unsigned long offsets[nr_cpu_ids];
These arrays are too large for the stack if the domain has many VCPUs.
With 256 VCPUs this uses a page of stack.
> diff --git a/include/xen/interface/event_channel.h
b/include/xen/interface/event_channel.h
> index f494292..f764d21 100644
> --- a/include/xen/interface/event_channel.h
> +++ b/include/xen/interface/event_channel.h[...]
> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
> index c66e1ff..7cb9d8f 100644
> --- a/include/xen/interface/xen.h
> +++ b/include/xen/interface/xen.h[,.,]

Put these in he patch sync''ing the headers.

David

Konrad Rzeszutek Wilk

2013-Jan-02 18:26 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On Mon, Dec 31, 2012 at 06:38:54PM +0000, Wei Liu wrote:> This patch series implements 3-level event channel routines in Linux
kernel.
> 
> My thought is that 3-level event channel is only useful for Dom0 or driver
> domain, so it is not enabled by default. Enable it with evtchn_level=3 in
> kernel command line.
Could it be enabled dynamically? Say when we are close to exhausting the
amount of channels? Or if the machine has some large amount of memory and
hence would probably allocate many many guests?

I am not really a fan of the kernel command line options for
this.> 
> HVM is not supported at the moment. As it is not very likely it will need
this.
> And I haven''t found a right place to issue the hypercall.
> 
> My understaning is that PVH has more or less the same initialization
process as
> PV, so the current implementation should work for PVH as well. Please
correct
> me if I''m wrong.
It uses the HVM mechanism (so vector callback).>

Wei Liu

2013-Jan-02 18:46 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On Wed, 2013-01-02 at 18:26 +0000, Konrad Rzeszutek Wilk
wrote:> On Mon, Dec 31, 2012 at 06:38:54PM +0000, Wei Liu wrote:
> > This patch series implements 3-level event channel routines in Linux
kernel.
> > 
> > My thought is that 3-level event channel is only useful for Dom0 or
driver
> > domain, so it is not enabled by default. Enable it with evtchn_level=3
in
> > kernel command line.
> 
> Could it be enabled dynamically? Say when we are close to exhausting the
> amount of channels? Or if the machine has some large amount of memory and
> hence would probably allocate many many guests?
> 
Do you mean enabling this mechanism on the fly when Dom0 / driver domain
is running? Or do you mean enabling it based on some metrics when
starting up a domain?

If it is the first case, who is responsible for initiating the switching
process? It looks like it is Xen''s responsibility to do the switch
because Dom0 sometimes cannot know the true capability of underlying
hardware. If it is Xen, then we need to add callback to poke Dom0 /
driver domain.

For both cases, what do you have in mind for the metric that used to
trigger the switch? How much RAM / how many CPUs is the threshold?

Wei.

Konrad Rzeszutek Wilk

2013-Jan-02 21:12 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On Wed, Jan 02, 2013 at 06:46:58PM +0000, Wei Liu wrote:> On Wed, 2013-01-02 at 18:26 +0000, Konrad Rzeszutek Wilk wrote:
> > On Mon, Dec 31, 2012 at 06:38:54PM +0000, Wei Liu wrote:
> > > This patch series implements 3-level event channel routines in
Linux kernel.
> > > 
> > > My thought is that 3-level event channel is only useful for Dom0
or driver
> > > domain, so it is not enabled by default. Enable it with
evtchn_level=3 in
> > > kernel command line.
> > 
> > Could it be enabled dynamically? Say when we are close to exhausting
the
> > amount of channels? Or if the machine has some large amount of memory
and
> > hence would probably allocate many many guests?
> > 
> 
> Do you mean enabling this mechanism on the fly when Dom0 / driver domain
> is running? Or do you mean enabling it based on some metrics when
> starting up a domain?
Either one.> 
> If it is the first case, who is responsible for initiating the switching
> process? It looks like it is Xen''s responsibility to do the switch
> because Dom0 sometimes cannot know the true capability of underlying
> hardware. If it is Xen, then we need to add callback to poke Dom0 /
> driver domain.
Huh? It would be as simple as Dom0 just figuring out that it is
at the end of the available events (b/c its close to the MAX) and
then transitioning to the 3-level one (if it can).
> 
> For both cases, what do you have in mind for the metric that used to
> trigger the switch? How much RAM / how many CPUs is the threshold?
No idea. At what point are we running out of the events?> 
> 
> Wei.
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
>

Wei Liu

2013-Jan-03 12:09 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On Wed, 2013-01-02 at 21:12 +0000, Konrad Rzeszutek Wilk
wrote:> On Wed, Jan 02, 2013 at 06:46:58PM +0000, Wei Liu wrote:
> > On Wed, 2013-01-02 at 18:26 +0000, Konrad Rzeszutek Wilk wrote:
> > > On Mon, Dec 31, 2012 at 06:38:54PM +0000, Wei Liu wrote:
> > > > This patch series implements 3-level event channel routines
in Linux kernel.
> > > > 
> > > > My thought is that 3-level event channel is only useful for
Dom0 or driver
> > > > domain, so it is not enabled by default. Enable it with
evtchn_level=3 in
> > > > kernel command line.
> > > 
> > > Could it be enabled dynamically? Say when we are close to
exhausting the
> > > amount of channels? Or if the machine has some large amount of
memory and
> > > hence would probably allocate many many guests?
> > > 
> > 
> > Do you mean enabling this mechanism on the fly when Dom0 / driver
domain
> > is running? Or do you mean enabling it based on some metrics when
> > starting up a domain?
> 
> Either one.
> > 
> > If it is the first case, who is responsible for initiating the
switching
> > process? It looks like it is Xen''s responsibility to do the
switch
> > because Dom0 sometimes cannot know the true capability of underlying
> > hardware. If it is Xen, then we need to add callback to poke Dom0 /
> > driver domain.
> 
> Huh? It would be as simple as Dom0 just figuring out that it is
> at the end of the available events (b/c its close to the MAX) and
> then transitioning to the 3-level one (if it can).
> 
We need to manipulate various data structures. IMHO this makes
implementation tricky and racy.
> > 
> > For both cases, what do you have in mind for the metric that used to
> > trigger the switch? How much RAM / how many CPUs is the threshold?
> 
> No idea. At what point are we running out of the events?
In practice, if you have hundreds or thousands of guests running you''re
likely of running out of event channels. I presume host capable of doing
this has lots of RAM and CPUs...


Wei.

Andrew Cooper

2013-Jan-03 12:12 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On 03/01/13 12:09, Wei Liu wrote:>
>>> For both cases, what do you have in mind for the metric that used
to
>>> trigger the switch? How much RAM / how many CPUs is the threshold?
>> No idea. At what point are we running out of the events?
> In practice, if you have hundreds or thousands of guests running
you''re
> likely of running out of event channels. I presume host capable of doing
> this has lots of RAM and CPUs...
>
>
> Wei.
Yes, but the per-cpu stacks are still a fixed size.  Xen uses 4 (and a 
half-ish) pages.  Linux I would guess is similar.

~Andrew

Konrad Rzeszutek Wilk

2013-Jan-04 16:36 UTC

head link

Re: Implement 3-level event channel routines in Linux.

On Thu, Jan 03, 2013 at 12:09:25PM +0000, Wei Liu wrote:> On Wed, 2013-01-02 at 21:12 +0000, Konrad Rzeszutek Wilk wrote:
> > On Wed, Jan 02, 2013 at 06:46:58PM +0000, Wei Liu wrote:
> > > On Wed, 2013-01-02 at 18:26 +0000, Konrad Rzeszutek Wilk wrote:
> > > > On Mon, Dec 31, 2012 at 06:38:54PM +0000, Wei Liu wrote:
> > > > > This patch series implements 3-level event channel
routines in Linux kernel.
> > > > > 
> > > > > My thought is that 3-level event channel is only useful
for Dom0 or driver
> > > > > domain, so it is not enabled by default. Enable it with
evtchn_level=3 in
> > > > > kernel command line.
> > > > 
> > > > Could it be enabled dynamically? Say when we are close to
exhausting the
> > > > amount of channels? Or if the machine has some large amount
of memory and
> > > > hence would probably allocate many many guests?
> > > > 
> > > 
> > > Do you mean enabling this mechanism on the fly when Dom0 / driver
domain
> > > is running? Or do you mean enabling it based on some metrics when
> > > starting up a domain?
> > 
> > Either one.
> > > 
> > > If it is the first case, who is responsible for initiating the
switching
> > > process? It looks like it is Xen''s responsibility to do
the switch
> > > because Dom0 sometimes cannot know the true capability of
underlying
> > > hardware. If it is Xen, then we need to add callback to poke Dom0
/
> > > driver domain.
> > 
> > Huh? It would be as simple as Dom0 just figuring out that it is
> > at the end of the available events (b/c its close to the MAX) and
> > then transitioning to the 3-level one (if it can).
> > 
> 
> We need to manipulate various data structures. IMHO this makes
> implementation tricky and racy.
> 
> > > 
> > > For both cases, what do you have in mind for the metric that used
to
> > > trigger the switch? How much RAM / how many CPUs is the
threshold?
> > 
> > No idea. At what point are we running out of the events?
> 
> In practice, if you have hundreds or thousands of guests running
you''re
> likely of running out of event channels. I presume host capable of doing
> this has lots of RAM and CPUs...
OK. So "lots" is more than 64GB and 16 CPUS?> 
> 
> Wei.
>

Xen devel - Dec 2012 - Implement 3-level event channel routines in Linux.

Implement 3-level event channel routines in Linux.

[RFC PATCH 1/3] Xen: generalized event channel operations.

[RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.

[RFC PATCH 3/3] Xen: implement 3-level event channel routines.

Re: [RFC PATCH 1/3] Xen: generalized event channel operations.

Re: [RFC PATCH 2/3] Xen: rework NR_EVENT_CHANNELS related stuffs.

Re: [RFC PATCH 3/3] Xen: implement 3-level event channel routines.

Re: Implement 3-level event channel routines in Linux.

Re: Implement 3-level event channel routines in Linux.

Re: Implement 3-level event channel routines in Linux.

Re: Implement 3-level event channel routines in Linux.

Re: Implement 3-level event channel routines in Linux.

Re: Implement 3-level event channel routines in Linux.