This is the kernel side of the 3-level event channel ABI, corresponding to RFC V5 on the Xen side.

Some notable changes:
 * More code shared between the 2-level and 3-level ABIs.
 * cpu_evtchn_mask is allocated dynamically in the CPU hotplug bringup path.

Diffstat:
 arch/x86/xen/enlighten.c              |  12 +
 drivers/xen/events.c                  | 930 +++++++++++++++++++++++++++------
 drivers/xen/evtchn.c                  |  13 +-
 include/xen/events.h                  |  12 +
 include/xen/interface/event_channel.h |  46 +-
 include/xen/interface/xen.h           |  13 +-
 6 files changed, 847 insertions(+), 179 deletions(-)
This typedef slipped into Linux header file, remove it. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- include/xen/interface/event_channel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h index f494292..293c3f0 100644 --- a/include/xen/interface/event_channel.h +++ b/include/xen/interface/event_channel.h @@ -188,7 +188,6 @@ struct evtchn_reset { /* IN parameters. */ domid_t dom; }; -typedef struct evtchn_reset evtchn_reset_t; struct evtchn_op { uint32_t cmd; /* EVTCHNOP_* */ -- 1.7.10.4
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6b78378..90ac37a 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1212,7 +1212,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) spin_lock_irqsave(&debug_lock, flags); - printk("\nvcpu %d\n ", cpu); + printk(KERN_DEBUG "\nvcpu %d\n ", cpu); for_each_online_cpu(i) { int pending; @@ -1220,27 +1220,27 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) pending = (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask; - printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + printk(KERN_DEBUG "%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, pending, v->evtchn_upcall_pending, (int)(sizeof(v->evtchn_pending_sel)*2), v->evtchn_pending_sel); } v = per_cpu(xen_vcpu, cpu); - printk("\npending:\n "); + printk(KERN_DEBUG "\npending:\n "); for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", + printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", (int)sizeof(sh->evtchn_pending[0])*2, sh->evtchn_pending[i], i % 8 == 0 ? "\n " : " "); - printk("\nglobal mask:\n "); + printk(KERN_DEBUG "\nglobal mask:\n "); for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", + printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", (int)(sizeof(sh->evtchn_mask[0])*2), sh->evtchn_mask[i], i % 8 == 0 ? "\n " : " "); - printk("\nglobally unmasked:\n "); + printk(KERN_DEBUG "\nglobally unmasked:\n "); for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(sh->evtchn_mask[0])*2), @@ -1249,25 +1249,25 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) printk("\nlocal cpu%d mask:\n ", cpu); for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), cpu_evtchn[i], i % 8 == 0 ? "\n " : " "); - printk("\nlocally unmasked:\n "); + printk(KERN_DEBUG "\nlocally unmasked:\n "); for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { xen_ulong_t pending = sh->evtchn_pending[i] & ~sh->evtchn_mask[i] & cpu_evtchn[i]; - printk("%0*"PRI_xen_ulong"%s", + printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", (int)(sizeof(sh->evtchn_mask[0])*2), pending, i % 8 == 0 ? "\n " : " "); } - printk("\npending list:\n"); + printk(KERN_DEBUG "\npending list:\n"); for (i = 0; i < NR_EVENT_CHANNELS; i++) { if (sync_test_bit(i, BM(sh->evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; - printk(" %d: event %d -> irq %d%s%s%s\n", + printk(KERN_DEBUG " %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, evtchn_to_irq[i], sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) -- 1.7.10.4
Refer to the per-cpu selector as L1, to be consistent with the description in __xen_evtchn_do_upcall's comment. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 90ac37a..38e30aa 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1271,7 +1271,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) cpu_from_evtchn(i), i, evtchn_to_irq[i], sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) - ? "" : " l2-clear", + ? "" : " l1-clear", !sync_test_bit(i, BM(sh->evtchn_mask)) ? "" : " globally-masked", sync_test_bit(i, BM(cpu_evtchn)) -- 1.7.10.4
Stay in sync with Xen public headers: * event_channel.h: * EVTCHNOP_query_extended_abis * EVTCHNOP_register_3level * xen.h: * NR_EVENT_CHANNEL* EVTCHNOP_query_extended_aibs is pretty self-explanatory. Other structure and macro definitions belong to the 3-level event channel ABI. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- include/xen/interface/event_channel.h | 45 +++++++++++++++++++++++++++++++++ include/xen/interface/xen.h | 13 +++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h index 293c3f0..155454e 100644 --- a/include/xen/interface/event_channel.h +++ b/include/xen/interface/event_channel.h @@ -189,6 +189,51 @@ struct evtchn_reset { domid_t dom; }; +/* + * EVTCHNOP_query_extended_abis: Query the hypervisor for supported extended + * event channel ABIs. + */ +#define EVTCHNOP_query_extended_abis 11 +#define EVTCHN_EXTENDED_NONE 0 +#define _EVTCHN_EXTENDED_L3 1 +#define EVTCHN_EXTENDED_L3 (1UL << _EVTCHN_EXTENDED_L3) +struct evtchn_query_extended_abis { + /* OUT parameters. */ + uint64_t abis; +}; + +/* + * EVTCHNOP_register_3level: Register 3-level event channel. + */ +#define EVTCHNOP_register_3level 12 +/* + * 64 bits guests need 8 pages for evtchn_pending and evtchn_mask for 256k + * event channels while 32 bits ones only need 1 page for 32k event channels. + */ +#define EVTCHN_MAX_L3_PAGES 8 +/* + * A guest should register the bitmaps first, then register L2 selector for + * individual cpu. + */ +#define REGISTER_BITMAPS 1 +#define REGISTER_L2_SELECTOR 2 +struct evtchn_register_3level { + /* IN parameters. */ + uint32_t cmd; + union { + struct { + uint32_t nr_pages; + GUEST_HANDLE(xen_pfn_t) evtchn_pending; + GUEST_HANDLE(xen_pfn_t) evtchn_mask; + } bitmaps; + struct { + uint32_t cpu_id; + xen_pfn_t mfn; /* mfn for L2 selector */ + xen_pfn_t offset; /* offset of L2 selector */ + } l2_selector; + } u; +}; + struct evtchn_op { uint32_t cmd; /* EVTCHNOP_* */ union { diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 53ec416..9b0248d 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -283,9 +283,20 @@ DEFINE_GUEST_HANDLE_STRUCT(multicall_entry); /* * Event channel endpoints per domain: + * 2-level for x86: * 1024 if a long is 32 bits; 4096 if a long is 64 bits. + * 3-level for x86: + * 32k if a long is 32 bits; 256k if a long is 64 bits. + * 2-level for ARM: + * 4096 for both 32 bits and 64 bits. + * 3-level for ARM: + * 256k for both 32 bits and 64 bits. */ -#define NR_EVENT_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) +#define NR_EVENT_CHANNELS_L2 (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64) +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(xen_ulong_t) * 8) +#if !defined(__XEN__) && !defined(__XEN_TOOLS__) +#define NR_EVENT_CHANNELS NR_EVENT_CHANNELS_L2 /* for compatibility */ +#endif struct vcpu_time_info { /* -- 1.7.10.4
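For reference, the 1024/4096 and 32k/256k figures in the comment above follow directly from the two macros. Below is a throwaway userspace check of that arithmetic, not part of the patch; xen_ulong_t is modelled as unsigned long, which matches x86 (on 32-bit ARM the real xen_ulong_t is 64 bits wide, which is why ARM gets the 64-bit figures for both word sizes).

#include <stdio.h>

/* Mirror the two macros added above, with xen_ulong_t modelled as a
 * plain typedef so the arithmetic can be checked on any host. */
typedef unsigned long xen_ulong_t;

#define NR_EVENT_CHANNELS_L2 (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)
#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * sizeof(xen_ulong_t) * 8)

int main(void)
{
	/* 64-bit: 8*8*64 = 4096 (2-level), 4096*8*8 = 256k (3-level).
	 * 32-bit: 4*4*64 = 1024 (2-level), 1024*4*8 = 32k (3-level). */
	printf("2-level: %zu event channels\n", NR_EVENT_CHANNELS_L2);
	printf("3-level: %zu event channels\n", NR_EVENT_CHANNELS_L3);
	return 0;
}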
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 38e30aa..eca6488 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -369,6 +369,12 @@ static inline int test_evtchn(int port) return sync_test_bit(port, BM(&s->evtchn_pending[0])); } +static inline int test_and_set_mask(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + /** * notify_remote_via_irq - send event to remote end of event channel via irq @@ -1506,7 +1512,7 @@ int resend_irq_on_evtchn(unsigned int irq) if (!VALID_EVTCHN(evtchn)) return 1; - masked = sync_test_and_set_bit(evtchn, BM(s->evtchn_mask)); + masked = test_and_set_mask(evtchn); sync_set_bit(evtchn, BM(s->evtchn_pending)); if (!masked) unmask_evtchn(evtchn); @@ -1555,7 +1561,7 @@ static int retrigger_dynirq(struct irq_data *data) if (VALID_EVTCHN(evtchn)) { int masked; - masked = sync_test_and_set_bit(evtchn, BM(sh->evtchn_mask)); + masked = test_and_set_mask(evtchn); sync_set_bit(evtchn, BM(sh->evtchn_pending)); if (!masked) unmask_evtchn(evtchn); -- 1.7.10.4
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 06/14] xen: replace raw bit ops with functions
There is already a function called set_evtchn() for that job. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index eca6488..6e226c3 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1507,13 +1507,12 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, int resend_irq_on_evtchn(unsigned int irq) { int masked, evtchn = evtchn_from_irq(irq); - struct shared_info *s = HYPERVISOR_shared_info; if (!VALID_EVTCHN(evtchn)) return 1; masked = test_and_set_mask(evtchn); - sync_set_bit(evtchn, BM(s->evtchn_pending)); + set_evtchn(evtchn); if (!masked) unmask_evtchn(evtchn); @@ -1555,14 +1554,13 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); - struct shared_info *sh = HYPERVISOR_shared_info; int ret = 0; if (VALID_EVTCHN(evtchn)) { int masked; masked = test_and_set_mask(evtchn); - sync_set_bit(evtchn, BM(sh->evtchn_pending)); + set_evtchn(evtchn); if (!masked) unmask_evtchn(evtchn); ret = 1; -- 1.7.10.4
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 07/14] xen: generalized event channel operations
Use global pointers in common operations to allow for better code sharing between 2 and 3 level event channel ABI. Function pointers are used to deal with functions which are not suitable for sharing. Also update drivers/xen/evtchn.c to use exported variable instead of macro. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 199 +++++++++++++++++++++++++++++++------------------- drivers/xen/evtchn.c | 13 ++-- include/xen/events.h | 3 + 3 files changed, 135 insertions(+), 80 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6e226c3..217efb2 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -56,6 +56,27 @@ #include <xen/interface/sched.h> #include <asm/hw_irq.h> +/* extended event channel ABI in use, default is EVTCHN_EXTENDED_NONE */ +uint64_t xen_evtchn_extended = EVTCHN_EXTENDED_NONE; +EXPORT_SYMBOL_GPL(xen_evtchn_extended); +/* number of event channels */ +unsigned int xen_nr_event_channels; +EXPORT_SYMBOL_GPL(xen_nr_event_channels); + +struct evtchn_ops { + void (*unmask)(int port); + irqreturn_t (*debug_interrupt)(int irq, void *dev_id); + void (*do_upcall)(void); +}; + +static const struct evtchn_ops *eops; + +/* The following pointers point to pending bitmap and mask bitmap. */ +static xen_ulong_t *evtchn_pending; +static xen_ulong_t *evtchn_mask; +/* The following per-cpu var points to selector(s). */ +static DEFINE_PER_CPU(xen_ulong_t *[1], evtchn_sel); + /* * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. @@ -135,7 +156,7 @@ static bool (*pirq_needs_eoi)(unsigned irq); /* Find the first set bit in a evtchn mask */ #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD], +static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD], cpu_evtchn_mask); /* Xen will never allocate port zero for any purpose. 
*/ @@ -310,12 +331,11 @@ static bool pirq_needs_eoi_flag(unsigned irq) } static inline xen_ulong_t active_evtchns(unsigned int cpu, - struct shared_info *sh, unsigned int idx) { - return sh->evtchn_pending[idx] & + return evtchn_pending[idx] & per_cpu(cpu_evtchn_mask, cpu)[idx] & - ~sh->evtchn_mask[idx]; + ~evtchn_mask[idx]; } static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) @@ -353,26 +373,22 @@ static void init_evtchn_cpu_bindings(void) static inline void clear_evtchn(int port) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, BM(&s->evtchn_pending[0])); + sync_clear_bit(port, BM(&evtchn_pending[0])); } static inline void set_evtchn(int port) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_pending[0])); + sync_set_bit(port, BM(&evtchn_pending[0])); } static inline int test_evtchn(int port) { - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_bit(port, BM(&s->evtchn_pending[0])); + return sync_test_bit(port, BM(&evtchn_pending[0])); } static inline int test_and_set_mask(int port) { - struct shared_info *s = HYPERVISOR_shared_info; - return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); + return sync_test_and_set_bit(port, BM(&evtchn_mask[0])); } @@ -395,24 +411,40 @@ EXPORT_SYMBOL_GPL(notify_remote_via_irq); static void mask_evtchn(int port) { - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, BM(&s->evtchn_mask[0])); + sync_set_bit(port, BM(&evtchn_mask[0])); +} + +static inline void __unmask_local_port_l2(int port) +{ + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = smp_processor_id(); + + sync_clear_bit(port, BM(&evtchn_mask[0])); + + /* + * The following is basically the equivalent of + * ''hw_resend_irq''. Just like a real IO-APIC we ''lose + * the interrupt edge'' if the channel is masked. + */ + if (sync_test_bit(port, BM(&evtchn_pending[0])) && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(per_cpu(evtchn_sel, cpu)[0]))) + vcpu_info->evtchn_upcall_pending = 1; } static void unmask_evtchn(int port) { - struct shared_info *s = HYPERVISOR_shared_info; unsigned int cpu = get_cpu(); - int do_hypercall = 0, evtchn_pending = 0; + int do_hypercall = 0, _evtchn_pending = 0; BUG_ON(!irqs_disabled()); if (unlikely((cpu != cpu_from_evtchn(port)))) do_hypercall = 1; else - evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + _evtchn_pending = sync_test_bit(port, BM(&evtchn_pending[0])); - if (unlikely(evtchn_pending && xen_hvm_domain())) + if (unlikely(_evtchn_pending && xen_hvm_domain())) do_hypercall = 1; /* Slow path (hypercall) if this is a non-local port or if this is @@ -421,21 +453,8 @@ static void unmask_evtchn(int port) if (do_hypercall) { struct evtchn_unmask unmask = { .port = port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - - sync_clear_bit(port, BM(&s->evtchn_mask[0])); - - /* - * The following is basically the equivalent of - * ''hw_resend_irq''. Just like a real IO-APIC we ''lose - * the interrupt edge'' if the channel is masked. 
- */ - if (evtchn_pending && - !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, - BM(&vcpu_info->evtchn_pending_sel))) - vcpu_info->evtchn_upcall_pending = 1; - } + } else + eops->unmask(port); put_cpu(); } @@ -938,7 +957,7 @@ static int find_virq(unsigned int virq, unsigned int cpu) int port, rc = -ENOENT; memset(&status, 0, sizeof(status)); - for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + for (port = 0; port <= xen_nr_event_channels; port++) { status.dom = DOMID_SELF; status.port = port; rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); @@ -1163,7 +1182,7 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; - if (evtchn >= NR_EVENT_CHANNELS) + if (evtchn >= xen_nr_event_channels) return -EINVAL; mutex_lock(&irq_mapping_update_lock); @@ -1208,13 +1227,12 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { - struct shared_info *sh = HYPERVISOR_shared_info; - int cpu = smp_processor_id(); - xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); - int i; - unsigned long flags; + irqreturn_t rc; static DEFINE_SPINLOCK(debug_lock); + unsigned long flags; + int cpu = smp_processor_id(); struct vcpu_info *v; + int i; spin_lock_irqsave(&debug_lock, flags); @@ -1228,65 +1246,80 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) : v->evtchn_upcall_mask; printk(KERN_DEBUG "%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, pending, v->evtchn_upcall_pending, - (int)(sizeof(v->evtchn_pending_sel)*2), - v->evtchn_pending_sel); + (int)(sizeof(*per_cpu(evtchn_sel, cpu)[0])*2), + *per_cpu(evtchn_sel, cpu)[0]); } + + rc = eops->debug_interrupt(irq, dev_id); + + spin_unlock_irqrestore(&debug_lock, flags); + return rc; +} + +static irqreturn_t xen_debug_interrupt_l2(int irq, void *dev_id) +{ + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long nr_elems = NR_EVENT_CHANNELS_L2 / BITS_PER_EVTCHN_WORD; + struct vcpu_info *v; + v = per_cpu(xen_vcpu, cpu); printk(KERN_DEBUG "\npending:\n "); - for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + for (i = nr_elems; i >= 0; i--) printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", - (int)sizeof(sh->evtchn_pending[0])*2, - sh->evtchn_pending[i], + (int)sizeof(evtchn_pending[0])*2, + evtchn_pending[i], i % 8 == 0 ? "\n " : " "); printk(KERN_DEBUG "\nglobal mask:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + for (i = nr_elems; i >= 0; i--) printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_mask[i], + (int)(sizeof(evtchn_mask[0])*2), + evtchn_mask[i], i % 8 == 0 ? "\n " : " "); printk(KERN_DEBUG "\nglobally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + for (i = nr_elems; i >= 0; i--) printk("%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), - sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + (int)(sizeof(evtchn_mask[0])*2), + evtchn_pending[i] & ~evtchn_mask[i], i % 8 == 0 ? "\n " : " "); printk("\nlocal cpu%d mask:\n ", cpu); - for (i = (NR_EVENT_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) - printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + for (i = (NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", + (int)(sizeof(cpu_evtchn[0])*2), cpu_evtchn[i], i % 8 == 0 ? 
"\n " : " "); printk(KERN_DEBUG "\nlocally unmasked:\n "); - for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { - xen_ulong_t pending = sh->evtchn_pending[i] - & ~sh->evtchn_mask[i] + for (i = nr_elems-1; i >= 0; i--) { + xen_ulong_t pending = evtchn_pending[i] + & ~evtchn_mask[i] & cpu_evtchn[i]; printk(KERN_DEBUG "%0*"PRI_xen_ulong"%s", - (int)(sizeof(sh->evtchn_mask[0])*2), + (int)(sizeof(evtchn_mask[0])*2), pending, i % 8 == 0 ? "\n " : " "); } printk(KERN_DEBUG "\npending list:\n"); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (sync_test_bit(i, BM(sh->evtchn_pending))) { + for (i = 0; i < NR_EVENT_CHANNELS_L2; i++) { + if (sync_test_bit(i, BM(evtchn_pending))) { int word_idx = i / BITS_PER_EVTCHN_WORD; printk(KERN_DEBUG " %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, evtchn_to_irq[i], - sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + sync_test_bit(word_idx, + BM(per_cpu(evtchn_sel, cpu)[0])) ? "" : " l1-clear", - !sync_test_bit(i, BM(sh->evtchn_mask)) + !sync_test_bit(i, BM(evtchn_mask)) ? "" : " globally-masked", sync_test_bit(i, BM(cpu_evtchn)) ? "" : " locally-masked"); } } - spin_unlock_irqrestore(&debug_lock, flags); - return IRQ_HANDLED; } @@ -1308,13 +1341,12 @@ static DEFINE_PER_CPU(unsigned int, current_bit_idx); * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -static void __xen_evtchn_do_upcall(void) +static void __xen_evtchn_do_upcall_l2(void) { int start_word_idx, start_bit_idx; int word_idx, bit_idx; int i; int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); unsigned count; @@ -1331,7 +1363,7 @@ static void __xen_evtchn_do_upcall(void) * selector flag. xchg_xen_ulong must contain an * appropriate barrier. */ - pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + pending_words = xchg_xen_ulong(per_cpu(evtchn_sel, cpu)[0], 0); start_word_idx = __this_cpu_read(current_word_idx); start_bit_idx = __this_cpu_read(current_bit_idx); @@ -1354,7 +1386,7 @@ static void __xen_evtchn_do_upcall(void) } word_idx = EVTCHN_FIRST_BIT(words); - pending_bits = active_evtchns(cpu, s, word_idx); + pending_bits = active_evtchns(cpu, word_idx); bit_idx = 0; /* usually scan entire word from start */ if (word_idx == start_word_idx) { /* We scan the starting word in two parts */ @@ -1425,7 +1457,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) exit_idle(); #endif - __xen_evtchn_do_upcall(); + eops->do_upcall(); irq_exit(); set_irq_regs(old_regs); @@ -1433,7 +1465,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) void xen_hvm_evtchn_do_upcall(void) { - __xen_evtchn_do_upcall(); + eops->do_upcall(); } EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); @@ -1729,14 +1761,14 @@ void xen_irq_resume(void) init_evtchn_cpu_bindings(); /* New event-channel space is not ''live'' yet. */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + for (evtchn = 0; evtchn < xen_nr_event_channels; evtchn++) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. 
*/ list_for_each_entry(info, &xen_irq_list_head, list) info->evtchn = 0; /* zap event-channel binding */ - for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + for (evtchn = 0; evtchn < xen_nr_event_channels; evtchn++) evtchn_to_irq[evtchn] = -1; for_each_possible_cpu(cpu) { @@ -1829,20 +1861,39 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif +const struct evtchn_ops evtchn_l2_ops = { + .unmask = __unmask_local_port_l2, + .debug_interrupt = xen_debug_interrupt_l2, + .do_upcall = __xen_evtchn_do_upcall_l2 +}; + void __init xen_init_IRQ(void) { int i; + int cpu; + struct shared_info *s = HYPERVISOR_shared_info; + + evtchn_pending = s->evtchn_pending; + evtchn_mask = s->evtchn_mask; + for_each_possible_cpu(cpu) { + struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu); + per_cpu(evtchn_sel, cpu)[0] = &vcpu_info->evtchn_pending_sel; + } + + xen_evtchn_extended = EVTCHN_EXTENDED_NONE; + xen_nr_event_channels = NR_EVENT_CHANNELS_L2; + eops = &evtchn_l2_ops; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), + evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq), GFP_KERNEL); BUG_ON(!evtchn_to_irq); - for (i = 0; i < NR_EVENT_CHANNELS; i++) + for (i = 0; i < xen_nr_event_channels; i++) evtchn_to_irq[i] = -1; init_evtchn_cpu_bindings(); /* No event channels are ''live'' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) + for (i = 0; i < xen_nr_event_channels; i++) mask_evtchn(i); pirq_needs_eoi = pirq_needs_eoi_flag; diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index b2db77e..ac7a96e 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -232,7 +232,7 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; - if (port < NR_EVENT_CHANNELS && + if (port < xen_nr_event_channels && get_port_user(port) == u && !get_port_enabled(port)) { set_port_enabled(port, true); @@ -374,7 +374,7 @@ static long evtchn_ioctl(struct file *file, break; rc = -EINVAL; - if (unbind.port >= NR_EVENT_CHANNELS) + if (unbind.port >= xen_nr_event_channels) break; spin_lock_irq(&port_user_lock); @@ -402,7 +402,7 @@ static long evtchn_ioctl(struct file *file, if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { + if (notify.port >= xen_nr_event_channels) { rc = -EINVAL; } else if (get_port_user(notify.port) != u) { rc = -ENOTCONN; @@ -492,7 +492,7 @@ static int evtchn_release(struct inode *inode, struct file *filp) free_page((unsigned long)u->ring); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < xen_nr_event_channels; i++) { if (get_port_user(i) != u) continue; @@ -501,7 +501,7 @@ static int evtchn_release(struct inode *inode, struct file *filp) spin_unlock_irq(&port_user_lock); - for (i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < xen_nr_event_channels; i++) { if (get_port_user(i) != u) continue; @@ -538,7 +538,8 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); + port_user = kcalloc(xen_nr_event_channels, + sizeof(*port_user), GFP_KERNEL); if (port_user == NULL) return -ENOMEM; diff --git a/include/xen/events.h b/include/xen/events.h index c6bfe01..24cf421 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -111,4 +111,7 @@ int xen_test_irq_shared(int irq); /* initialize Xen IRQ subsystem */ void xen_init_IRQ(void); +extern unsigned int xen_nr_event_channels; +extern 
uint64_t xen_evtchn_extended; + #endif /* _XEN_EVENTS_H */ -- 1.7.10.4
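The core of this patch is the small per-ABI ops table selected once at init time, so the shared paths (unmask, upcall, debug dump) never branch on which ABI is in use. A stripped-down, standalone sketch of that dispatch pattern, with purely illustrative names rather than the actual driver code:

#include <stdio.h>

/* Illustrative stand-in for the per-ABI operations table. */
struct evtchn_ops {
	void (*unmask)(int port);
	void (*do_upcall)(void);
};

static void l2_unmask(int port) { printf("2-level unmask of port %d\n", port); }
static void l2_do_upcall(void)  { printf("2-level upcall\n"); }

static const struct evtchn_ops evtchn_l2_ops = {
	.unmask    = l2_unmask,
	.do_upcall = l2_do_upcall,
};

/* Chosen once during initialisation; shared code only ever goes
 * through this pointer afterwards. */
static const struct evtchn_ops *eops = &evtchn_l2_ops;

int main(void)
{
	eops->unmask(42);
	eops->do_upcall();
	return 0;
}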
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 08/14] xen: dynamically allocate cpu_evtchn_mask
The size of cpu_evtchn_mask can change, use dynamic allocation to cope with this. To save space, cpu_evtchn_mask is not allocated for offline cpus. It will get allocated as soon as a cpu goes online. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 217efb2..ee35ff9 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/irqnr.h> #include <linux/pci.h> +#include <linux/cpu.h> #ifdef CONFIG_X86 #include <asm/desc.h> @@ -156,8 +157,7 @@ static bool (*pirq_needs_eoi)(unsigned irq); /* Find the first set bit in a evtchn mask */ #define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) -static DEFINE_PER_CPU(xen_ulong_t [NR_EVENT_CHANNELS_L2/BITS_PER_EVTCHN_WORD], - cpu_evtchn_mask); +static DEFINE_PER_CPU(xen_ulong_t *, cpu_evtchn_mask); /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -356,6 +356,9 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) static void init_evtchn_cpu_bindings(void) { int i; + unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD; + unsigned int nr_bytes = nr * sizeof(xen_ulong_t); + #ifdef CONFIG_SMP struct irq_info *info; @@ -366,9 +369,9 @@ static void init_evtchn_cpu_bindings(void) } #endif - for_each_possible_cpu(i) + for_each_online_cpu(i) memset(per_cpu(cpu_evtchn_mask, i), - (i == 0) ? ~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); + (i == 0) ? ~0 : 0, nr_bytes); } static inline void clear_evtchn(int port) @@ -1867,6 +1870,41 @@ const struct evtchn_ops evtchn_l2_ops = { .do_upcall = __xen_evtchn_do_upcall_l2 }; +static int __cpuinit xen_events_notifier_cb(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int rc = NOTIFY_OK; + void *p; + unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD; + unsigned int nr_bytes = nr * sizeof(xen_ulong_t); + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_evtchn_mask, cpu)) { + p = kzalloc_node(sizeof(xen_ulong_t) * nr, + GFP_KERNEL, cpu_to_node(cpu)); + if (!p) + rc = NOTIFY_BAD; + else { + per_cpu(cpu_evtchn_mask, cpu) = p; + memset(per_cpu(cpu_evtchn_mask, cpu), + (cpu == 0) ? ~0 : 0, nr_bytes); + rc = NOTIFY_OK; + } + } + break; + default: + break; + } + return rc; +} + +static struct notifier_block xen_events_notifier __cpuinitdata = { + .notifier_call = xen_events_notifier_cb, +}; + void __init xen_init_IRQ(void) { int i; @@ -1890,6 +1928,17 @@ void __init xen_init_IRQ(void) for (i = 0; i < xen_nr_event_channels; i++) evtchn_to_irq[i] = -1; + for_each_online_cpu(cpu) { + void *p; + unsigned int nr = xen_nr_event_channels / BITS_PER_EVTCHN_WORD; + + p = kzalloc_node(sizeof(xen_ulong_t) * nr, + GFP_KERNEL, cpu_to_node(cpu)); + BUG_ON(!p); + per_cpu(cpu_evtchn_mask, cpu) = p; + } + register_cpu_notifier(&xen_events_notifier); + init_evtchn_cpu_bindings(); /* No event channels are ''live'' right now. */ -- 1.7.10.4
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 09/14] xen: implement 3-level event channel routines
Implement several routines for 3-level event channel ABI. Some routines are shared between 2/3-level ABIs. For N-level (now only 2 and 3) event channel ABIs, the active events are processed in a top-down approach, i.e. L1 -> L2 -> .. -> L(n-1) -> bitmap. The selectors are processed recursively, the event bitmap is processed by a dedicated function called process_port. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 376 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 293 insertions(+), 83 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ee35ff9..fe1831b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -76,7 +76,12 @@ static const struct evtchn_ops *eops; static xen_ulong_t *evtchn_pending; static xen_ulong_t *evtchn_mask; /* The following per-cpu var points to selector(s). */ -static DEFINE_PER_CPU(xen_ulong_t *[1], evtchn_sel); +static DEFINE_PER_CPU(xen_ulong_t *[2], evtchn_sel); +/* + * 2nd level selector for 3-level event channel, ''8'' stands for 8 bits + * per byte. + */ +static DEFINE_PER_CPU(xen_ulong_t [sizeof(xen_ulong_t) * 8], evtchn_sel_l2); /* * This lock protects updates to the following mapping and reference-count @@ -150,6 +155,11 @@ static bool (*pirq_needs_eoi)(unsigned irq); */ #define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) /* + * If xen_ulong_t is 8 byte, it''s 64 bits wide, 2^6 == 64, otherwise + * it is 32 bits, 2^5 == 32 + */ +#define EVTCHN_WORD_BITORDER (sizeof(xen_ulong_t) == 8 ? 6 : 5) +/* * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t * array. Primarily to avoid long lines (hence the terse name). */ @@ -435,6 +445,29 @@ static inline void __unmask_local_port_l2(int port) vcpu_info->evtchn_upcall_pending = 1; } +static inline void __unmask_local_port_l3(int port) +{ + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = smp_processor_id(); + unsigned int l1bit = port >> (EVTCHN_WORD_BITORDER << 1); + unsigned int l2bit = port >> EVTCHN_WORD_BITORDER; + + sync_clear_bit(port, BM(&evtchn_mask[0])); + + /* + * The following is basically the equivalent of + * ''hw_resend_irq''. Just like a real IO-APIC we ''lose + * the interrupt edge'' if the channel is masked. 
+ */ + if (sync_test_bit(port, BM(&evtchn_pending[0])) && + !sync_test_and_set_bit(l2bit, + BM(per_cpu(evtchn_sel, cpu)[1])) && + !sync_test_and_set_bit(l1bit, + BM(per_cpu(evtchn_sel, cpu)[0]))) + vcpu_info->evtchn_upcall_pending = 1; + +} + static void unmask_evtchn(int port) { unsigned int cpu = get_cpu(); @@ -1326,119 +1359,254 @@ static irqreturn_t xen_debug_interrupt_l2(int irq, void *dev_id) return IRQ_HANDLED; } +static irqreturn_t xen_debug_interrupt_l3(int irq, void *dev_id) +{ + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + unsigned long nr_elems = NR_EVENT_CHANNELS_L3 / BITS_PER_EVTCHN_WORD; + int i; + struct vcpu_info *v; + + v = per_cpu(xen_vcpu, cpu); + + printk(KERN_DEBUG "\npending (only show words which have bits set to 1):\n "); + for (i = nr_elems-1; i >= 0; i--) + if (evtchn_pending[i] != 0UL) { + printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n", + i, + (int)(sizeof(evtchn_pending[0])*2), + evtchn_pending[i]); + } + + printk(KERN_DEBUG "\nglobal mask (only show words which have bits set to 0):\n "); + for (i = nr_elems-1; i >= 0; i--) + if (evtchn_mask[i] != ~0UL) { + printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n", + i, + (int)(sizeof(evtchn_mask[0])*2), + evtchn_mask[i]); + } + + printk(KERN_DEBUG "\nglobally unmasked (only show result words which have bits set to 1):\n "); + for (i = nr_elems-1; i >= 0; i--) + if ((evtchn_pending[i] & ~evtchn_mask[i]) != 0UL) { + printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n", + i, + (int)(sizeof(evtchn_mask[0])*2), + evtchn_pending[i] & ~evtchn_mask[i]); + } + + printk(KERN_DEBUG "\nlocal cpu%d mask (only show words which have bits set to 1):\n ", cpu); + for (i = (NR_EVENT_CHANNELS_L3/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + if (cpu_evtchn[i] != 0UL) { + printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n", + i, + (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i]); + } + + printk(KERN_DEBUG "\nlocally unmasked (only show result words which have bits set to 1):\n "); + for (i = nr_elems-1; i >= 0; i--) { + xen_ulong_t pending = evtchn_pending[i] + & ~evtchn_mask[i] + & cpu_evtchn[i]; + if (pending != 0UL) { + printk(KERN_DEBUG " word index %d %0*"PRI_xen_ulong"\n", + i, + (int)(sizeof(evtchn_mask[0])*2), + pending); + } + } + + printk(KERN_DEBUG "\npending list:\n"); + for (i = 0; i < NR_EVENT_CHANNELS_L3; i++) { + if (sync_test_bit(i, evtchn_pending)) { + int word_idx = i / (BITS_PER_EVTCHN_WORD * BITS_PER_EVTCHN_WORD); + int word_idx_l2 = i / BITS_PER_EVTCHN_WORD; + printk(KERN_DEBUG " %d: event %d -> irq %d%s%s%s%s\n", + cpu_from_evtchn(i), i, + evtchn_to_irq[i], + !sync_test_bit(word_idx, BM(per_cpu(evtchn_sel, cpu)[0])) + ? "" : " l1-clear", + !sync_test_bit(word_idx_l2, BM(per_cpu(evtchn_sel, cpu)[1])) + ? "" : " l2-clear", + sync_test_bit(i, BM(evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? 
"" : " locally-masked"); + } + } + + return IRQ_HANDLED; +} + static DEFINE_PER_CPU(unsigned, xed_nesting_count); -static DEFINE_PER_CPU(unsigned int, current_word_idx); -static DEFINE_PER_CPU(unsigned int, current_bit_idx); +static DEFINE_PER_CPU(unsigned int[3], current_idx); /* * Mask out the i least significant bits of w */ #define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) +static __always_inline void process_port(int cpu, + unsigned int base, + unsigned int *idx, + unsigned int *idx_array) +{ + xen_ulong_t pending_bits, bits; + int port, irq; + struct irq_desc *desc; + + pending_bits = active_evtchns(cpu, base >> EVTCHN_WORD_BITORDER); + + do { + bits = MASK_LSBS(pending_bits, *idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + *idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = base + *idx; + irq = evtchn_to_irq[port]; + + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } + + *idx = (*idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + /* + * As this routine is shared by 2/3-level event + * channel, we need to write all three current_idx + * elements. In the 2-level case, the caller /should/ + * always set idx_array[2] to ~0U, so in practice the + * write to current_idx[1] is equivalent to writing + * idx_array[1]. + */ + __this_cpu_write(current_idx[0], + idx_array[1] ? idx_array[0] : + (idx_array[0]+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_idx[1], + idx_array[2] ? idx_array[1] : + (idx_array[1]+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_idx[2], idx_array[2]); + } while (*idx != 0); +} + /* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. + * This function process active event channel top-down, L1 -> L2 -> + * .. -> L(n-1) -> bitmap. The selectors are processed recursively, + * the event bitmap is processed by process_port * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. + * @cpu: current cpu id + * @base: accumulated offsets along selector processing + * @start_idx: array used to resume index + * @idx: array of current processing index + * @sel_idx: selector word index + * @level: current processing level, from 0 to highest_level + * @highest_level: highest recursion level + * + * If level == higest_level, we reach the event bitmap. level + * variable starts from 0, so highest_level for 2-level ABI is 1, + * while for 3-level ABI it is 2. */ -static void __xen_evtchn_do_upcall_l2(void) +static void process(int cpu, + unsigned int base, + unsigned int *start_idx, + unsigned int *idx, + unsigned int sel_idx, + unsigned short level, + unsigned short highest_level) { - int start_word_idx, start_bit_idx; - int word_idx, bit_idx; int i; - int cpu = get_cpu(); - struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); - unsigned count; + xen_ulong_t pending_words; - do { - xen_ulong_t pending_words; + if (level == highest_level) { + process_port(cpu, base, &idx[level], idx); + return; + } - vcpu_info->evtchn_upcall_pending = 0; + pending_words + xchg_xen_ulong(&per_cpu(evtchn_sel, cpu)[level][sel_idx], 0); - if (__this_cpu_inc_return(xed_nesting_count) - 1) - goto out; + /* This loop is used to process selectors. 
*/ + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + unsigned int saved_base; + + words = MASK_LSBS(pending_words, idx[level]); /* - * Master flag must be cleared /before/ clearing - * selector flag. xchg_xen_ulong must contain an - * appropriate barrier. + * If we masked out all events, wrap to beginning. */ - pending_words = xchg_xen_ulong(per_cpu(evtchn_sel, cpu)[0], 0); - - start_word_idx = __this_cpu_read(current_word_idx); - start_bit_idx = __this_cpu_read(current_bit_idx); - - word_idx = start_word_idx; + if (words == 0) { + idx[level] = 0; + start_idx[level+1] = 0; + continue; + } - for (i = 0; pending_words != 0; i++) { - xen_ulong_t pending_bits; - xen_ulong_t words; + idx[level] = EVTCHN_FIRST_BIT(words); + + idx[level+1] = 0; /* usually scan entire word from start */ + if (idx[level] == start_idx[level]) { + /* We scan the starting word in two parts */ + if (i == 0) + /* 1st time: start in the middle */ + idx[level+1] = start_idx[level+1]; + else + /* 2nd time: mask bits done already */ + idx[level+1] &= (1UL << start_idx[level+1]) - 1; + } - words = MASK_LSBS(pending_words, word_idx); + saved_base = base; + base += (idx[level] << + (EVTCHN_WORD_BITORDER * (highest_level-level))); - /* - * If we masked out all events, wrap to beginning. - */ - if (words == 0) { - word_idx = 0; - bit_idx = 0; - continue; - } - word_idx = EVTCHN_FIRST_BIT(words); - - pending_bits = active_evtchns(cpu, word_idx); - bit_idx = 0; /* usually scan entire word from start */ - if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ - if (i == 0) - /* 1st time: start in the middle */ - bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; - } + process(cpu, base, start_idx, idx, idx[level], + level+1, highest_level); - do { - xen_ulong_t bits; - int port, irq; - struct irq_desc *desc; + base = saved_base; - bits = MASK_LSBS(pending_bits, bit_idx); + /* Scan start_l1i twice; all others once. */ + if ((idx[level] != start_idx[level]) || (i != 0)) + pending_words &= ~(1UL << idx[level]); - /* If we masked out all events, move on. */ - if (bits == 0) - break; + idx[level] = (idx[level] + 1) % BITS_PER_EVTCHN_WORD; + } +} - bit_idx = EVTCHN_FIRST_BIT(bits); - /* Process port. */ - port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; - irq = evtchn_to_irq[port]; +/* This routine is shared between 2/3-level ABI */ +static void ___xen_evtchn_do_upcall(unsigned int *start_idx, + unsigned int *idx, + unsigned short highest_level) +{ + int cpu = get_cpu(); + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + unsigned count; - if (irq != -1) { - desc = irq_to_desc(irq); - if (desc) - generic_handle_irq_desc(irq, desc); - } + do { + vcpu_info->evtchn_upcall_pending = 0; - bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + if (__this_cpu_inc_return(xed_nesting_count) - 1) + goto out; - /* Next caller starts at last processed + 1 */ - __this_cpu_write(current_word_idx, - bit_idx ? word_idx : - (word_idx+1) % BITS_PER_EVTCHN_WORD); - __this_cpu_write(current_bit_idx, bit_idx); - } while (bit_idx != 0); + start_idx[0] = __this_cpu_read(current_idx[0]); + start_idx[1] = __this_cpu_read(current_idx[1]); + start_idx[2] = __this_cpu_read(current_idx[2]); - /* Scan start_l1i twice; all others once. 
*/ - if ((word_idx != start_word_idx) || (i != 0)) - pending_words &= ~(1UL << word_idx); + idx[0] = start_idx[0]; - word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; - } + process(cpu, 0 /* base */, start_idx, idx, + 0 /* selector index */, + 0 /* starting from L1 (1-1=0) */, + highest_level); BUG_ON(!irqs_disabled()); @@ -1451,6 +1619,42 @@ out: put_cpu(); } +/* + * Search the CPUs pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for + * handling. + * + * Xen uses a two-level bitmap to speed searching. The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void __xen_evtchn_do_upcall_l2(void) +{ + /* + * Need three elements to feed into __process_port, but the + * third element is never used for 2-level ABI and should + * always be set to ~0U. + */ + unsigned int start_idx[3] = { 0, 0, ~0U }; + unsigned int idx[3] = { 0, 0, ~0U }; + + ___xen_evtchn_do_upcall(start_idx, idx, 1); +} + +static void __xen_evtchn_do_upcall_l3(void) +{ + /* + * Need three elements to feed into __process_port, but the + * third element is never used for 2-level ABI and should + * always be set to ~0U. + */ + unsigned int start_idx[3] = { 0, 0, 0 }; + unsigned int idx[3] = { 0, 0, 0 }; + + ___xen_evtchn_do_upcall(start_idx, idx, 2); + +} + void xen_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1870,6 +2074,12 @@ const struct evtchn_ops evtchn_l2_ops = { .do_upcall = __xen_evtchn_do_upcall_l2 }; +const struct evtchn_ops evtchn_l3_ops = { + .unmask = __unmask_local_port_l3, + .debug_interrupt = xen_debug_interrupt_l3, + .do_upcall = __xen_evtchn_do_upcall_l3 +}; + static int __cpuinit xen_events_notifier_cb(struct notifier_block *self, unsigned long action, void *hcpu) -- 1.7.10.4
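One detail worth spelling out is how a port number breaks down into selector bits in the 3-level case, as done by __unmask_local_port_l3() above. A minimal check of that decomposition, assuming a 64-bit guest (EVTCHN_WORD_BITORDER == 6) and an arbitrary example port:

#include <stdio.h>

#define BITS_PER_EVTCHN_WORD 64   /* 64-bit guest assumed */
#define EVTCHN_WORD_BITORDER 6    /* log2(64) */

int main(void)
{
	unsigned int port = 150000;   /* arbitrary example, < 256k */

	/* Same decomposition as __unmask_local_port_l3(). */
	unsigned int l1bit = port >> (EVTCHN_WORD_BITORDER << 1); /* port / 4096 */
	unsigned int l2bit = port >> EVTCHN_WORD_BITORDER;        /* port / 64   */
	unsigned int bit   = port & (BITS_PER_EVTCHN_WORD - 1);   /* port % 64   */

	printf("port %u -> L1 bit %u, L2 bit %u, bit %u of bitmap word %u\n",
	       port, l1bit, l2bit, bit, l2bit);
	return 0;
}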
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 10/14] xen: document 2/3-level event channel ABI
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index fe1831b..ee33421 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -57,6 +57,47 @@ #include <xen/interface/sched.h> #include <asm/hw_irq.h> +/* + * The 2-level (default) event channel ABI: + * + * This is the default ABI, it is guaranteed to be supported. The name + * comes from its 2-level lookup path. + * + * The first level is a per-cpu selector in struct vcpu_info. The size + * of L1 selector is sizeof(xen_ulong_t), in which each bit represents + * a xen_ulong_t word in the event bitmap (second level). + * + * The second level is a shared bitmap of events, embedded in shared + * info page. + * + * The lookup path is as followed. We first look at each bit of the L1 + * selector. A non-zero bit in L1 selector indicates one or more bits + * in the corresponding word in L2 bitmap is / are set. In this case + * we pick up the word in bitmap, process each non-zero bit in the + * word and process the event. + * + * + * The 3-level event channel ABI: + * + * This ABI is more or less the same as the 2-level ABI. In this ABI: + * + * The first level is a per-cpu selector in struct vcpu_info. In fact, + * we reuse the same selector in 2-level ABI. + * + * The second level is a per-cpu bitmap of xen_ulong_t words, whose + * size is the same as the second level bitmap in 2-level ABI. However + * we cannot reuse the same bitmap in shared info page because this + * bitmap is per-cpu. + * + * The third level is a shared bitmap of events, which is allocated at + * boot time by Linux kernel. + * + * The lookup path is as followed. The first two levels lookup is the + * same as the 2-level ABI, but after picking up the non-zero bit in + * L2 selector, we still need to go down one level furthur for the + * actual event bit. + */ + /* extended event channel ABI in use, default is EVTCHN_EXTENDED_NONE */ uint64_t xen_evtchn_extended = EVTCHN_EXTENDED_NONE; EXPORT_SYMBOL_GPL(xen_evtchn_extended); -- 1.7.10.4
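To make the lookup path in the comment above concrete, here is a simplified, single-CPU model of the 2-level scan: a set bit in the L1 selector names a word in the L2 bitmap, and each set bit in that word names an event port. It deliberately ignores masking, per-cpu affinity and the fairness/resume logic of the real upcall handler, and uses the GCC __builtin_ctzl builtin instead of the driver's helpers.

#include <stdio.h>

#define BITS_PER_WORD 64   /* 64-bit guest assumed */

/* Toy 2-level structure: one selector word, 64 bitmap words. */
static unsigned long sel;
static unsigned long pending[BITS_PER_WORD];

static void process_port(unsigned int port)
{
	printf("event %u pending\n", port);
}

static void scan(void)
{
	while (sel) {
		unsigned int w = __builtin_ctzl(sel);   /* word with pending events */
		sel &= ~(1UL << w);

		while (pending[w]) {
			unsigned int b = __builtin_ctzl(pending[w]);
			pending[w] &= ~(1UL << b);
			process_port(w * BITS_PER_WORD + b);
		}
	}
}

int main(void)
{
	/* Mark event 70 (word 1, bit 6) pending. */
	pending[1] |= 1UL << 6;
	sel |= 1UL << 1;
	scan();
	return 0;
}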
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 11/14] xen: introduce xen_event_channel_query_extended_abis
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 37 +++++++++++++++++++++++++++++++++++++ include/xen/events.h | 3 +++ 2 files changed, 40 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ee33421..270821d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -2109,6 +2109,43 @@ void xen_callback_vector(void) void xen_callback_vector(void) {} #endif +/* + * This function returns the extended AIBs a guest can use. + * When + * 1) hypervisor doesn''t support extended ABIs (EVTCHNOP_* not implemented) + * 2) hypervisor supports extended ABIs but this guest cannot use them + * it returns EVTCHN_EXTENDED_NONE + * otherwise it returns a or''ed bitmap of enabled ABIs + */ +uint64_t xen_event_channel_query_extended_abis(void) +{ + struct evtchn_query_extended_abis query; + int rc; + + memset(&query, 0, sizeof(query)); + + rc = HYPERVISOR_event_channel_op(EVTCHNOP_query_extended_abis, &query); + + if (rc < 0) { + printk(KERN_INFO + "Hypervisor does not support extended event channel ABIs."); + return EVTCHN_EXTENDED_NONE; + } + + printk(KERN_INFO "Hypervisor supports extended event channel ABIs.\n"); + + printk(KERN_INFO + "Extended event channel AIBs enabled for this guest:\n"); + if (query.abis == EVTCHN_EXTENDED_NONE /* 0 */) + printk(KERN_INFO " None (disabled by host administrator)\n"); + else { + if (query.abis & EVTCHN_EXTENDED_L3) + printk(KERN_INFO " 3-level event channel ABI\n"); + } + + return query.abis; +} + const struct evtchn_ops evtchn_l2_ops = { .unmask = __unmask_local_port_l2, .debug_interrupt = xen_debug_interrupt_l2, diff --git a/include/xen/events.h b/include/xen/events.h index 24cf421..49d54ac 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -114,4 +114,7 @@ void xen_init_IRQ(void); extern unsigned int xen_nr_event_channels; extern uint64_t xen_evtchn_extended; +/* Query hypervisor for supported / enabled extended event channel ABIs. */ +uint64_t xen_event_channel_query_extended_abis(void); + #endif /* _XEN_EVENTS_H */ -- 1.7.10.4
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 12/14] xen: introduce xen_event_channel_register_3level
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 161 insertions(+), 11 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 270821d..6bb9a47 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -201,6 +201,16 @@ static bool (*pirq_needs_eoi)(unsigned irq); */ #define EVTCHN_WORD_BITORDER (sizeof(xen_ulong_t) == 8 ? 6 : 5) /* + * If we use 3-level event channel and the event word size is 64 bits, we have + * 256k event channels in total, for 32 bits, we have 32k event channels in + * total. A page (4K) can represent 4096 * 8 = 32k event channels. So we can + * calculate pages needed for 3-level event channels is 1 page for 32 bits and + * 8 pages for 64 bits. + */ +#define BITMAP_PG_ORDER (BITS_PER_EVTCHN_WORD == 64 ? 3 : 0) +#define BITMAP_NR_PAGES (BITMAP_PG_ORDER == 3 ? 8 : 1) + +/* * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t * array. Primarily to avoid long lines (hence the terse name). */ @@ -2146,6 +2156,115 @@ uint64_t xen_event_channel_query_extended_abis(void) return query.abis; } +static int xen_event_channel_register_3level_bitmaps(void) +{ + struct evtchn_register_3level reg; + int i; + int rc; + xen_ulong_t _evtchn_pending[EVTCHN_MAX_L3_PAGES]; + xen_ulong_t _evtchn_mask[EVTCHN_MAX_L3_PAGES]; + + /* + * can only register 3-level ABI in following states: + * a) no extended ABIs in use + * b) come from restore path which already has ABI set and + * pages allocated + */ + if (!(xen_evtchn_extended == EVTCHN_EXTENDED_NONE || + (xen_evtchn_extended == EVTCHN_EXTENDED_L3 && + evtchn_pending && evtchn_pending))) + return -EINVAL; + + /* + * If we come from restore path, we don''t need to allocate + * pages. 
+ */ + if (!evtchn_pending && !evtchn_mask) { + /* Get zeroed pages */ + evtchn_pending + (xen_ulong_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + BITMAP_PG_ORDER); + evtchn_mask + (xen_ulong_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + BITMAP_PG_ORDER); + if (!evtchn_pending || !evtchn_mask) { + free_pages((unsigned long)evtchn_pending, BITMAP_PG_ORDER); + free_pages((unsigned long)evtchn_mask, BITMAP_PG_ORDER); + evtchn_pending = NULL; + evtchn_mask = NULL; + rc = -ENOMEM; + goto err; + } + } + + memset(®, 0, sizeof(reg)); + + for (i = 0; i < BITMAP_NR_PAGES; i++) { + unsigned long offset = PAGE_SIZE * i; + _evtchn_pending[i] + arbitrary_virt_to_mfn( + (void *)((unsigned long)evtchn_pending+offset)); + _evtchn_mask[i] + arbitrary_virt_to_mfn( + (void *)((unsigned long)evtchn_mask+offset)); + } + + reg.cmd = REGISTER_BITMAPS; + reg.u.bitmaps.nr_pages = BITMAP_NR_PAGES; + reg.u.bitmaps.evtchn_pending = _evtchn_pending; + reg.u.bitmaps.evtchn_mask = _evtchn_mask; + + rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_3level, ®); + if (rc) { + free_pages((unsigned long)evtchn_pending, BITMAP_PG_ORDER); + free_pages((unsigned long)evtchn_mask, BITMAP_PG_ORDER); + evtchn_pending = NULL; + evtchn_mask = NULL; + } + +err: + return rc; +} + +int xen_event_channel_register_3level_l2selector(int cpu) +{ + struct evtchn_register_3level reg; + int rc; + + memset(®, 0, sizeof(reg)); + + reg.cmd = REGISTER_L2_SELECTOR; + + reg.u.l2_selector.cpu_id = cpu; + reg.u.l2_selector.mfn + arbitrary_virt_to_mfn(&per_cpu(evtchn_sel_l2, cpu)); + reg.u.l2_selector.offset + offset_in_page(&per_cpu(evtchn_sel_l2, cpu)); + + rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_3level, ®); + + if (rc == -EBUSY) /* already registered, this can happen in hotplug */ + return 0; + + if (!rc) + per_cpu(evtchn_sel, cpu)[1] = per_cpu(evtchn_sel_l2, cpu); + + return rc; +} + +static int xen_event_channel_register_3level(void) +{ + int rc; + + rc = xen_event_channel_register_3level_bitmaps(); + if (rc) + return rc; + + rc = xen_event_channel_register_3level_l2selector(0); + + return rc; +} + const struct evtchn_ops evtchn_l2_ops = { .unmask = __unmask_local_port_l2, .debug_interrupt = xen_debug_interrupt_l2, @@ -2158,6 +2277,47 @@ const struct evtchn_ops evtchn_l3_ops = { .do_upcall = __xen_evtchn_do_upcall_l3 }; +void xen_set_event_channel_extended(uint64_t abi) +{ + struct shared_info *s = HYPERVISOR_shared_info; + int cpu; + + switch (abi) { + case EVTCHN_EXTENDED_NONE: + evtchn_pending = s->evtchn_pending; + evtchn_mask = s->evtchn_mask; + for_each_possible_cpu(cpu) { + struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu); + per_cpu(evtchn_sel, cpu)[0] + &vcpu_info->evtchn_pending_sel; + } + xen_evtchn_extended = EVTCHN_EXTENDED_NONE; + xen_nr_event_channels = NR_EVENT_CHANNELS_L2; + eops = &evtchn_l2_ops; + printk(KERN_INFO "Using 2-level event channel ABI.\n"); + break; + case EVTCHN_EXTENDED_L3: + /* evtchn_pending/mask already set */ + for_each_possible_cpu(cpu) { + struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu); + per_cpu(evtchn_sel, cpu)[0] + &vcpu_info->evtchn_pending_sel; + per_cpu(evtchn_sel, cpu)[1] + per_cpu(evtchn_sel_l2, cpu); + } + xen_evtchn_extended = EVTCHN_EXTENDED_L3; + xen_nr_event_channels = NR_EVENT_CHANNELS_L3; + eops = &evtchn_l3_ops; + printk(KERN_INFO "Using 3-level event channel ABI.\n"); + break; + default: + printk(KERN_EMERG + "Trying to set unsupported event channel ABI %llx\n", + abi); + BUG(); + } +} + static int __cpuinit xen_events_notifier_cb(struct notifier_block *self, unsigned 
long action, void *hcpu) @@ -2197,18 +2357,8 @@ void __init xen_init_IRQ(void) { int i; int cpu; - struct shared_info *s = HYPERVISOR_shared_info; - - evtchn_pending = s->evtchn_pending; - evtchn_mask = s->evtchn_mask; - for_each_possible_cpu(cpu) { - struct vcpu_info *vcpu_info = per_cpu(xen_vcpu, cpu); - per_cpu(evtchn_sel, cpu)[0] = &vcpu_info->evtchn_pending_sel; - } - xen_evtchn_extended = EVTCHN_EXTENDED_NONE; - xen_nr_event_channels = NR_EVENT_CHANNELS_L2; - eops = &evtchn_l2_ops; + xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE); evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq), GFP_KERNEL); -- 1.7.10.4
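The BITMAP_PG_ORDER / BITMAP_NR_PAGES constants used above follow from the event counts: one bit per event, so a 64-bit guest needs 256k/8 = 32 KiB (8 pages) per bitmap and a 32-bit guest needs 32k/8 = 4 KiB (1 page). A throwaway check of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* 4 KiB pages assumed */

int main(void)
{
	/* 32-bit guest: 32k events; 64-bit guest: 256k events. */
	unsigned long nr_events[] = { 32768UL, 262144UL };
	int i;

	for (i = 0; i < 2; i++) {
		unsigned long bytes = nr_events[i] / 8;   /* one bit per event */
		printf("%lu events -> %lu bytes -> %lu page(s) per bitmap\n",
		       nr_events[i], bytes, bytes / PAGE_SIZE);
	}
	return 0;
}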
Wei Liu
2013-Mar-19 15:22 UTC
[RFC PATCH V5 13/14] xen: introduce xen_event_channel_register_extended
Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- drivers/xen/events.c | 26 ++++++++++++++++++++++++++ include/xen/events.h | 6 ++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6bb9a47..6f21f27 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -2265,6 +2265,32 @@ static int xen_event_channel_register_3level(void) return rc; } +int xen_event_channel_register_extended(uint64_t abi) +{ + int rc = -EINVAL; + + switch (abi) { + case EVTCHN_EXTENDED_L3: + rc = xen_event_channel_register_3level(); + if (rc == 0) + printk(KERN_INFO + "Register 3-level event channel succeed.\n"); + else + printk(KERN_INFO + "Register 3-level event channel failed: %d\n", + rc); + break; + default: + printk(KERN_EMERG + "Trying to register unsupported event channel ABI %llx\n", + abi); + BUG(); + } + + return rc; +} + + const struct evtchn_ops evtchn_l2_ops = { .unmask = __unmask_local_port_l2, .debug_interrupt = xen_debug_interrupt_l2, diff --git a/include/xen/events.h b/include/xen/events.h index 49d54ac..a6a6024 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -117,4 +117,10 @@ extern uint64_t xen_evtchn_extended; /* Query hypervisor for supported / enabled extended event channel ABIs. */ uint64_t xen_event_channel_query_extended_abis(void); +/* Set extended event channel to "abi". */ +void xen_set_event_channel_extended(uint64_t abi); + +/* Register extended event channel. */ +int xen_event_channel_register_extended(uint64_t abi); + #endif /* _XEN_EVENTS_H */ -- 1.7.10.4
CPU hotplug is supported. Signed-off-by: Wei Liu <wei.liu2@citrix.com> --- arch/x86/xen/enlighten.c | 12 ++++++++++++ drivers/xen/events.c | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3556678..18edf66 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -213,6 +213,18 @@ void xen_vcpu_restore(void) HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) BUG(); } + + /* + * If we use any extended event channel ABI, should try to + * re-setup it in restore path. Currently only 3-level ABI is + * implemented, so simplify the code a bit. + */ + if (xen_evtchn_extended & EVTCHN_EXTENDED_L3) { + int rc; + rc = xen_event_channel_register_extended(EVTCHN_EXTENDED_L3); + if (rc) + xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE); + } } static void __init xen_banner(void) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6f21f27..b7e5bc1 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -2368,6 +2368,11 @@ static int __cpuinit xen_events_notifier_cb(struct notifier_block *self, rc = NOTIFY_OK; } } + if (rc == NOTIFY_OK && + xen_evtchn_extended & EVTCHN_EXTENDED_L3) { + rc = xen_event_channel_register_3level_l2selector(cpu); + rc = (rc == 0 ? NOTIFY_OK : NOTIFY_BAD); + } break; default: break; @@ -2383,8 +2388,23 @@ void __init xen_init_IRQ(void) { int i; int cpu; + uint64_t evtchn_ext_abis; + int rc, fallback_to_default_evtchn = 0; + + evtchn_ext_abis = xen_event_channel_query_extended_abis(); + + if (evtchn_ext_abis == EVTCHN_EXTENDED_NONE) + fallback_to_default_evtchn = 1; + else if (evtchn_ext_abis & EVTCHN_EXTENDED_L3) { + rc = xen_event_channel_register_extended(EVTCHN_EXTENDED_L3); + if (rc == 0) + xen_set_event_channel_extended(EVTCHN_EXTENDED_L3); + else + fallback_to_default_evtchn = 1; + } - xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE); + if (fallback_to_default_evtchn) + xen_set_event_channel_extended(EVTCHN_EXTENDED_NONE); evtchn_to_irq = kcalloc(xen_nr_event_channels, sizeof(*evtchn_to_irq), GFP_KERNEL); -- 1.7.10.4