This patch series implements 3-level event channel routines in Xen. The
implementation is as follows:

 * Add a field evtchn_level in struct domain.
 * Add pointers in struct domain to point to the 3-level shared arrays.
 * Add a 2nd-level selector in struct vcpu.
 * Add a new op in do_event_channel_op to register n-level evtchns.

The exposed registration interface is extensible, but only 3-level is
supported at the moment. The routines for 3-level evtchns are more or less
the same as the 2-level ones.
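For orientation, here is a minimal sketch (not part of the series) of how a
port number decomposes under this layout on a 64-bit build with 4K pages;
the constants mirror the ones introduced by the patches below.

    /* Illustrative only: 3-level decomposition of an event channel port. */
    #include <stdio.h>

    #define BITS_PER_WORD     64
    #define EVTCHNS_PER_PAGE  (4096 * 8)   /* bits in one leaf page */

    int main(void)
    {
        unsigned int port = 100000;        /* any port below 256k */

        unsigned int l1_bit = port / (BITS_PER_WORD * BITS_PER_WORD);
        unsigned int l2_bit = port / BITS_PER_WORD;
        unsigned int page   = port / EVTCHNS_PER_PAGE;
        unsigned int offset = port % EVTCHNS_PER_PAGE;

        /* port 100000 -> l1 bit 24, l2 bit 1562, page 3, offset 1696 */
        printf("l1 %u, l2 %u, page %u, offset %u\n",
               l1_bit, l2_bit, page, offset);
        return 0;
    }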
Wei Liu
2012-Dec-31 18:22 UTC
[RFC PATCH 1/3] Add a field in struct domain to indicate evtchn level.
From: Wei Liu <liuw@liuw.name>

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 xen/common/event_channel.c |    1 +
 xen/include/xen/sched.h    |   15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 89f0ca7..87e422e 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -1173,6 +1173,7 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
 int evtchn_init(struct domain *d)
 {
     spin_lock_init(&d->event_lock);
+    d->evtchn_level = 2;
     if ( get_free_port(d) != 0 )
         return -EINVAL;
     evtchn_from_port(d, 0)->state = ECS_RESERVED;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 6c55039..1c43e0a 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -50,7 +50,19 @@ extern struct domain *dom0;
 #else
 #define BITS_PER_EVTCHN_WORD(d) (has_32bit_shinfo(d) ? 32 : BITS_PER_LONG)
 #endif
-#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
+#define MAX_EVTCHNS_L2(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
+#define MAX_EVTCHNS_L3(d) (MAX_EVTCHNS_L2(d) * BITS_PER_EVTCHN_WORD(d))
+#define MAX_EVTCHNS(d) ({ int __v = 0;                  \
+            switch ( d->evtchn_level ) {                \
+            case 2:                                     \
+                __v = MAX_EVTCHNS_L2(d); break;         \
+            case 3:                                     \
+                __v = MAX_EVTCHNS_L3(d); break;         \
+            default:                                    \
+                BUG();                                  \
+            };                                          \
+            __v;})
+
 #define EVTCHNS_PER_BUCKET 128
 #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
 
@@ -262,6 +274,7 @@ struct domain
     /* Event channel information. */
     struct evtchn   *evtchn[NR_EVTCHN_BUCKETS];
     spinlock_t       event_lock;
+    unsigned int     evtchn_level;
 
     struct grant_table *grant_table;
-- 
1.7.10.4
Wei Liu
2012-Dec-31 18:22 UTC
[RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
From: Wei Liu <liuw@liuw.name>

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 xen/common/event_channel.c |    8 ++++++++
 xen/include/xen/sched.h    |    4 ++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 87e422e..9898f8e 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -1172,6 +1172,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
 
 int evtchn_init(struct domain *d)
 {
+    d->evtchn = (struct evtchn **)
+        xzalloc_array(struct evtchn *, NR_EVTCHN_BUCKETS);
+
+    if ( !d->evtchn )
+        return -ENOMEM;
+
     spin_lock_init(&d->event_lock);
     d->evtchn_level = 2;
     if ( get_free_port(d) != 0 )
@@ -1215,6 +1221,8 @@ void evtchn_destroy(struct domain *d)
     spin_unlock(&d->event_lock);
 
     clear_global_virq_handlers(d);
+
+    xfree(d->evtchn);
 }
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1c43e0a..5f23213 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -63,7 +63,7 @@ extern struct domain *dom0;
             };                                          \
             __v;})
 
-#define EVTCHNS_PER_BUCKET 128
+#define EVTCHNS_PER_BUCKET 512
 #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
 
 struct evtchn
@@ -272,7 +272,7 @@ struct domain
     spinlock_t       rangesets_lock;
 
     /* Event channel information. */
-    struct evtchn   *evtchn[NR_EVTCHN_BUCKETS];
+    struct evtchn  **evtchn;
     spinlock_t       event_lock;
     unsigned int     evtchn_level;
 
-- 
1.7.10.4
Wei Liu
2012-Dec-31 18:22 UTC
[RFC PATCH 3/3] Implement 3-level event channel routines.
From: Wei Liu <liuw@liuw.name>

Add some fields in struct domain to hold pointers to the 3-level shared
arrays. Also add a per-cpu second level selector in struct vcpu.

These structures are mapped by a new evtchn op. The guest should fall back
to the 2-level event channels if mapping fails.

The routines are more or less the same as the 2-level ones.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 xen/arch/x86/domain.c              |    2 +
 xen/arch/x86/irq.c                 |    2 +-
 xen/common/event_channel.c         |  520 ++++++++++++++++++++++++++++++++++--
 xen/common/keyhandler.c            |   12 +-
 xen/common/schedule.c              |    2 +-
 xen/include/public/event_channel.h |   24 ++
 xen/include/public/xen.h           |   17 +-
 xen/include/xen/event.h            |    4 +
 xen/include/xen/sched.h            |   12 +-
 9 files changed, 565 insertions(+), 30 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 7a07c06..b457b00 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2058,6 +2058,8 @@ int domain_relinquish_resources(struct domain *d)
         }
     }
 
+    event_channel_unmap_nlevel(d);
+
     d->arch.relmem = RELMEM_shared;
     /* fallthrough */
 
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 05cede5..d517e39 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1452,7 +1452,7 @@ int pirq_guest_unmask(struct domain *d)
         {
             pirq = pirqs[i]->pirq;
             if ( pirqs[i]->masked &&
-                 !test_bit(pirqs[i]->evtchn, &shared_info(d, evtchn_mask)) )
+                 !evtchn_is_masked(d, pirqs[i]->evtchn) )
                 pirq_guest_eoi(pirqs[i]);
         }
     } while ( ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
 
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 9898f8e..fb3a7b4 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -27,6 +27,7 @@
 #include <xen/guest_access.h>
 #include <xen/keyhandler.h>
 #include <asm/current.h>
+#include <xen/paging.h>
 
 #include <public/xen.h>
 #include <public/event_channel.h>
@@ -413,6 +414,85 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
     return rc;
 }
 
+static inline int __evtchn_is_masked_l2(struct domain *d, int port)
+{
+    return test_bit(port, &shared_info(d, evtchn_mask));
+}
+
+static inline int __evtchn_is_masked_l3(struct domain *d, int port)
+{
+    return test_bit(port % EVTCHNS_PER_PAGE,
+                    d->evtchn_mask[port / EVTCHNS_PER_PAGE]);
+}
+
+int evtchn_is_masked(struct domain *d, int port)
+{
+    switch ( d->evtchn_level )
+    {
+    case 2:
+        return __evtchn_is_masked_l2(d, port);
+    case 3:
+        return __evtchn_is_masked_l3(d, port);
+    default:
+        printk(" %s: unknown event channel level %d for domain %d \n",
+               __FUNCTION__, d->evtchn_level, d->domain_id);
+        return 1;
+    }
+}
+
+static inline int __evtchn_is_pending_l2(struct domain *d, int port)
+{
+    return test_bit(port, &shared_info(d, evtchn_pending));
+}
+
+static inline int __evtchn_is_pending_l3(struct domain *d, int port)
+{
+    return test_bit(port % EVTCHNS_PER_PAGE,
+                    d->evtchn_pending[port / EVTCHNS_PER_PAGE]);
+}
+
+int evtchn_is_pending(struct domain *d, int port)
+{
+    switch ( d->evtchn_level )
+    {
+    case 2:
+        return __evtchn_is_pending_l2(d, port);
+    case 3:
+        return __evtchn_is_pending_l3(d, port);
+    default:
+        printk(" %s: unknown event channel level %d for domain %d\n",
+               __FUNCTION__, d->evtchn_level, d->domain_id);
+        return 0;
+    }
+}
+
+static inline void __evtchn_clear_pending_l2(struct domain *d, int port)
+{
+    clear_bit(port, &shared_info(d, evtchn_pending));
+}
+
+static inline void __evtchn_clear_pending_l3(struct domain *d, int port)
+{
+    clear_bit(port % EVTCHNS_PER_PAGE,
+              d->evtchn_pending[port / EVTCHNS_PER_PAGE]);
+}
+
+static void evtchn_clear_pending(struct domain *d, int port)
+{
+    switch ( d->evtchn_level )
+    {
+    case 2:
+        __evtchn_clear_pending_l2(d, port);
+        break;
+    case 3:
+        __evtchn_clear_pending_l3(d, port);
+        break;
+    default:
+        printk("Cannot clear pending for %d level event channel"
+               " for domain %d, port %d\n",
+               d->evtchn_level, d->domain_id, port);
+    }
+}
 
 static long __evtchn_close(struct domain *d1, int port1)
 {
@@ -529,7 +609,7 @@ static long __evtchn_close(struct domain *d1, int port1)
     }
 
     /* Clear pending event to avoid unexpected behavior on re-bind. */
-    clear_bit(port1, &shared_info(d1, evtchn_pending));
+    evtchn_clear_pending(d1, port1);
 
     /* Reset binding to vcpu0 when the channel is freed. */
     chn1->state          = ECS_FREE;
@@ -606,16 +686,15 @@ int evtchn_send(struct domain *d, unsigned int lport)
         ret = -EINVAL;
     }
 
-out:
+ out:
     spin_unlock(&ld->event_lock);
 
     return ret;
 }
 
-static void evtchn_set_pending(struct vcpu *v, int port)
+static void __evtchn_set_pending_l2(struct vcpu *v, int port)
 {
     struct domain *d = v->domain;
-    int vcpuid;
 
     /*
      * The following bit operations must happen in strict order.
@@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
     {
         vcpu_mark_events_pending(v);
     }
-
+}
+
+static void __evtchn_set_pending_l3(struct vcpu *v, int port)
+{
+    struct domain *d = v->domain;
+
+    int page_no = port / EVTCHNS_PER_PAGE;
+    int offset = port % EVTCHNS_PER_PAGE;
+    int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
+    int l2cb = BITS_PER_EVTCHN_WORD(d);
+
+    if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
+        return;
+
+    if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
+         !test_and_set_bit(port / l2cb,
+                           v->evtchn_pending_sel_l2) &&
+         !test_and_set_bit(port / l1cb,
+                           &vcpu_info(v, evtchn_pending_sel)) )
+    {
+        vcpu_mark_events_pending(v);
+    }
+}
+
+static void evtchn_set_pending(struct vcpu *v, int port)
+{
+    struct domain *d = v->domain;
+    int vcpuid;
+
+    switch ( d->evtchn_level )
+    {
+    case 2:
+        __evtchn_set_pending_l2(v, port);
+        break;
+    case 3:
+        __evtchn_set_pending_l3(v, port);
+        break;
+    default:
+        printk("Cannot set pending for %d level event channel"
+               " for domain %d, port %d\n",
+               d->evtchn_level, d->domain_id, port);
+        return;
+    }
+
     /* Check if some VCPU might be polling for this event. */
     if ( likely(bitmap_empty(d->poll_mask, d->max_vcpus)) )
         return;
@@ -916,21 +1038,16 @@ long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id)
 }
 
 
-int evtchn_unmask(unsigned int port)
+static int __evtchn_unmask_l2(unsigned int port)
 {
     struct domain *d = current->domain;
     struct vcpu   *v;
 
-    ASSERT(spin_is_locked(&d->event_lock));
-
-    if ( unlikely(!port_is_valid(d, port)) )
-        return -EINVAL;
-
     v = d->vcpu[evtchn_from_port(d, port)->notify_vcpu_id];
 
     /*
      * These operations must happen in strict order. Based on
-     * include/xen/event.h:evtchn_set_pending().
+     * __evtchn_set_pending_l2().
      */
     if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) &&
          test_bit          (port, &shared_info(d, evtchn_pending)) &&
@@ -943,6 +1060,58 @@ int evtchn_unmask(unsigned int port)
     return 0;
 }
 
+static int __evtchn_unmask_l3(unsigned int port)
+{
+    struct domain *d = current->domain;
+    struct vcpu   *v;
+
+    int page_no = port / EVTCHNS_PER_PAGE;
+    int offset = port % EVTCHNS_PER_PAGE;
+    int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
+    int l2cb = BITS_PER_EVTCHN_WORD(d);
+
+    v = d->vcpu[evtchn_from_port(d, port)->notify_vcpu_id];
+
+    if ( test_and_clear_bit(offset, d->evtchn_mask[page_no]) &&
+         test_bit(offset, d->evtchn_pending[page_no]) &&
+         !test_and_set_bit(port / l2cb,
+                           v->evtchn_pending_sel_l2) &&
+         !test_and_set_bit(port / l1cb,
+                           &vcpu_info(v, evtchn_pending_sel)) )
+    {
+        vcpu_mark_events_pending(v);
+    }
+
+    return 0;
+}
+
+int evtchn_unmask(unsigned int port)
+{
+    struct domain *d = current->domain;
+    int rc = -EINVAL;
+
+    ASSERT(spin_is_locked(&d->event_lock));
+
+    if ( unlikely(!port_is_valid(d, port)) )
+        return -EINVAL;
+
+    switch ( d->evtchn_level )
+    {
+    case 2:
+        rc = __evtchn_unmask_l2(port);
+        break;
+    case 3:
+        rc = __evtchn_unmask_l3(port);
+        break;
+    default:
+        printk("Cannot unmask port %d for %d level event channel"
+               " for domain %d\n", port,
+               d->evtchn_level, d->domain_id);
+    }
+
+    return rc;
+}
+
 
 static long evtchn_reset(evtchn_reset_t *r)
 {
@@ -969,6 +1138,290 @@ out:
     return rc;
 }
 
+static void __unmap_l2_sel(struct vcpu *v)
+{
+    unsigned long mfn;
+
+    if ( v->evtchn_pending_sel_l2 != 0 )
+    {
+        unmap_domain_page_global(v->evtchn_pending_sel_l2);
+        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
+        put_page_and_type(mfn_to_page(mfn));
+
+        v->evtchn_pending_sel_l2 = 0;
+    }
+}
+
+static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
+{
+    void *mapping;
+    int rc = -EINVAL;
+    struct page_info *page;
+    struct domain *d = v->domain;
+
+    if ( off >= PAGE_SIZE )
+        return rc;
+
+    page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
+    if ( !page )
+        goto out;
+
+    if ( !get_page_type(page, PGT_writable_page) )
+    {
+        put_page(page);
+        goto out;
+    }
+
+    mapping = __map_domain_page_global(page);
+    if ( mapping == NULL )
+    {
+        put_page_and_type(page);
+        rc = -ENOMEM;
+        goto out;
+    }
+
+    v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);
+    rc = 0;
+
+ out:
+    return rc;
+}
+
+
+static void __unmap_l3_arrays(struct domain *d)
+{
+    int i;
+    unsigned long mfn;
+
+    for ( i = 0; i < MAX_L3_PAGES; i++ )
+    {
+        if ( d->evtchn_pending[i] != 0 )
+        {
+            unmap_domain_page_global(d->evtchn_pending[i]);
+            mfn = virt_to_mfn(d->evtchn_pending[i]);
+            put_page_and_type(mfn_to_page(mfn));
+            d->evtchn_pending[i] = 0;
+        }
+        if ( d->evtchn_mask[i] != 0 )
+        {
+            unmap_domain_page_global(d->evtchn_mask[i]);
+            mfn = virt_to_mfn(d->evtchn_mask[i]);
+            put_page_and_type(mfn_to_page(mfn));
+            d->evtchn_mask[i] = 0;
+        }
+    }
+}
+
+static int __map_l3_arrays(struct domain *d, unsigned long *pending,
+                           unsigned long *mask)
+{
+    int i;
+    void *pending_mapping, *mask_mapping;
+    struct page_info *pending_page, *mask_page;
+    unsigned long pending_gfn, mask_gfn;
+    int rc = -EINVAL;
+
+    for ( i = 0;
+          i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
+          i++ )
+    {
+        pending_gfn = pending[i];
+        mask_gfn = mask[i];
+
+        pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
+        if ( !pending_page )
+            goto err;
+
+        mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
+        if ( !mask_page )
+        {
+            put_page(pending_page);
+            goto err;
+        }
+
+        if ( !get_page_type(pending_page, PGT_writable_page) )
+        {
+            put_page(pending_page);
+            put_page(mask_page);
+            goto err;
+        }
+
+        if ( !get_page_type(mask_page, PGT_writable_page) )
+        {
+            put_page_and_type(pending_page);
+            put_page(mask_page);
+            goto err;
+        }
+
+        pending_mapping = __map_domain_page_global(pending_page);
+        if ( !pending_mapping )
+        {
+            put_page_and_type(pending_page);
+            put_page_and_type(mask_page);
+            rc = -ENOMEM;
+            goto err;
+        }
+
+        mask_mapping = __map_domain_page_global(mask_page);
+        if ( !mask_mapping )
+        {
+            unmap_domain_page_global(pending_mapping);
+            put_page_and_type(pending_page);
+            put_page_and_type(mask_page);
+            rc = -ENOMEM;
+            goto err;
+        }
+
+        d->evtchn_pending[i] = pending_mapping;
+        d->evtchn_mask[i] = mask_mapping;
+    }
+
+    rc = 0;
+
+err:
+    return rc;
+}
+
+static void __evtchn_unmap_all_3level(struct domain *d)
+{
+    struct vcpu *v;
+    /* This is called when destroying a domain, so no pausing... */
+    for_each_vcpu ( d, v )
+        __unmap_l2_sel(v);
+    __unmap_l3_arrays(d);
+}
+
+void event_channel_unmap_nlevel(struct domain *d)
+{
+    switch ( d->evtchn_level )
+    {
+    case 3:
+        __evtchn_unmap_all_3level(d);
+        break;
+    default:
+        break;
+    }
+}
+
+static void __evtchn_migrate_bitmap_l3(struct domain *d)
+{
+    struct vcpu *v;
+
+    /* Easy way to migrate, just move existing selector down one level
+     * then copy the pending array and mask array */
+    for_each_vcpu ( d, v )
+    {
+        memcpy(&v->evtchn_pending_sel_l2[0],
+               &vcpu_info(v, evtchn_pending_sel),
+               sizeof(vcpu_info(v, evtchn_pending_sel)));
+
+        memset(&vcpu_info(v, evtchn_pending_sel), 0,
+               sizeof(vcpu_info(v, evtchn_pending_sel)));
+
+        set_bit(0, &vcpu_info(v, evtchn_pending_sel));
+    }
+
+    memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending),
+           sizeof(shared_info(d, evtchn_pending)));
+    memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask),
+           sizeof(shared_info(d, evtchn_mask)));
+}
+
+static long __evtchn_register_3level(struct evtchn_register_3level *r)
+{
+    struct domain *d = current->domain;
+    unsigned long mfns[r->nr_vcpus];
+    unsigned long offsets[r->nr_vcpus];
+    unsigned char was_up[r->nr_vcpus];
+    int rc, i;
+    struct vcpu *v;
+
+    if ( d->evtchn_level == 3 )
+        return -EINVAL;
+
+    if ( r->nr_vcpus > d->max_vcpus )
+        return -EINVAL;
+
+    for ( i = 0; i < MAX_L3_PAGES; i++ )
+        if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
+            return -EINVAL;
+
+    for_each_vcpu ( d, v )
+        if ( v->evtchn_pending_sel_l2 )
+            return -EINVAL;
+
+    if ( copy_from_user(mfns, r->l2sel_mfn,
+                        sizeof(unsigned long)*r->nr_vcpus) )
+        return -EFAULT;
+
+    if ( copy_from_user(offsets, r->l2sel_offset,
+                        sizeof(unsigned long)*r->nr_vcpus) )
+        return -EFAULT;
+
+    /* put cpu offline */
+    for_each_vcpu ( d, v )
+    {
+        if ( v == current )
+            was_up[v->vcpu_id] = 1;
+        else
+            was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
+                                                   &v->pause_flags);
+    }
+
+    /* map evtchn pending array and evtchn mask array */
+    rc = __map_l3_arrays(d, r->evtchn_pending, r->evtchn_mask);
+    if ( rc )
+        goto out;
+
+    for_each_vcpu ( d, v )
+    {
+        if ( (rc = __map_l2_sel(v, mfns[v->vcpu_id], offsets[v->vcpu_id])) )
+        {
+            struct vcpu *v1;
+            for_each_vcpu ( d, v1 )
+                __unmap_l2_sel(v1);
+            __unmap_l3_arrays(d);
+            goto out;
+        }
+    }
+
+    /* Scan current bitmap and migrate all outstanding events to new bitmap */
+    __evtchn_migrate_bitmap_l3(d);
+
+    /* make sure all writes take effect before switching to new routines */
+    wmb();
+    d->evtchn_level = 3;
+
+ out:
+    /* bring cpus back online */
+    for_each_vcpu ( d, v )
+        if ( was_up[v->vcpu_id] &&
+             test_and_clear_bit(_VPF_down, &v->pause_flags) )
+            vcpu_wake(v);
+
+    return rc;
+}
+
+static long evtchn_register_nlevel(evtchn_register_nlevel_t *r)
+{
+    struct domain *d = current->domain;
+    int rc;
+
+    spin_lock(&d->event_lock);
+
+    switch ( r->level )
+    {
+    case 3:
+        rc = __evtchn_register_3level(&r->u.l3);
+        break;
+    default:
+        rc = -EINVAL;
+    }
+
+    spin_unlock(&d->event_lock);
+
+    return rc;
+}
 
 long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 {
@@ -1078,6 +1531,14 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
     }
 
+    case EVTCHNOP_register_nlevel: {
+        struct evtchn_register_nlevel reg;
+        if ( copy_from_guest(&reg, arg, 1) != 0 )
+            return -EFAULT;
+        rc = evtchn_register_nlevel(&reg);
+        break;
+    }
+
     default:
         rc = -ENOSYS;
         break;
@@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
     spin_unlock(&d->event_lock);
 }
 
-
 static void domain_dump_evtchn_info(struct domain *d)
 {
     unsigned int port;
@@ -1260,8 +1720,10 @@ static void domain_dump_evtchn_info(struct domain *d)
     bitmap_scnlistprintf(keyhandler_scratch, sizeof(keyhandler_scratch),
                          d->poll_mask, d->max_vcpus);
     printk("Event channel information for domain %d:\n"
+           "using %d-level event channel\n"
            "Polling vCPUs: {%s}\n"
-           "    port [p/m]\n", d->domain_id, keyhandler_scratch);
+           "    port [p/m]\n", d->domain_id, d->evtchn_level,
+           keyhandler_scratch);
 
     spin_lock(&d->event_lock);
 
@@ -1269,6 +1731,8 @@ static void domain_dump_evtchn_info(struct domain *d)
     {
         const struct evtchn *chn;
         char *ssid;
+        int page_no = port / EVTCHNS_PER_PAGE;
+        int offset = port % EVTCHNS_PER_PAGE;
 
         if ( !port_is_valid(d, port) )
             continue;
@@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
         if ( chn->state == ECS_FREE )
             continue;
 
-        printk("    %4u [%d/%d]: s=%d n=%d x=%d",
-               port,
-               !!test_bit(port, &shared_info(d, evtchn_pending)),
-               !!test_bit(port, &shared_info(d, evtchn_mask)),
-               chn->state, chn->notify_vcpu_id, chn->xen_consumer);
+        printk("    %4u", port);
+
+        switch ( d->evtchn_level )
+        {
+        case 2:
+            printk(" [%d/%d]:",
+                   !!test_bit(port, &shared_info(d, evtchn_pending)),
+                   !!test_bit(port, &shared_info(d, evtchn_mask)));
+            break;
+        case 3:
+            printk(" [%d/%d]:",
+                   !!test_bit(offset, d->evtchn_pending[page_no]),
+                   !!test_bit(offset, d->evtchn_mask[page_no]));
+            break;
+        default:
+            printk(" %s: unknown event channel level %d for domain %d \n",
+                   __FUNCTION__, d->evtchn_level, d->domain_id);
+            goto out;
+        }
+
+        printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
+               chn->xen_consumer);
 
         switch ( chn->state )
         {
@@ -1310,6 +1791,7 @@ static void domain_dump_evtchn_info(struct domain *d)
         }
     }
 
+ out:
     spin_unlock(&d->event_lock);
 }
 
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 2c5c230..294cca9 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
     {
         for_each_vcpu ( d, v )
         {
+            unsigned int bits = BITS_PER_EVTCHN_WORD(d);
+            for (int i = 2; i < d->evtchn_level; i++)
+                bits *= BITS_PER_EVTCHN_WORD(d);
             printk("Notifying guest %d:%d (virq %d, port %d, stat %d/%d/%d)\n",
                    d->domain_id, v->vcpu_id,
                    VIRQ_DEBUG, v->virq_to_evtchn[VIRQ_DEBUG],
-                   test_bit(v->virq_to_evtchn[VIRQ_DEBUG],
-                            &shared_info(d, evtchn_pending)),
-                   test_bit(v->virq_to_evtchn[VIRQ_DEBUG],
-                            &shared_info(d, evtchn_mask)),
-                   test_bit(v->virq_to_evtchn[VIRQ_DEBUG] /
-                            BITS_PER_EVTCHN_WORD(d),
+                   evtchn_is_pending(d, v->virq_to_evtchn[VIRQ_DEBUG]),
+                   evtchn_is_masked(d, v->virq_to_evtchn[VIRQ_DEBUG]),
+                   test_bit(v->virq_to_evtchn[VIRQ_DEBUG] / bits,
                             &vcpu_info(v, evtchn_pending_sel)));
             send_guest_vcpu_virq(v, VIRQ_DEBUG);
         }
     }
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index ae798c9..b676c9c 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
             goto out;
 
         rc = 0;
-        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
+        if ( evtchn_is_pending(d, port) )
             goto out;
     }
 
diff --git a/xen/include/public/event_channel.h b/xen/include/public/event_channel.h
index 07ff321..29cd6e9 100644
--- a/xen/include/public/event_channel.h
+++ b/xen/include/public/event_channel.h
@@ -71,6 +71,7 @@
 #define EVTCHNOP_bind_vcpu        8
 #define EVTCHNOP_unmask           9
 #define EVTCHNOP_reset           10
+#define EVTCHNOP_register_nlevel 11
 /* ` } */
 
 typedef uint32_t evtchn_port_t;
@@ -258,6 +259,29 @@ struct evtchn_reset {
 typedef struct evtchn_reset evtchn_reset_t;
 
 /*
+ * EVTCHNOP_register_nlevel: Register N level event channels.
+ * NOTES:
+ *  1. currently only 3-level is supported.
+ *  2. should fall back to basic 2-level if this call fails.
+ */
+#define MAX_L3_PAGES 8
+struct evtchn_register_3level {
+    unsigned long evtchn_pending[MAX_L3_PAGES];
+    unsigned long evtchn_mask[MAX_L3_PAGES];
+    unsigned long *l2sel_mfn;
+    unsigned long *l2sel_offset;
+    uint32_t nr_vcpus;
+};
+
+struct evtchn_register_nlevel {
+    uint32_t level;
+    union {
+        struct evtchn_register_3level l3;
+    } u;
+};
+typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
+
+/*
  * ` enum neg_errnoval
  * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
  * `
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 5593066..1d4ef2d 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
 
 /*
  * Event channel endpoints per domain:
+ * 2-level:
  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ * 3-level:
+ *  32k if a long is 32 bits; 256k if a long is 64 bits;
  */
-#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
+#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;   \
+            switch (x) {                                \
+            case 2:                                     \
+                __v = NR_EVENT_CHANNELS_L2; break;      \
+            case 3:                                     \
+                __v = NR_EVENT_CHANNELS_L3; break;      \
+            default:                                    \
+                BUG();                                  \
+            }                                           \
+            __v;})
+
 
 struct vcpu_time_info {
     /*
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index 71c3e92..e7cd6be 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -102,4 +102,8 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
         mb(); /* set blocked status /then/ caller does his work */      \
     } while ( 0 )
 
+extern void event_channel_unmap_nlevel(struct domain *d);
+int evtchn_is_masked(struct domain *d, int port);
+int evtchn_is_pending(struct domain *d, int port);
+
 #endif /* __XEN_EVENT_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 5f23213..ae78549 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -64,7 +64,8 @@ extern struct domain *dom0;
             __v;})
 
 #define EVTCHNS_PER_BUCKET 512
-#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
+#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
+#define EVTCHNS_PER_PAGE   (PAGE_SIZE * 8)
 
 struct evtchn
 {
@@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
 
 struct waitqueue_vcpu;
 
-struct vcpu 
+struct vcpu
 {
     int              vcpu_id;
 
@@ -112,6 +113,9 @@ struct vcpu
 
     vcpu_info_t     *vcpu_info;
 
+    /* For 3-level event channel */
+    unsigned long   *evtchn_pending_sel_l2;
+
     struct domain   *domain;
 
     struct vcpu     *next_in_list;
 
@@ -275,6 +279,10 @@ struct domain
     struct evtchn  **evtchn;
     spinlock_t       event_lock;
     unsigned int     evtchn_level;
+#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned long)/PAGE_SIZE)
+    unsigned long   *evtchn_pending[L3_PAGES];
+    unsigned long   *evtchn_mask[L3_PAGES];
+#undef L3_PAGES
 
     struct grant_table *grant_table;
 
-- 
1.7.10.4
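To make the new op concrete, here is a hypothetical guest-side sketch of a
registration call. Only struct evtchn_register_nlevel,
EVTCHNOP_register_nlevel and MAX_L3_PAGES come from the patch;
alloc_shared_gfn() and the caller-supplied selector arrays are made-up
stand-ins for whatever a guest kernel would actually provide.

    /* Hypothetical guest-side use of EVTCHNOP_register_nlevel. */
    static int register_3level(unsigned long *l2sel_mfn,
                               unsigned long *l2sel_offset,
                               uint32_t nr_vcpus)
    {
        struct evtchn_register_nlevel reg = { .level = 3 };
        int i, rc;

        /* One guest page of bits per slot, for pending and for mask. */
        for ( i = 0; i < MAX_L3_PAGES; i++ )
        {
            reg.u.l3.evtchn_pending[i] = alloc_shared_gfn();
            reg.u.l3.evtchn_mask[i]    = alloc_shared_gfn();
        }

        reg.u.l3.l2sel_mfn    = l2sel_mfn;    /* per-vCPU 2nd-level selectors */
        reg.u.l3.l2sel_offset = l2sel_offset; /* offsets within those pages */
        reg.u.l3.nr_vcpus     = nr_vcpus;

        rc = HYPERVISOR_event_channel_op(EVTCHNOP_register_nlevel, &reg);
        /* Per the header comment: fall back to the 2-level ABI on failure. */
        return rc;
    }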
David Vrabel
2013-Jan-02 11:11 UTC
Re: [RFC PATCH 1/3] Add a field in struct domain to indicate evtchn level.
On 31/12/12 18:22, Wei Liu wrote:
> From: Wei Liu <liuw@liuw.name>

The first patch needs to add the ABI documentation. It's not clear what
this is all for from just the changeset descriptions.

> diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
> index 89f0ca7..87e422e 100644
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -1173,6 +1173,7 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
>  int evtchn_init(struct domain *d)
>  {
>      spin_lock_init(&d->event_lock);
> +    d->evtchn_level = 2;

I'd be inclined to have something like:

    /* Original ABI: 2 level event channels */
    #define EVTCHN_LEVEL_DEFAULT 2
    ...
    d->evtchn_level = EVTCHN_LEVEL_DEFAULT;

> -#define MAX_EVTCHNS(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
> +#define MAX_EVTCHNS_L2(d) (BITS_PER_EVTCHN_WORD(d) * BITS_PER_EVTCHN_WORD(d))
> +#define MAX_EVTCHNS_L3(d) (MAX_EVTCHNS_L2(d) * BITS_PER_EVTCHN_WORD(d))
> +#define MAX_EVTCHNS(d) ({ int __v = 0;                  \
> +            switch ( d->evtchn_level ) {                \
> +            case 2:                                     \
> +                __v = MAX_EVTCHNS_L2(d); break;         \
> +            case 3:                                     \
> +                __v = MAX_EVTCHNS_L3(d); break;         \
> +            default:                                    \
> +                BUG();                                  \
> +            };                                          \
> +            __v;})

Here you BUG() if d->evtchn_level is bad, but in other places in later
patches with similar code you have a printk. This needs to be more
consistent.

On the Linux side you use a set of event channel ops instead of
repeatedly testing d->evtchn_level. Would this also be a better approach
for the Xen side?

David

[...]
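For concreteness, a minimal sketch of the ops-table idea David suggests,
assuming a new d->evtchn_ops field that is not part of this series; the
__evtchn_*_l{2,3} helpers are the ones from patch 3.

    struct evtchn_ops {
        int  (*is_pending)(struct domain *d, int port);
        int  (*is_masked)(struct domain *d, int port);
        void (*clear_pending)(struct domain *d, int port);
    };

    static const struct evtchn_ops evtchn_l2_ops = {
        .is_pending    = __evtchn_is_pending_l2,
        .is_masked     = __evtchn_is_masked_l2,
        .clear_pending = __evtchn_clear_pending_l2,
    };

    static const struct evtchn_ops evtchn_l3_ops = {
        .is_pending    = __evtchn_is_pending_l3,
        .is_masked     = __evtchn_is_masked_l3,
        .clear_pending = __evtchn_clear_pending_l3,
    };

    /* Set to &evtchn_l2_ops in evtchn_init(); switched to &evtchn_l3_ops
     * after a successful EVTCHNOP_register_nlevel. */
    int evtchn_is_masked(struct domain *d, int port)
    {
        return d->evtchn_ops->is_masked(d, port);
    }

This keeps the per-port hot paths free of switch statements and gives a
single place to reject an unsupported level.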
David Vrabel
2013-Jan-02 13:38 UTC
Re: [RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
On 31/12/12 18:22, Wei Liu wrote:
> From: Wei Liu <liuw@liuw.name>

The changeset description needs to say why you're increasing
EVTCHNS_PER_BUCKET. I can't tell why.

> diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
> index 87e422e..9898f8e 100644
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -1172,6 +1172,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport)
> 
>  int evtchn_init(struct domain *d)
>  {
> +    d->evtchn = (struct evtchn **)
> +        xzalloc_array(struct evtchn *, NR_EVTCHN_BUCKETS);

Don't need the cast here?

David
David Vrabel
2013-Jan-02 14:08 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On 31/12/12 18:22, Wei Liu wrote:
> From: Wei Liu <liuw@liuw.name>
> 
> Add some fields in struct domain to hold pointers to the 3-level shared
> arrays. Also add a per-cpu second level selector in struct vcpu.
> 
> These structures are mapped by a new evtchn op. The guest should fall
> back to the 2-level event channels if mapping fails.
> 
> The routines are more or less the same as the 2-level ones.
> 
> Signed-off-by: Wei Liu <wei.liu2@citrix.com>
[...]
> +int evtchn_is_masked(struct domain *d, int port)
> +{
> +    switch ( d->evtchn_level )
> +    {
> +    case 2:
> +        return __evtchn_is_masked_l2(d, port);
> +    case 3:
> +        return __evtchn_is_masked_l3(d, port);
> +    default:
> +        printk(" %s: unknown event channel level %d for domain %d \n",
> +               __FUNCTION__, d->evtchn_level, d->domain_id);
> +        return 1;
> +    }

Drop the printk? If d->evtchn_level is wrong this will spam the console,
and it BUGs elsewhere anyway. There are a bunch of other similar places.

> +static long __evtchn_register_3level(struct evtchn_register_3level *r)
> +{
[...]
> +    /* put cpu offline */
> +    for_each_vcpu ( d, v )
> +    {
> +        if ( v == current )
> +            was_up[v->vcpu_id] = 1;
> +        else
> +            was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
> +                                                   &v->pause_flags);
> +    }

Why offline the VCPUs? I think there needs to be a comment explaining why.

> +    /* Scan current bitmap and migrate all outstanding events to new bitmap */
> +    __evtchn_migrate_bitmap_l3(d);

This migrate seems racy. Won't events set after the migrate but before we
set the level below be lost?

Alternatively, if it's not racy because it's all protected with
d->event_lock, then the wmb() isn't required as the spin locks are
implicit barriers.

> +    /* make sure all writes take effect before switching to new routines */
> +    wmb();
> +    d->evtchn_level = 3;
[...]
> /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + *  1. currently only 3-level is supported.
> + *  2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8

This is a public header so this needs to be prefixed. e.g.

    #define EVTCHN_MAX_L3_PAGES 8

> +struct evtchn_register_3level {
> +    unsigned long evtchn_pending[MAX_L3_PAGES];
> +    unsigned long evtchn_mask[MAX_L3_PAGES];
> +    unsigned long *l2sel_mfn;
> +    unsigned long *l2sel_offset;

Should these unsigned longs be xen_pfn_t's?

> +    uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> +    uint32_t level;
> +    union {
> +        struct evtchn_register_3level l3;
> +    } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;

Does there need to be compat handling for these structures? As-is the
structures look like they do.

[...]
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \
[...]

You've not documented what 'x' is in this macro. Also name it 'level'.

David
Wei Liu
2013-Jan-02 14:27 UTC
Re: [RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
On Wed, 2013-01-02 at 13:38 +0000, David Vrabel wrote:
> On 31/12/12 18:22, Wei Liu wrote:
> > From: Wei Liu <liuw@liuw.name>
> 
> The changeset description needs to say why you're increasing
> EVTCHNS_PER_BUCKET. I can't tell why.

Here is the tedious maths. My thought was that it is not very interesting
to see this in a change log, so I dropped it. But I will add it when I
repost this series...

    #define EVTCHNS_PER_BUCKET ???
    #define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS_L3/EVTCHNS_PER_BUCKET)
    d->evtchn = xzalloc_array(struct evtchn *, NR_EVTCHN_BUCKETS);

We need to allow for 3-level evtchns, so NR_EVTCHN_BUCKETS is calculated
from NR_EVENT_CHANNELS_L3.

For a 64-bit build, NR_EVENT_CHANNELS_L3 is 256k. The original value of
EVTCHNS_PER_BUCKET is 128, which means NR_EVTCHN_BUCKETS = 2048, so
d->evtchn has size 2048 * 8 = 16KB = 4 pages. Given that only Dom0 or a
driver domain will need 3-level event channels, this is overkill for most
guests.

If we bump EVTCHNS_PER_BUCKET to 512, d->evtchn becomes 512 * 8 = 4KB =
1 page, which I think is more space efficient for most guests.

Of course, bumping EVTCHNS_PER_BUCKET adds overhead elsewhere: Xen
allocates a bucket's worth of struct evtchn at a time, so the extra
overhead is

    sizeof(struct evtchn) * (512 - 128) ~= 18 bytes * 384

So the conclusion is:

    1 page + sizeof(struct evtchn) * 384 < 4 pages

IMHO this is a good reason to bump EVTCHNS_PER_BUCKET to 512.

Wei.

[...]
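The arithmetic above can be checked mechanically; a throwaway sketch
assuming a 64-bit build (8-byte pointers, 4K pages, an ~18-byte struct
evtchn, as in the mail above):

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_l3 = 256 * 1024;         /* NR_EVENT_CHANNELS_L3 */
        unsigned long old_buckets = nr_l3 / 128;  /* 2048 */
        unsigned long new_buckets = nr_l3 / 512;  /*  512 */

        printf("old d->evtchn: %lu bytes (%lu pages)\n",
               old_buckets * 8, old_buckets * 8 / 4096);  /* 16384, 4 pages */
        printf("new d->evtchn: %lu bytes (%lu page)\n",
               new_buckets * 8, new_buckets * 8 / 4096);  /*  4096, 1 page  */
        printf("extra per-bucket overhead: ~%u bytes\n",
               18 * (512 - 128));                         /* ~6912 bytes    */
        return 0;
    }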
Wei Liu
2013-Jan-02 14:28 UTC
Re: [RFC PATCH 1/3] Add a field in struct domain to indicate evtchn level.
On Wed, 2013-01-02 at 11:11 +0000, David Vrabel wrote:
> On 31/12/12 18:22, Wei Liu wrote:
> > From: Wei Liu <liuw@liuw.name>
> 
> The first patch needs to add the ABI documentation.
> 
> It's not clear what this is all for from just the changeset
> descriptions.
> 
> I'd be inclined to have something like:
> 
>     /* Original ABI: 2 level event channels */
>     #define EVTCHN_LEVEL_DEFAULT 2
>     ...
>     d->evtchn_level = EVTCHN_LEVEL_DEFAULT;
> 
[...]
> > +#define MAX_EVTCHNS(d) ({ int __v = 0;                  \
> > +            switch ( d->evtchn_level ) {                \
> > +            case 2:                                     \
> > +                __v = MAX_EVTCHNS_L2(d); break;         \
> > +            case 3:                                     \
> > +                __v = MAX_EVTCHNS_L3(d); break;         \
> > +            default:                                    \
> > +                BUG();                                  \
> > +            };                                          \
> > +            __v;})
> 
> Here you BUG() if d->evtchn_level is bad, but in other places in later
> patches with similar code you have a printk. This needs to be more
> consistent.

Given that d->evtchn_level is only manipulated by the hypervisor, I think
a BUG() here is correct. I will add a BUG() after the printks in other
places if necessary.

If I switch from 'switch' statements to ops pointers, this problem will
automatically go away.

> On the Linux side you use a set of event channel ops instead of
> repeatedly testing d->evtchn_level. Would this also be a better
> approach for the Xen side?

I will have a look at this.

Wei.
Ian Campbell
2013-Jan-02 16:45 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Mon, 2012-12-31 at 18:22 +0000, Wei Liu wrote:
> diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
> index 9898f8e..fb3a7b4 100644
[...]
> +int evtchn_is_masked(struct domain *d, int port)
> +{
> +    switch ( d->evtchn_level )
> +    {
> +    case 2:
> +        return __evtchn_is_masked_l2(d, port);
> +    case 3:
> +        return __evtchn_is_masked_l3(d, port);
> +    default:
> +        printk(" %s: unknown event channel level %d for domain %d \n",
> +               __FUNCTION__, d->evtchn_level, d->domain_id);
> +        return 1;
> +    }
> +}

How much of this sort of thing goes away if we arrange for
d->evtchn_mask to point to &shared_info(d, evtchn_mask) when
evtchn_level == 2? (Likewise for pending etc.)

> +static void __evtchn_set_pending_l3(struct vcpu *v, int port)
> +{
> +    struct domain *d = v->domain;
> +
> +    int page_no = port / EVTCHNS_PER_PAGE;
> +    int offset = port % EVTCHNS_PER_PAGE;
> +    int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
> +    int l2cb = BITS_PER_EVTCHN_WORD(d);

What does the "cb" in these variables stand for?

> +    if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
> +        return;
> +
> +    if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
> +         !test_and_set_bit(port / l2cb,
> +                           v->evtchn_pending_sel_l2) &&
> +         !test_and_set_bit(port / l1cb,
> +                           &vcpu_info(v, evtchn_pending_sel)) )
> +    {
> +        vcpu_mark_events_pending(v);
> +    }
> +}

It doesn't seem like it would be too hard to merge this with the l2
version. Perhaps using a scheme similar to Linux's page tables, where a
per-level macro collapses into a nop (with the appropriate return).

If you do end up duplicating the function then you should duplicate the
comment about the importance of the ordering and the barriers etc. too.

You should write down the data structure somewhere. In particular I'm not
sure if l1 is at the top or the bottom; I think it's the top. I think it
would also be useful to be explicit about which l3 things correspond to
the fields in the shared info and vcpu info, and which is the new level.

> +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
> +{
[...]
> +    mapping = __map_domain_page_global(page);

I don't think you can validly use map_domain_page for a long lived
mapping. domain_page.h says "Allow temporary mapping of domain page
frames into Xen space.".

Are these arrays per-vcpu or shared? Because if they are per-vcpu you
don't need the global variant.

> +static int __map_l3_arrays(struct domain *d, unsigned long *pending,
> +                           unsigned long *mask)
> +{
[...]
> +        pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
> +        if ( !pending_page )

I think you need to initialise rc here and for each subsequent goto err.
The initial EINVAL is overwritten to ENOMEM below on the first iteration
and isn't reset back to EINVAL at the top of the loop.

> +            goto err;
[...]
> /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + *  1. currently only 3-level is supported.
> + *  2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8
> +struct evtchn_register_3level {
> +    unsigned long evtchn_pending[MAX_L3_PAGES];
> +    unsigned long evtchn_mask[MAX_L3_PAGES];
> +    unsigned long *l2sel_mfn;
> +    unsigned long *l2sel_offset;

You can't use bare pointers in a hypercall argument like this, you have
to use the GUEST_HANDLE stuff.

Please also document which are IN and OUT parameters (look at other
struct definitions for examples).

The evtchn_{pending,mask} can't be the actual pages, are they mfns?

> +    uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> +    uint32_t level;
> +    union {
> +        struct evtchn_register_3level l3;

Might be more extensible to N>3 if evtchn_{pending,mask} were also guest
handles rather than fixed size arrays?

> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \
[...]

xen/include/public is supposed to be ANSI C and I think the
#define ({...}) syntax is a gcc-ism. I think the ..._L{N} defines are
sufficient.

[...]
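Putting these interface comments together, the registration structure
might end up looking something like the following sketch (xen_pfn_t for
frame numbers, guest handles for the per-vCPU arrays, an EVTCHN_ prefix).
This is speculative, not part of the series:

    #define EVTCHN_MAX_L3_PAGES 8
    struct evtchn_register_3level {
        /* IN: GFNs of the guest pages backing the pending/mask bitmaps. */
        xen_pfn_t evtchn_pending[EVTCHN_MAX_L3_PAGES];
        xen_pfn_t evtchn_mask[EVTCHN_MAX_L3_PAGES];
        /* IN: per-vCPU 2nd-level selector locations, nr_vcpus entries each. */
        XEN_GUEST_HANDLE(xen_pfn_t) l2sel_mfn;
        XEN_GUEST_HANDLE(ulong)     l2sel_offset;
        /* IN: number of entries in the two arrays above. */
        uint32_t nr_vcpus;
    };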
Jan Beulich
2013-Jan-03 10:36 UTC
Re: [RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
>>> On 02.01.13 at 15:27, Wei Liu <Wei.Liu2@citrix.com> wrote:
> On Wed, 2013-01-02 at 13:38 +0000, David Vrabel wrote:
>> On 31/12/12 18:22, Wei Liu wrote:
>> > From: Wei Liu <liuw@liuw.name>
>>
>> The changeset description needs to say why you're increasing
>> EVTCHNS_PER_BUCKET. I can't tell why.
> 
> Here is the tedious maths. [...]
> 
> For a 64-bit build, NR_EVENT_CHANNELS_L3 is 256k. The original value of
> EVTCHNS_PER_BUCKET is 128, which means NR_EVTCHN_BUCKETS = 2048, so
> d->evtchn has size 2048 * 8 = 16KB = 4 pages. Given that only Dom0 or a
> driver domain will need 3-level event channels, this is overkill for
> most guests.
> 
> If we bump EVTCHNS_PER_BUCKET to 512, d->evtchn becomes 512 * 8 = 4KB =
> 1 page, which I think is more space efficient for most guests.

But not suitable for allocation through xmalloc() (as it'll end up doing
an order-1 page allocation). Either special case it here, or
BUILD_BUG_ON() the effective size being exactly a page, or we may look
into finally making xmalloc() return a page with no other tracking data
when asked for exactly a page's worth of bytes.

Jan

[...]
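One way to special-case the exactly-one-page allocation Jan describes; a
sketch only, with evtchn_alloc_buckets() as an invented helper:

    static int evtchn_alloc_buckets(struct domain *d)
    {
        /* The bucket-pointer array is expected to be exactly one page. */
        BUILD_BUG_ON(NR_EVTCHN_BUCKETS * sizeof(struct evtchn *) != PAGE_SIZE);

        d->evtchn = alloc_xenheap_page();  /* order-0, no xmalloc header */
        if ( !d->evtchn )
            return -ENOMEM;
        clear_page(d->evtchn);
        return 0;
    }

The matching free would then be free_xenheap_page(d->evtchn) in
evtchn_destroy() instead of xfree().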
Jan Beulich
2013-Jan-03 10:46 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 02.01.13 at 17:45, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Mon, 2012-12-31 at 18:22 +0000, Wei Liu wrote:
>> +    mapping = __map_domain_page_global(page);
> 
> I don't think you can validly use map_domain_page for a long lived
> mapping. domain_page.h says "Allow temporary mapping of domain page
> frames into Xen space.".

That's what distinguishes map_domain_page_global() from
map_domain_page().

> Are these arrays per-vcpu or shared? Because if they are per-vcpu you
> don't need the global variant.

You still would, as the non-global set of mappings that can be active at
a time is limited. But the leaf pages ought to be shared anyway.

Jan
Wei Liu
2013-Jan-03 11:33 UTC
Re: [RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
On Thu, 2013-01-03 at 10:36 +0000, Jan Beulich wrote:
> >>> On 02.01.13 at 15:27, Wei Liu <Wei.Liu2@citrix.com> wrote:
[...]
> > If we bump EVTCHNS_PER_BUCKET to 512, d->evtchn becomes 512 * 8 = 4KB =
> > 1 page, which I think is more space efficient for most guests.
> 
> But not suitable for allocation through xmalloc() (as it'll end up
> doing an order-1 page allocation). Either special case it here, or
> BUILD_BUG_ON() the effective size being exactly a page, or we may look
> into finally making xmalloc() return a page with no other tracking
> data when asked for exactly a page's worth of bytes.

Is alloc_xenheap_page() suitable? I presumed there is always some
tracking structure somewhere, so I didn't even take that into account
when I did the calculation. Can you give me some pointers? Thanks.

Wei.
Jan Beulich
2013-Jan-03 11:35 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote: > @@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port) > { > vcpu_mark_events_pending(v); > } > - > +} > + > +static void __evtchn_set_pending_l3(struct vcpu *v, int port) > +{ > + struct domain *d = v->domain; > + > + int page_no = port / EVTCHNS_PER_PAGE;Anything used as array index should be unsigned int.> + int offset = port % EVTCHNS_PER_PAGE; > + int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d); > + int l2cb = BITS_PER_EVTCHN_WORD(d);These always being powers of 2 (afaict), using divides rather than shifts is pretty inefficient (and I don''t think the compiler has reasonable chances to recognize this and do the replacement for you).> + > + if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) ) > + return; > + > + if ( !test_bit(offset, d->evtchn_mask[page_no]) && > + !test_and_set_bit(port / l2cb, > + v->evtchn_pending_sel_l2) && > + !test_and_set_bit(port / l1cb, > + &vcpu_info(v, evtchn_pending_sel)) )Indentation.> @@ -969,6 +1138,290 @@ out: > return rc; > } > > +static void __unmap_l2_sel(struct vcpu *v) > +{ > + unsigned long mfn; > + > + if ( v->evtchn_pending_sel_l2 != 0 ) > + { > + unmap_domain_page_global(v->evtchn_pending_sel_l2); > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);virt_to_mfn() is not valid on the output of map_domain_page{,_global}() (same further down). Yes, there is at least one example of this in the existing code, but that''s wrong too, and is getting eliminated by the 16Tb patch series I''m in the process of putting together.> + put_page_and_type(mfn_to_page(mfn)); > + > + v->evtchn_pending_sel_l2 = 0; > + } > +} > + > +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off) > +{ > + void *mapping; > + int rc = -EINVAL; > + struct page_info *page; > + struct domain *d = v->domain; > + > + if ( off >= PAGE_SIZE )Is e.g. off == PAGE_SIZE-1 really valid here? 
As you''re mapping guest memory here, _anything_ that is invalid must be rejected, or else you''re creating a security hole.> + return rc; > + > + page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); > + if ( !page ) > + goto out;Please be consistent here - always use return (as above) or (less preferred by me personally) always use goto.> + > + if ( !get_page_type(page, PGT_writable_page) ) > + { > + put_page(page); > + goto out; > + } > + > + mapping = __map_domain_page_global(page); > + if ( mapping == NULL ) > + { > + put_page_and_type(page); > + rc = -ENOMEM; > + goto out; > + } > + > + v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);Pointless cast?> +static int __map_l3_arrays(struct domain *d, unsigned long *pending, > + unsigned long *mask) > +{ > + int i; > + void *pending_mapping, *mask_mapping; > + struct page_info *pending_page, *mask_page; > + unsigned long pending_gfn, mask_gfn; > + int rc = -EINVAL; > + > + for ( i = 0; > + i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0; > + i++ ) > + { > + pending_gfn = pending[i]; > + mask_gfn = mask[i]; > + > + pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC); > + if ( !pending_page ) > + goto err; > + > + mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC); > + if ( !mask_page ) > + { > + put_page(pending_page); > + goto err; > + } > + > + if ( !get_page_type(pending_page, PGT_writable_page) ) > + { > + put_page(pending_page); > + put_page(mask_page); > + goto err; > + } > + > + if ( !get_page_type(mask_page, PGT_writable_page) ) > + { > + put_page_and_type(pending_page); > + put_page(mask_page); > + goto err; > + } > + > + pending_mapping = __map_domain_page_global(pending_page); > + if ( !pending_mapping ) > + { > + put_page_and_type(pending_page); > + put_page_and_type(mask_page); > + rc = -ENOMEM; > + goto err; > + } > + > + mask_mapping = __map_domain_page_global(mask_page); > + if ( !mask_mapping ) > + { > + unmap_domain_page_global(pending_mapping); > + put_page_and_type(pending_page); > + put_page_and_type(mask_page); > + rc = -ENOMEM; > + goto err;The error paths in this function constitute well over half of it - can this not be consolidated in some way?> +static void __evtchn_migrate_bitmap_l3(struct domain *d) > +{ > + struct vcpu *v; > + > + /* Easy way to migrate, just move existing selector down one level > + * then copy the pending array and mask array */ > + for_each_vcpu ( d, v ) > + { > + memcpy(&v->evtchn_pending_sel_l2[0], > + &vcpu_info(v, evtchn_pending_sel), > + sizeof(vcpu_info(v, evtchn_pending_sel))); > + > + memset(&vcpu_info(v, evtchn_pending_sel), 0, > + sizeof(vcpu_info(v, evtchn_pending_sel))); > + > + set_bit(0, &vcpu_info(v, evtchn_pending_sel)); > + } > + > + memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending), > + sizeof(shared_info(d, evtchn_pending))); > + memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask), > + sizeof(shared_info(d, evtchn_mask))); > +} > + > +static long __evtchn_register_3level(struct evtchn_register_3level *r) > +{ > + struct domain *d = current->domain; > + unsigned long mfns[r->nr_vcpus]; > + unsigned long offsets[r->nr_vcpus]; > + unsigned char was_up[r->nr_vcpus];This is an absolute no-go in the hypervisor: Did you consider what happens when a guest has a few hundred vCPU-s?> + int rc, i; > + struct vcpu *v; > + > + if ( d->evtchn_level == 3 ) > + return -EINVAL; > + > + if ( r->nr_vcpus > d->max_vcpus ) > + return -EINVAL; > + > + for ( i = 0; i < MAX_L3_PAGES; i++ ) > + if ( d->evtchn_pending[i] || d->evtchn_mask[i] ) > + 
> +    int rc, i;
> +    struct vcpu *v;
> +
> +    if ( d->evtchn_level == 3 )
> +        return -EINVAL;
> +
> +    if ( r->nr_vcpus > d->max_vcpus )
> +        return -EINVAL;
> +
> +    for ( i = 0; i < MAX_L3_PAGES; i++ )
> +        if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
> +            return -EINVAL;

This and the check immediately below should be redundant with the
level == 3 check earlier on (or if they aren't redundant, then you
need to fix the code elsewhere).

> +
> +    for_each_vcpu ( d, v )
> +        if ( v->evtchn_pending_sel_l2 )
> +            return -EINVAL;
> +
> +    if ( copy_from_user(mfns, r->l2sel_mfn,
> +                        sizeof(unsigned long)*r->nr_vcpus) )

copy_from_guest()!

> +        return -EFAULT;
> +
> +    if ( copy_from_user(offsets, r->l2sel_offset,
> +                        sizeof(unsigned long)*r->nr_vcpus) )
> +        return -EFAULT;
> +
> +    /* put cpu offline */
> +    for_each_vcpu ( d, v )
> +    {
> +        if ( v == current )
> +            was_up[v->vcpu_id] = 1;
> +        else
> +            was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
> +                                                   &v->pause_flags);

This does not in any way halt that remote vCPU. But anyway - is it
really that useful to allow establishing the 3rd level with all
vCPU-s already up and event processing already happening for the
subject domain? I.e. wouldn't it suffice to require the guest to do
the setup _before_ setting up the first event channel (which would
imply on vCPU 0 to be active)?

> @@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
>     spin_unlock(&d->event_lock);
> }
> 
> -

???

> static void domain_dump_evtchn_info(struct domain *d)
> {
>     unsigned int port;
> @@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
>         if ( chn->state == ECS_FREE )
>             continue;
> 
> -        printk("    %4u [%d/%d]: s=%d n=%d x=%d",
> -               port,
> -               !!test_bit(port, &shared_info(d, evtchn_pending)),
> -               !!test_bit(port, &shared_info(d, evtchn_mask)),
> -               chn->state, chn->notify_vcpu_id, chn->xen_consumer);
> +        printk("    %4u", port);
> +
> +        switch ( d->evtchn_level )
> +        {
> +        case 2:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(port, &shared_info(d, evtchn_pending)),
> +                   !!test_bit(port, &shared_info(d, evtchn_mask)));
> +            break;
> +        case 3:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(offset, d->evtchn_pending[page_no]),
> +                   !!test_bit(offset, d->evtchn_mask[page_no]));

Can't you, btw, set [0] of these two to the &shared_info() values,
and thus fold code here and perhaps elsewhere?

> +            break;
> +        default:
> +            printk(" %s: unknown event channel level %d for domain %d \n",
> +                   __FUNCTION__, d->evtchn_level, d->domain_id);
> +            goto out;
> +        }
> +
> +        printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
> +               chn->xen_consumer);

So still all event channels in one go?

> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
>     {
>         for_each_vcpu ( d, v )
>         {
> +            unsigned int bits = BITS_PER_EVTCHN_WORD(d);
> +            for (int i = 2; i < d->evtchn_level; i++)

Did you check that gcc 4.1.x deals with this non-C89 code fine, with
no warning?

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
>             goto out;
> 
>         rc = 0;
> -        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
> +        if ( evtchn_is_pending(d, port) )

It would likely make the patch better readable if mechanical
adjustments like this got split out into a prerequisite patch, even
if used only in very few places.
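For reference, a rough sketch of what such an evtchn_is_pending()
helper presumably looks like; the 3-level arm is an assumption based
on the per-page arrays introduced above, not a quote from the series:

    static int evtchn_is_pending(struct domain *d, int port)
    {
        switch ( d->evtchn_level )
        {
        case 2:
            return !!test_bit(port, &shared_info(d, evtchn_pending));
        case 3:
            return !!test_bit(port % EVTCHNS_PER_PAGE,
                              d->evtchn_pending[port / EVTCHNS_PER_PAGE]);
        default:
            BUG();
        }
        return 0;  /* unreachable */
    }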
> --- a/xen/include/public/event_channel.h
> +++ b/xen/include/public/event_channel.h
> @@ -71,6 +71,7 @@
> #define EVTCHNOP_bind_vcpu        8
> #define EVTCHNOP_unmask           9
> #define EVTCHNOP_reset           10
> +#define EVTCHNOP_register_nlevel 11
> /* ` } */
> 
> typedef uint32_t evtchn_port_t;
> @@ -258,6 +259,29 @@ struct evtchn_reset {
> typedef struct evtchn_reset evtchn_reset_t;
> 
> /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + *  1. currently only 3-level is supported.
> + *  2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8

Without explanation, no-one will easily understand whether this
number is made up or the result of some calculation. Also, with it
having "MAX" in its name, I'd conclude there can be fewer than 8
pages passed by the guest, yet the interface lacks any indication of
how many pages there are being passed.

> +struct evtchn_register_3level {
> +    unsigned long evtchn_pending[MAX_L3_PAGES];
> +    unsigned long evtchn_mask[MAX_L3_PAGES];
> +    unsigned long *l2sel_mfn;
> +    unsigned long *l2sel_offset;
> +    uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> +    uint32_t level;
> +    union {
> +        struct evtchn_register_3level l3;
> +    } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
> +
> +/*
>  * ` enum neg_errnoval
>  * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
>  * `
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
> 
> /*
>  * Event channel endpoints per domain:
> + * 2-level:
>  *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + *  32k if a long is 32 bits; 256k if a long is 64 bits;
>  */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \

This is not a valid thing to do in a public header: You can't replace
an object-like macro such as NR_EVENT_CHANNELS with a function-like
alternative; you need to retain the old definition for consumers
unaware of the new extension.

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -64,7 +64,8 @@ extern struct domain *dom0;
>     __v;})
> 
> #define EVTCHNS_PER_BUCKET 512
> -#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
> +#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
> +#define EVTCHNS_PER_PAGE   (PAGE_SIZE * 8)

So is this 8 the same as the one above? If so, why don't you use the
#define? If not, where does this 8 come from?

> @@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
> 
> struct waitqueue_vcpu;
> 
> -struct vcpu
> +struct vcpu

???

> @@ -275,6 +279,10 @@ struct domain
>     struct evtchn **evtchn;
>     spinlock_t event_lock;
>     unsigned int evtchn_level;
> +#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned long)/PAGE_SIZE)

Without the use of BITS_TO_LONGS() and PFN_UP() it is not clear that
you don't cut off any non-zero bits here. Hence you ought to either
use those macros, or put a validating BUILD_BUG_ON() somewhere.

Jan
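Two sketches of the alternatives named at the end of the review,
assuming the definitions above (the numbers do work out: 256k bits
make 32k bytes, i.e. exactly 8 pages on a 64-bit build):

    /* Either make the rounding explicit with the existing helpers
     * (both round up, so no non-zero bits can be cut off): */
    #define L3_PAGES PFN_UP(BITS_TO_LONGS(NR_EVENT_CHANNELS_L3) * \
                            sizeof(unsigned long))

    /* ... or keep the open-coded divides and prove they are exact,
     * e.g. from evtchn_init(): */
    BUILD_BUG_ON(NR_EVENT_CHANNELS_L3 % (8 * PAGE_SIZE));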
Jan Beulich
2013-Jan-03 11:39 UTC
Re: [RFC PATCH 2/3] Dynamically allocate domain->evtchn, also bump EVTCHNS_PER_BUCKET to 512.
>>> On 03.01.13 at 12:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> On Thu, 2013-01-03 at 10:36 +0000, Jan Beulich wrote:
>> >>> On 02.01.13 at 15:27, Wei Liu <Wei.Liu2@citrix.com> wrote:
>> > On Wed, 2013-01-02 at 13:38 +0000, David Vrabel wrote:
>> >> On 31/12/12 18:22, Wei Liu wrote:
>> >> > From: Wei Liu <liuw@liuw.name>
>> >>
>> >> The changeset description needs to say why you're increasing
>> >> EVTCHNS_PER_BUCKET. I can't tell why.
>> >>
>> >
>> > Here is the tedious maths. My thought was that it is not very
>> > interesting to see this in a change log, so I dropped it. But I will add
>> > this in my later re-post of this series...
>> >
>> > #define EVTCHNS_PER_BUCKET ???
>> > #define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS_L3/EVTCHNS_PER_BUCKET)
>> > d->evtchn = xzalloc_array(struct evtchn *, NR_EVTCHN_BUCKETS);
>> >
>> > We need to allow for 3-level evtchn, so use NR_EVENT_CHANNELS_L3 to
>> > calculate NR_EVTCHN_BUCKETS.
>> >
>> > For a 64-bit build, NR_EVENT_CHANNELS_L3 is 256k. The original value of
>> > EVTCHNS_PER_BUCKET is 128, which means NR_EVTCHN_BUCKETS=2048, thus
>> > d->evtchn has size of 2048*8 = 16KB = 4 pages. Given that only Dom0 or
>> > a driver domain will need 3-level event channels, this is really
>> > overkill for most guests.
>> >
>> > If we bump EVTCHNS_PER_BUCKET to 512, d->evtchn becomes 512 * 8 = 4KB,
>> > i.e. 1 page, which I think is more space efficient for most guests.
>>
>> But not suitable for allocation through xmalloc() (as it'll end up
>> doing an order-1 page allocation). Either special-case it here, or
>> BUILD_BUG_ON() the effective size being exactly a page, or we
>> may look into finally making xmalloc() return a page with no
>> other tracking data when asked for exactly a page's worth of
>> bytes.
>>
>
> Is alloc_xen_heap_page suitable? But I presume there is always some
> tracking structure somewhere, so I didn't even take that into account
> when I did the calculation.

Yes, alloc_xenheap_page() is the right thing to use for exact page
size allocations. There's no tracking structure associated with that
(other than struct page_info, which exists regardless of that
allocation).

Jan
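Putting that reply into code, a sketch of the exact-page variant of
evtchn_init() (the BUILD_BUG_ON placement is illustrative):

    /* The bucket array is exactly one page by construction; make that
     * assumption explicit, then use the page allocator directly. */
    BUILD_BUG_ON(NR_EVTCHN_BUCKETS * sizeof(struct evtchn *) != PAGE_SIZE);

    d->evtchn = alloc_xenheap_page();
    if ( !d->evtchn )
        return -ENOMEM;
    clear_page(d->evtchn);

    /* ... and the matching teardown in evtchn_destroy(): */
    free_xenheap_page(d->evtchn);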
Wei Liu
2013-Jan-08 17:33 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >
> > +static void __unmap_l2_sel(struct vcpu *v)
> > +{
> > +    unsigned long mfn;
> > +
> > +    if ( v->evtchn_pending_sel_l2 != 0 )
> > +    {
> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>
> virt_to_mfn() is not valid on the output of
> map_domain_page{,_global}() (same further down). Yes, there is
> at least one example of this in the existing code, but that's wrong
> too, and is getting eliminated by the 16Tb patch series I'm in the
> process of putting together.
>

So what's the correct way to do this? Do I need to wait for your patch
series?


Wei.
Jan Beulich
2013-Jan-09 08:38 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >
>> > +static void __unmap_l2_sel(struct vcpu *v)
>> > +{
>> > +    unsigned long mfn;
>> > +
>> > +    if ( v->evtchn_pending_sel_l2 != 0 )
>> > +    {
>> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>>
>> virt_to_mfn() is not valid on the output of
>> map_domain_page{,_global}() (same further down). Yes, there is
>> at least one example of this in the existing code, but that's wrong
>> too, and is getting eliminated by the 16Tb patch series I'm in the
>> process of putting together.
>>
>
> So what's the correct way to do this? Do I need to wait for your patch
> series?

Considering that the old 32-bit case of map_domain_page() doesn't
matter anymore, using domain_page_map_to_mfn() here would be the way
to go (while the 32-bit version of it didn't allow
map_domain_page_global() to be handled through it, my 64-bit
implementation of it will).

And of course there's the obvious alternative of storing the MFN
rather than re-obtaining it in the teardown path.

Jan
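A sketch of the second alternative (storing the MFN), which needs no
new architecture support; the l2sel_mfn field name is made up for
illustration:

    /* In __map_l2_sel(), record the MFN while the page is in hand: */
    v->evtchn_pending_sel_l2 = mapping + off;
    v->l2sel_mfn = page_to_mfn(page);

    /* In __unmap_l2_sel(), no reverse lookup is then needed: */
    if ( v->evtchn_pending_sel_l2 )
    {
        unmap_domain_page_global(v->evtchn_pending_sel_l2);
        put_page_and_type(mfn_to_page(v->l2sel_mfn));
        v->evtchn_pending_sel_l2 = NULL;
    }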
Ian Campbell
2013-Jan-09 10:56 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >> >
> >> > +static void __unmap_l2_sel(struct vcpu *v)
> >> > +{
> >> > +    unsigned long mfn;
> >> > +
> >> > +    if ( v->evtchn_pending_sel_l2 != 0 )
> >> > +    {
> >> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
> >> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
> >>
> >> virt_to_mfn() is not valid on the output of
> >> map_domain_page{,_global}() (same further down). Yes, there is
> >> at least one example of this in the existing code, but that's wrong
> >> too, and is getting eliminated by the 16Tb patch series I'm in the
> >> process of putting together.
> >>
> >
> > So what's the correct way to do this? Do I need to wait for your patch
> > series?
>
> Considering that the old 32-bit case of map_domain_page()
> doesn't matter anymore,

We still care about it on 32-bit ARM FWIW. (I'm not sure if that
invalidates your advice below though.)

> using domain_page_map_to_mfn()
> here would be the way to go (while the 32-bit version of it
> didn't allow map_domain_page_global() to be handled through
> it, my 64-bit implementation of it will).
>
> And of course there's the obvious alternative of storing the
> MFN rather than re-obtaining it in the teardown path.
>
> Jan
Jan Beulich
2013-Jan-09 11:24 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
>> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
>> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >> >
>> >> > +static void __unmap_l2_sel(struct vcpu *v)
>> >> > +{
>> >> > +    unsigned long mfn;
>> >> > +
>> >> > +    if ( v->evtchn_pending_sel_l2 != 0 )
>> >> > +    {
>> >> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> >> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>> >>
>> >> virt_to_mfn() is not valid on the output of
>> >> map_domain_page{,_global}() (same further down). Yes, there is
>> >> at least one example of this in the existing code, but that's wrong
>> >> too, and is getting eliminated by the 16Tb patch series I'm in the
>> >> process of putting together.
>> >>
>> >
>> > So what's the correct way to do this? Do I need to wait for your patch
>> > series?
>>
>> Considering that the old 32-bit case of map_domain_page()
>> doesn't matter anymore,
>
> We still care about it on 32-bit ARM FWIW. (I'm not sure if that
> invalidates your advice below though.)

Right - which would require ARM to implement
domain_page_map_to_mfn() (so far used only in x86 and tmem code).

>> using domain_page_map_to_mfn()
>> here would be the way to go (while the 32-bit version of it
>> didn't allow map_domain_page_global() to be handled through
>> it, my 64-bit implementation of it will).
>>
>> And of course there's the obvious alternative of storing the
>> MFN rather than re-obtaining it in the teardown path.

This second option of course would always be valid, without
architecture-specific changes.

Jan
Ian Campbell
2013-Jan-09 11:31 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Wed, 2013-01-09 at 11:24 +0000, Jan Beulich wrote:
> >>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> > On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
> >> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> >> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >> >> >
> >> >> > +static void __unmap_l2_sel(struct vcpu *v)
> >> >> > +{
> >> >> > +    unsigned long mfn;
> >> >> > +
> >> >> > +    if ( v->evtchn_pending_sel_l2 != 0 )
> >> >> > +    {
> >> >> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
> >> >> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
> >> >>
> >> >> virt_to_mfn() is not valid on the output of
> >> >> map_domain_page{,_global}() (same further down). Yes, there is
> >> >> at least one example of this in the existing code, but that's wrong
> >> >> too, and is getting eliminated by the 16Tb patch series I'm in the
> >> >> process of putting together.
> >> >>
> >> >
> >> > So what's the correct way to do this? Do I need to wait for your patch
> >> > series?
> >>
> >> Considering that the old 32-bit case of map_domain_page()
> >> doesn't matter anymore,
> >
> > We still care about it on 32-bit ARM FWIW. (I'm not sure if that
> > invalidates your advice below though.)
>
> Right - which would require ARM to implement
> domain_page_map_to_mfn() (so far used only in x86 and tmem
> code).

I only see it in x86 (specifically hvm_unmap_guest_frame) but I guess
this is a useful interface for ARM to implement in any case and it
doesn't look to be terribly hard.

> >> using domain_page_map_to_mfn()
> >> here would be the way to go (while the 32-bit version of it
> >> didn't allow map_domain_page_global() to be handled through
> >> it, my 64-bit implementation of it will).
> >>
> >> And of course there's the obvious alternative of storing the
> >> MFN rather than re-obtaining it in the teardown path.
>
> This second option of course would always be valid, without
> architecture-specific changes.

Right.

Ian.
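And a sketch of the first alternative, once domain_page_map_to_mfn()
handles global mappings (and ARM grows an implementation); note the
MFN has to be looked up before the mapping is torn down:

    static void __unmap_l2_sel(struct vcpu *v)
    {
        if ( v->evtchn_pending_sel_l2 )
        {
            unsigned long mfn =
                domain_page_map_to_mfn(v->evtchn_pending_sel_l2);

            unmap_domain_page_global(v->evtchn_pending_sel_l2);
            put_page_and_type(mfn_to_page(mfn));
            v->evtchn_pending_sel_l2 = NULL;
        }
    }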
Jan Beulich
2013-Jan-09 11:41 UTC
Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 09.01.13 at 12:31, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2013-01-09 at 11:24 +0000, Jan Beulich wrote:
>> >>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
>> > On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
>> >> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
>> >> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >> >> >
>> >> >> > +static void __unmap_l2_sel(struct vcpu *v)
>> >> >> > +{
>> >> >> > +    unsigned long mfn;
>> >> >> > +
>> >> >> > +    if ( v->evtchn_pending_sel_l2 != 0 )
>> >> >> > +    {
>> >> >> > +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> >> >> > +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>> >> >>
>> >> >> virt_to_mfn() is not valid on the output of
>> >> >> map_domain_page{,_global}() (same further down). Yes, there is
>> >> >> at least one example of this in the existing code, but that's wrong
>> >> >> too, and is getting eliminated by the 16Tb patch series I'm in the
>> >> >> process of putting together.
>> >> >>
>> >> >
>> >> > So what's the correct way to do this? Do I need to wait for your patch
>> >> > series?
>> >>
>> >> Considering that the old 32-bit case of map_domain_page()
>> >> doesn't matter anymore,
>> >
>> > We still care about it on 32-bit ARM FWIW. (I'm not sure if that
>> > invalidates your advice below though.)
>>
>> Right - which would require ARM to implement
>> domain_page_map_to_mfn() (so far used only in x86 and tmem
>> code).
>
> I only see it in x86 (specifically hvm_unmap_guest_frame) but I guess
> this is a useful interface for ARM to implement in any case and it
> doesn't look to be terribly hard.

Oh, the tmem one is from my 16Tb series that I'm hoping to post soon.

Jan