Daniel Kiper
2011-Feb-03 16:30 UTC
[Xen-devel] [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
Features and fixes: - new version of memory hotplug patch which supports among others memory allocation policies during errors (try until success or stop at first error), - this version of patch was tested with tmem (selfballooning and frontswap) and works very well with it, - some other minor fixes. Signed-off-by: Daniel Kiper <dkiper@net-space.pl> --- drivers/xen/Kconfig | 10 ++ drivers/xen/balloon.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 230 insertions(+), 11 deletions(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 07bec09..8f880aa 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -9,6 +9,16 @@ config XEN_BALLOON the system to expand the domain''s memory allocation, or alternatively return unneeded memory to the system. +config XEN_BALLOON_MEMORY_HOTPLUG + bool "Memory hotplug support for Xen balloon driver" + default n + depends on XEN_BALLOON && MEMORY_HOTPLUG + help + Memory hotplug support for Xen balloon driver allows expanding memory + available for the system above limit declared at system startup. + It is very useful on critical systems which require long + run without rebooting. + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index b1e199c..e43e928 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -6,6 +6,12 @@ * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * + * Memory hotplug support was written by Daniel Kiper. Work on + * it was sponsored by Google under Google Summer of Code 2010 + * program. Jeremy Fitzhardinge from Xen.org was the mentor for + * this project. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 @@ -44,6 +50,7 @@ #include <linux/list.h> #include <linux/sysdev.h> #include <linux/gfp.h> +#include <linux/memory.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -80,6 +87,9 @@ enum bp_state { BP_HUNGRY }; +#define MH_POLICY_TRY_UNTIL_SUCCESS 0 +#define MH_POLICY_STOP_AT_FIRST_ERROR 1 + struct balloon_stats { /* We aim for ''current allocation'' == ''target allocation''. */ unsigned long current_pages; @@ -89,6 +99,10 @@ struct balloon_stats { unsigned long balloon_high; unsigned long schedule_delay; unsigned long max_schedule_delay; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + unsigned long boot_max_pfn; + unsigned long mh_policy; +#endif }; static DEFINE_MUTEX(balloon_mutex); @@ -206,18 +220,199 @@ static void update_schedule_delay(enum bp_state state) balloon_stats.schedule_delay = new_schedule_delay; } -static unsigned long current_target(void) +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static struct resource *allocate_memory_resource(unsigned long nr_pages) +{ + resource_size_t r_min, r_size; + struct resource *r; + + /* + * Look for first unused memory region starting at page + * boundary. Skip last memory section created at boot time + * because it may contains unused memory pages with PG_reserved + * bit not set (online_pages() require PG_reserved bit set). 
+ */ + + r = kzalloc(sizeof(struct resource), GFP_KERNEL); + + if (!r) + return NULL; + + r->name = "System RAM"; + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); + r_size = nr_pages << PAGE_SHIFT; + + if (allocate_resource(&iomem_resource, r, r_size, r_min, + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { + kfree(r); + return NULL; + } + + return r; +} + +static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages) +{ + int rc; + + if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) { + rc = release_resource(r); + BUG_ON(rc < 0); + kfree(r); + return NULL; + } + + rc = adjust_resource(r, r->start, r->end + 1 - r->start - + (nr_pages << PAGE_SHIFT)); + + BUG_ON(rc < 0); + + return r; +} + +static int allocate_additional_memory(struct resource *r, unsigned long nr_pages) +{ + int rc; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long flags, i, pfn, pfn_start; + + if (!nr_pages) + return 0; + + pfn_start = PFN_UP(r->end) - nr_pages; + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn) + frame_list[i] = pfn; + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + + if (rc <= 0) + return (rc < 0) ? 
rc : -ENOMEM; + + for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) { + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && + phys_to_machine_mapping_valid(pfn)); + set_phys_to_machine(pfn, frame_list[i]); + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return rc; +} + +static void hotplug_allocated_memory(struct resource *r) { - unsigned long target = balloon_stats.target_pages; + int nid, rc; + resource_size_t r_size; + struct memory_block *mem; + unsigned long pfn; + + r_size = r->end + 1 - r->start; + nid = memory_add_physaddr_to_nid(r->start); + + rc = add_registered_memory(nid, r->start, r_size); + + if (rc) { + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", + __func__, rc); + balloon_stats.target_pages = balloon_stats.current_pages; + return; + } + + if (xen_pv_domain()) + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn) + if (!PageHighMem(pfn_to_page(pfn))) { + rc = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0); + BUG_ON(rc); + } - target = min(target, - balloon_stats.current_pages + - balloon_stats.balloon_low + - balloon_stats.balloon_high); + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT); - return target; + if (rc) { + pr_err("%s: online_pages: Failed: %i\n", __func__, rc); + balloon_stats.target_pages = balloon_stats.current_pages; + return; + } + + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) { + mem = find_memory_block(__pfn_to_section(pfn)); + BUG_ON(!mem); + BUG_ON(!present_section_nr(mem->phys_index)); + mutex_lock(&mem->state_mutex); + mem->state = MEM_ONLINE; + mutex_unlock(&mem->state_mutex); + } + + balloon_stats.current_pages += r_size >> PAGE_SHIFT; } +static enum bp_state request_additional_memory(long credit) +{ + int rc; + static struct resource *r; + static unsigned long pages_left; + + if ((credit <= 0 || balloon_stats.balloon_low || + balloon_stats.balloon_high) && !r) + 
return BP_DONE; + + if (!r) { + r = allocate_memory_resource(credit); + + if (!r) + return BP_ERROR; + + pages_left = credit; + } + + rc = allocate_additional_memory(r, pages_left); + + if (rc < 0) { + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS) + return BP_ERROR; + + r = adjust_memory_resource(r, pages_left); + + if (!r) + return BP_ERROR; + } else { + pages_left -= rc; + + if (pages_left) + return BP_HUNGRY; + } + + hotplug_allocated_memory(r); + + r = NULL; + + return BP_DONE; +} +#else +static enum bp_state request_additional_memory(long credit) +{ + if (balloon_stats.balloon_low && balloon_stats.balloon_high && + balloon_stats.target_pages > balloon_stats.current_pages) + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_DONE; +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ + static enum bp_state increase_reservation(unsigned long nr_pages) { enum bp_state state = BP_DONE; @@ -352,15 +547,17 @@ static enum bp_state decrease_reservation(unsigned long nr_pages) */ static void balloon_process(struct work_struct *work) { - enum bp_state rc, state = BP_DONE; + enum bp_state rc, state; long credit; mutex_lock(&balloon_mutex); do { - credit = current_target() - balloon_stats.current_pages; + credit = balloon_stats.target_pages - balloon_stats.current_pages; - if (credit > 0) { + state = request_additional_memory(credit); + + if (credit > 0 && state == BP_DONE) { rc = increase_reservation(credit); state = (rc == BP_ERROR) ? 
BP_ERROR : state; } @@ -450,6 +647,11 @@ static int __init balloon_init(void) balloon_stats.schedule_delay = 1; balloon_stats.max_schedule_delay = 32; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.boot_max_pfn = max_pfn; + balloon_stats.mh_policy = MH_POLICY_STOP_AT_FIRST_ERROR; +#endif + register_balloon(&balloon_sysdev); /* @@ -506,6 +708,10 @@ BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); static SYSDEV_ULONG_ATTR(schedule_delay, 0644, balloon_stats.schedule_delay); static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static SYSDEV_ULONG_ATTR(memory_hotplug_policy, 0644, balloon_stats.mh_policy); +#endif + static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { @@ -568,7 +774,10 @@ static struct sysdev_attribute *balloon_attrs[] = { &attr_target_kb, &attr_target, &attr_schedule_delay.attr, - &attr_max_schedule_delay.attr + &attr_max_schedule_delay.attr, +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + &attr_memory_hotplug_policy.attr +#endif }; static struct attribute *balloon_info_attrs[] = { -- 1.5.6.5 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave Hansen
2011-Feb-03 18:12 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote:> +static struct resource *allocate_memory_resource(unsigned long nr_pages) > +{ > + resource_size_t r_min, r_size; > + struct resource *r; > + > + /* > + * Look for first unused memory region starting at page > + * boundary. Skip last memory section created at boot time > + * because it may contains unused memory pages with PG_reserved > + * bit not set (online_pages() require PG_reserved bit set). > + */Could you elaborate on this comment a bit? I think it''s covering both the "PAGE_SIZE" argument to allocate_resource() and something else, but I''m not quite sure.> + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > + > + if (!r) > + return NULL; > + > + r->name = "System RAM"; > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1));Did you do this for alignment reasons? It might be a better idea to just make a nice sparsemem function to do alignment.> + r_size = nr_pages << PAGE_SHIFT; > + > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > + kfree(r); > + return NULL; > + } > + > + return r; > +}This function should probably be made generic. I bet some more hypervisors come around and want to use this. They generally won''t care where the memory goes, and the kernel can allocate a spot for them. 
...> +static void hotplug_allocated_memory(struct resource *r) > { > - unsigned long target = balloon_stats.target_pages; > + int nid, rc; > + resource_size_t r_size; > + struct memory_block *mem; > + unsigned long pfn; > + > + r_size = r->end + 1 - r->start; > + nid = memory_add_physaddr_to_nid(r->start); > + > + rc = add_registered_memory(nid, r->start, r_size); > + > + if (rc) { > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", > + __func__, rc); > + balloon_stats.target_pages = balloon_stats.current_pages; > + return; > + } > + > + if (xen_pv_domain()) > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn) > + if (!PageHighMem(pfn_to_page(pfn))) { > + rc = HYPERVISOR_update_va_mapping( > + (unsigned long)__va(pfn << PAGE_SHIFT), > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0); > + BUG_ON(rc); > + } > > - target = min(target, > - balloon_stats.current_pages + > - balloon_stats.balloon_low + > - balloon_stats.balloon_high); > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);The memory hotplug code is a bit weird at first glance. It has two distinct stages: first, add_memory() is called from architecture-specific code. Later, online_pages() is called, but that part is driven by userspace. For all the other hotplug users online_pages() is done later, and triggered by userspace. The idea is that you could have memory sitting in "added, but offline" state which would be trivial to remove if someone else needed it, but also trivial to add without the need to allocate additional memory. In other words, I think you can take everything from and including online_pages() down in the function and take it out. 
Use a udev hotplug rule to online it immediately if that''s what you want.> - return target; > + if (rc) { > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc); > + balloon_stats.target_pages = balloon_stats.current_pages; > + return; > + } > + > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) { > + mem = find_memory_block(__pfn_to_section(pfn)); > + BUG_ON(!mem); > + BUG_ON(!present_section_nr(mem->phys_index)); > + mutex_lock(&mem->state_mutex); > + mem->state = MEM_ONLINE; > + mutex_unlock(&mem->state_mutex); > + } > + > + balloon_stats.current_pages += r_size >> PAGE_SHIFT; > } > > +static enum bp_state request_additional_memory(long credit) > +{ > + int rc; > + static struct resource *r; > + static unsigned long pages_left; > + > + if ((credit <= 0 || balloon_stats.balloon_low || > + balloon_stats.balloon_high) && !r) > + return BP_DONE; > + > + if (!r) { > + r = allocate_memory_resource(credit); > + > + if (!r) > + return BP_ERROR; > + > + pages_left = credit; > + }''r'' is effectively a global variable here. Could you give it a more proper name? Maybe "last add location" or something. It might even make sense to move it up in to the global scope to make it _much_ more clear that it''s not just locally used.> + rc = allocate_additional_memory(r, pages_left); > + > + if (rc < 0) { > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS) > + return BP_ERROR; > + > + r = adjust_memory_resource(r, pages_left); > + > + if (!r) > + return BP_ERROR; > + } else { > + pages_left -= rc; > + > + if (pages_left) > + return BP_HUNGRY; > + } > + > + hotplug_allocated_memory(r); > + > + r = NULL; > + > + return BP_DONE; > +}Could you explain a bit about why you chose to do this stuff with memory resources? Is that the only visibility that you have in to what memory the guest actually has? What troubles did you run in to when you did add_memory(0, balloon_stats.boot_max_pfn, credit); ? 
It's just that all the other memory hotplug users are _told_ by the hardware where to put things. Is that not the case here? -- Dave _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel Kiper
2011-Feb-07 14:12 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote:> On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote: > > +static struct resource *allocate_memory_resource(unsigned long nr_pages) > > +{ > > + resource_size_t r_min, r_size; > > + struct resource *r; > > + > > + /* > > + * Look for first unused memory region starting at page > > + * boundary. Skip last memory section created at boot time > > + * because it may contains unused memory pages with PG_reserved > > + * bit not set (online_pages() require PG_reserved bit set). > > + */ > > Could you elaborate on this comment a bit? I think it''s covering both > the "PAGE_SIZE" argument to allocate_resource() and something else, but > I''m not quite sure.Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource(). Additionally, r_min (calculated below) sets lower limit at which hoplugged memory could be installed (due to PG_reserved bit requirment set up by online_pages()). Later r_min is put as an argument to allocate_resource() call.> > + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > > + > > + if (!r) > > + return NULL; > > + > > + r->name = "System RAM"; > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); > > Did you do this for alignment reasons? It might be a better idea to > just make a nice sparsemem function to do alignment.Please look above.> > + r_size = nr_pages << PAGE_SHIFT; > > + > > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > > + kfree(r); > > + return NULL; > > + } > > + > > + return r; > > +} > > This function should probably be made generic. I bet some more > hypervisors come around and want to use this. They generally won''t care > where the memory goes, and the kernel can allocate a spot for them.Yes, you are right. 
I think about new project in which this function will be generic and then I would move it to some more generic place. Now, I think it should stay here.> ... > > +static void hotplug_allocated_memory(struct resource *r) > > { > > - unsigned long target = balloon_stats.target_pages; > > + int nid, rc; > > + resource_size_t r_size; > > + struct memory_block *mem; > > + unsigned long pfn; > > + > > + r_size = r->end + 1 - r->start; > > + nid = memory_add_physaddr_to_nid(r->start); > > + > > + rc = add_registered_memory(nid, r->start, r_size); > > + > > + if (rc) { > > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", > > + __func__, rc); > > + balloon_stats.target_pages = balloon_stats.current_pages; > > + return; > > + } > > + > > + if (xen_pv_domain()) > > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn) > > + if (!PageHighMem(pfn_to_page(pfn))) { > > + rc = HYPERVISOR_update_va_mapping( > > + (unsigned long)__va(pfn << PAGE_SHIFT), > > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0); > > + BUG_ON(rc); > > + } > > > > - target = min(target, > > - balloon_stats.current_pages + > > - balloon_stats.balloon_low + > > - balloon_stats.balloon_high); > > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT); > > The memory hotplug code is a bit weird at first glance. It has two > distinct stages: first, add_memory() is called from > architecture-specific code. Later, online_pages() is called, but that > part is driven by userspace. > > For all the other hotplug users online_pages() is done later, and > triggered by userspace. The idea is that you could have memory sitting > in "added, but offline" state which would be trivial to remove if > someone else needed it, but also trivial to add without the need to > allocate additional memory. > > In other words, I think you can take everything from and including > online_pages() down in the function and take it out. 
Use a udev hotplug > rule to online it immediately if that''s what you want.I agree. I discussed a bit about this problem with Jeremy, too. However, there are some problems to implement that solution now. First of all it is possible to online hotplugged memory using sysfs interface only in chunks called sections. It means that it is not possible online once again section which was onlined ealier partialy populated and now it contains new pages to online. In this situation sysfs interface emits Invalid argument error. In theory it should be possible to offline and then online whole section once again, however, if memory from this section was used is not possible to do that. It means that those properties does not allow hotplug memory in guest in finer granulity than section and sysfs interface is too inflexible to be used in that solution. That is why I decided to online hoplugged memory using API which does not have those limitations. I think that two solutions are possible (in order of prefernce) to cope with that problem and move onlining to user domain: - migrate to new sysfs interface for onlining which allows address space description when onlining memory, - create new onlining sysfs inteface for Xen only (I think that first solution is much better because it will be generic). 
However, those solution require time to implementation and that is why, I think, that my solution (let''s call it workaround) should be implemented till new sysfs interface will be available.> > - return target; > > + if (rc) { > > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc); > > + balloon_stats.target_pages = balloon_stats.current_pages; > > + return; > > + } > > + > > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) { > > + mem = find_memory_block(__pfn_to_section(pfn)); > > + BUG_ON(!mem); > > + BUG_ON(!present_section_nr(mem->phys_index)); > > + mutex_lock(&mem->state_mutex); > > + mem->state = MEM_ONLINE; > > + mutex_unlock(&mem->state_mutex); > > + } > > + > > + balloon_stats.current_pages += r_size >> PAGE_SHIFT; > > } > > > > +static enum bp_state request_additional_memory(long credit) > > +{ > > + int rc; > > + static struct resource *r; > > + static unsigned long pages_left; > > + > > + if ((credit <= 0 || balloon_stats.balloon_low || > > + balloon_stats.balloon_high) && !r) > > + return BP_DONE; > > + > > + if (!r) { > > + r = allocate_memory_resource(credit); > > + > > + if (!r) > > + return BP_ERROR; > > + > > + pages_left = credit; > > + } > > ''r'' is effectively a global variable here. Could you give it a more > proper name? Maybe "last add location" or something. It might even > make sense to move it up in to the global scope to make it _much_ more > clear that it''s not just locally used.This is used between subsequent calls (memory is allocated in chunks and onlined when all requested memory is allocated) to store address space for allocation. That is why it is static. It is used only in this function and if it is required then pointer to this variable is passed to called functions. I think that it should stay local to this function and do not pollute global variable namespace. 
However, I will rename it to something more meaningful.> > + rc = allocate_additional_memory(r, pages_left); > > + > > + if (rc < 0) { > > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS) > > + return BP_ERROR; > > + > > + r = adjust_memory_resource(r, pages_left); > > + > > + if (!r) > > + return BP_ERROR; > > + } else { > > + pages_left -= rc; > > + > > + if (pages_left) > > + return BP_HUNGRY; > > + } > > + > > + hotplug_allocated_memory(r); > > + > > + r = NULL; > > + > > + return BP_DONE; > > +} > > Could you explain a bit about why you chose to do this stuff with memory > resources? Is that the only visibility that you have in to what memory > the guest actually has?That depends on balloon_stats.mh_policy. If it is MH_POLICY_TRY_UNTIL_SUCCESS and memory allocation error appears then *r is not changed and attempts are made to allocate more memory. If it is MH_POLICY_STOP_AT_FIRST_ERROR and memory allocation error appears then *r is alligned to currently successfully allocated memory and memory is onlined (if size != 0).> What troubles did you run in to when you did > > add_memory(0, balloon_stats.boot_max_pfn, credit); > > ? > > It''s just that all the other memory hotplug users are _told_ by the > hardware where to put things. Is that not the case here?No. On bare metal BIOS/firmware cares about proper address space and when everything goes OK then notify operating system about memory hotplug. In Xen (and probably in others platforms will be) it is not a case and memory should be allocated ealier by guest. However, before doing that, I think, address space should be reserved (it is small chance that something will request overlaping address space, however, I think it is better to do that than sorry). Additionally, IIRC, add_memory() requires that underlying memory is available before its call. 
That is why I decided to create add_registered_memory(), which does all the things that are done by add_memory() excluding the address space reservation, which is done by allocate_memory_resource() before memory allocation. Daniel _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave Hansen
2011-Feb-08 17:22 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote:> On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote: > > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote: > > > +static struct resource *allocate_memory_resource(unsigned long nr_pages) > > > +{ > > > + resource_size_t r_min, r_size; > > > + struct resource *r; > > > + > > > + /* > > > + * Look for first unused memory region starting at page > > > + * boundary. Skip last memory section created at boot time > > > + * because it may contains unused memory pages with PG_reserved > > > + * bit not set (online_pages() require PG_reserved bit set). > > > + */ > > > > Could you elaborate on this comment a bit? I think it''s covering both > > the "PAGE_SIZE" argument to allocate_resource() and something else, but > > I''m not quite sure. > > Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource(). > Additionally, r_min (calculated below) sets lower limit at which hoplugged > memory could be installed (due to PG_reserved bit requirment set up by > online_pages()). Later r_min is put as an argument to allocate_resource() call.OK, and you''ll update the comment on that?> > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > > > + > > > + if (!r) > > > + return NULL; > > > + > > > + r->name = "System RAM"; > > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); > > > > Did you do this for alignment reasons? It might be a better idea to > > just make a nice sparsemem function to do alignment. > > Please look above.You spoke about page alignment up there. Why is this section-aligned? 
Should we make an "align to section" function in generic sparsemem code?> > > + r_size = nr_pages << PAGE_SHIFT; > > > + > > > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > > > + kfree(r); > > > + return NULL; > > > + } > > > + > > > + return r; > > > +} > > > > This function should probably be made generic. I bet some more > > hypervisors come around and want to use this. They generally won''t care > > where the memory goes, and the kernel can allocate a spot for them. > > Yes, you are right. I think about new project in which > this function will be generic and then I would move it to > some more generic place. Now, I think it should stay here.Please move it to generic code. It doesn''t belong in Xen code.> > In other words, I think you can take everything from and including > > online_pages() down in the function and take it out. Use a udev hotplug > > rule to online it immediately if that''s what you want. > > I agree. I discussed a bit about this problem with Jeremy, too. However, > there are some problems to implement that solution now. First of all it is > possible to online hotplugged memory using sysfs interface only in chunks > called sections. It means that it is not possible online once again section > which was onlined ealier partialy populated and now it contains new pages > to online. In this situation sysfs interface emits Invalid argument error. > In theory it should be possible to offline and then online whole section > once again, however, if memory from this section was used is not possible > to do that. It means that those properties does not allow hotplug memory > in guest in finer granulity than section and sysfs interface is too inflexible > to be used in that solution. That is why I decided to online hoplugged memory > using API which does not have those limitations.Sure, you have to _online_ the whole thing at once, but you don''t have to actually make the pages available. 
You also don''t need to hook in to the memory resource code like you''re doing. It''s sufficient to just try and add the memory. If you get -EEXIST, then you can''t add it there, so move up and try again. int xen_balloon_add_memory(u64 size) { unsigned long top_of_mem = max_pfn; top_of_mem = section_align_up(top_of_mem); while (1) { int ret = add_memory(nid, top_of_mem, size); if (ret == -EEXIST) continue; // error handling... break; } return...; } As for telling the hypervisor where you''ve mapped things, that should be done in arch_add_memory(). When it comes down to online_page(), you don''t want your pages freed back in to the buddy allocator, you want them put in to the balloon. So, take the __free_page() in online_page(), and put a Xen hook in there. +void __attribute__((weak)) arch_free_hotplug_page(struct page *page) +{ + __free_page(page); +} void online_page(struct page *page) { unsigned long pfn = page_to_pfn(page); ... - __free_page(page); + arch_free_hotplug_page(page); } Then, have Xen override it: void arch_free_hotplug_page(struct page *page) { if (xen_need_to_inflate_balloon()) put_page_in_balloon(page); else __free_page(page); }> Additionally, IIRC, add_memory() requires > that underlying memory is available before its call.No, that''s not correct. s390''s memory isn''t available until after it calls vmem_add_mapping(). See arch/s390/mm/init.c> That is why I > decided to create add_registered_memory() which do all things which are > done by add_memory() excluding address space reservation which is done > by allocate_memory_resource() before memory allocation.The added memory itself isn''t touched until online_pages() time. That''s yet another reason it was separated out logically in the code. -- Dave _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Feb-10 16:53 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Thu, Feb 03, 2011 at 05:30:33PM +0100, Daniel Kiper wrote:> Features and fixes: > - new version of memory hotplug patch which supports > among others memory allocation policies during errors > (try until success or stop at first error), > - this version of patch was tested with tmem > (selfballooning and frontswap) and works > very well with it, > - some other minor fixes. > > Signed-off-by: Daniel Kiper <dkiper@net-space.pl> > --- > drivers/xen/Kconfig | 10 ++ > drivers/xen/balloon.c | 231 ++++++++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 230 insertions(+), 11 deletions(-) > > diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig > index 07bec09..8f880aa 100644 > --- a/drivers/xen/Kconfig > +++ b/drivers/xen/Kconfig > @@ -9,6 +9,16 @@ config XEN_BALLOON > the system to expand the domain''s memory allocation, or alternatively > return unneeded memory to the system. > > +config XEN_BALLOON_MEMORY_HOTPLUG > + bool "Memory hotplug support for Xen balloon driver" > + default n > + depends on XEN_BALLOON && MEMORY_HOTPLUG > + help > + Memory hotplug support for Xen balloon driver allows expanding memory > + available for the system above limit declared at system startup. > + It is very useful on critical systems which require long > + run without rebooting. > + > config XEN_SCRUB_PAGES > bool "Scrub pages before returning them to system" > depends on XEN_BALLOON > diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c > index b1e199c..e43e928 100644 > --- a/drivers/xen/balloon.c > +++ b/drivers/xen/balloon.c > @@ -6,6 +6,12 @@ > * Copyright (c) 2003, B Dragovic > * Copyright (c) 2003-2004, M Williamson, K Fraser > * Copyright (c) 2005 Dan M. Smith, IBM Corporation > + * Copyright (c) 2010 Daniel Kiper > + * > + * Memory hotplug support was written by Daniel Kiper. Work on > + * it was sponsored by Google under Google Summer of Code 2010 > + * program. Jeremy Fitzhardinge from Xen.org was the mentor for > + * this project. 
> * > * This program is free software; you can redistribute it and/or > * modify it under the terms of the GNU General Public License version 2 > @@ -44,6 +50,7 @@ > #include <linux/list.h> > #include <linux/sysdev.h> > #include <linux/gfp.h> > +#include <linux/memory.h> > > #include <asm/page.h> > #include <asm/pgalloc.h> > @@ -80,6 +87,9 @@ enum bp_state { > BP_HUNGRY > }; > > +#define MH_POLICY_TRY_UNTIL_SUCCESS 0 > +#define MH_POLICY_STOP_AT_FIRST_ERROR 1Cool.> + > struct balloon_stats { > /* We aim for ''current allocation'' == ''target allocation''. */ > unsigned long current_pages; > @@ -89,6 +99,10 @@ struct balloon_stats { > unsigned long balloon_high; > unsigned long schedule_delay; > unsigned long max_schedule_delay; > +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG > + unsigned long boot_max_pfn; > + unsigned long mh_policy; > +#endif > }; > > static DEFINE_MUTEX(balloon_mutex); > @@ -206,18 +220,199 @@ static void update_schedule_delay(enum bp_state state) > balloon_stats.schedule_delay = new_schedule_delay; > } > > -static unsigned long current_target(void) > +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG > +static struct resource *allocate_memory_resource(unsigned long nr_pages) > +{ > + resource_size_t r_min, r_size; > + struct resource *r; > + > + /* > + * Look for first unused memory region starting at page > + * boundary. Skip last memory section created at boot time > + * because it may contains unused memory pages with PG_reserved > + * bit not set (online_pages() require PG_reserved bit set). 
> + */ > + > + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > + > + if (!r) > + return NULL; > + > + r->name = "System RAM"; > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); > + r_size = nr_pages << PAGE_SHIFT; > + > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > + kfree(r); > + return NULL; > + } > + > + return r; > +} > + > +static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages) > +{ > + int rc; > + > + if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) {Will this actually occur? Say I called 'allocate_additional_memory' with 512 and got -ENOMEM (so the hypercall failed completely). The mh->policy is MH_POLICY_STOP_AT_FIRST_ERROR. So we end up here. Assume the r_min is 0x100000000, then r->start is 0x100000000 and r->end is 0x100200000. So: 100200001 - (200000) == 0x100000000 ?> + rc = release_resource(r); > + BUG_ON(rc < 0); > + kfree(r); > + return NULL; > + } > + > + rc = adjust_resource(r, r->start, r->end + 1 - r->start - > + (nr_pages << PAGE_SHIFT));If we wanted 512 pages, and only got 256 and want to adjust the region, we would want it to be: 0x100000000 -> 0x100100000 right?
So with the third argument that comes out to be: 0x100200000 + 1 - 0x100000000 - (100000) = 100001 which is just one page above what we requested?> + > + BUG_ON(rc < 0);Can we just do WARN_ON, and return NULL instead (and also release the resource)?> + > + return r; > +} > + > +static int allocate_additional_memory(struct resource *r, unsigned long nr_pages) > +{ > + int rc; > + struct xen_memory_reservation reservation = { > + .address_bits = 0, > + .extent_order = 0, > + .domid = DOMID_SELF > + }; > + unsigned long flags, i, pfn, pfn_start; > + > + if (!nr_pages) > + return 0; > + > + pfn_start = PFN_UP(r->end) - nr_pages; > + > + if (nr_pages > ARRAY_SIZE(frame_list)) > + nr_pages = ARRAY_SIZE(frame_list); > + > + for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn) > + frame_list[i] = pfn; > + > + set_xen_guest_handle(reservation.extent_start, frame_list); > + reservation.nr_extents = nr_pages; > + > + spin_lock_irqsave(&xen_reservation_lock, flags); > + > + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); > + > + if (rc <= 0) > + return (rc < 0) ? rc : -ENOMEM; > +So if we populated some of them (say we wanted 512, but only did 64), don't we want to do the loop below?
Also you look to be forgetting to do a spin_unlock_irqrestore if you quit here.> + for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) { > + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && > + phys_to_machine_mapping_valid(pfn)); > + set_phys_to_machine(pfn, frame_list[i]); > + } > + > + spin_unlock_irqrestore(&xen_reservation_lock, flags); > + > + return rc; > +} > + > +static void hotplug_allocated_memory(struct resource *r) > { > - unsigned long target = balloon_stats.target_pages; > + int nid, rc; > + resource_size_t r_size; > + struct memory_block *mem; > + unsigned long pfn; > + > + r_size = r->end + 1 - r->start;Why bump it by one byte?> + nid = memory_add_physaddr_to_nid(r->start); > + > + rc = add_registered_memory(nid, r->start, r_size); > + > + if (rc) { > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", > + __func__, rc); > + balloon_stats.target_pages = balloon_stats.current_pages; > + return; > + } > + > + if (xen_pv_domain()) > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)I think you want the r->start to be PFN_UP just in case the r->start is not page aligned. Though I am not sure it would even happen anymore, as M A Young found the culprit that made it possible for us to setup memory regions non-aligned and that is fixed now (in 2.6.38).> + if (!PageHighMem(pfn_to_page(pfn))) { > + rc = HYPERVISOR_update_va_mapping( > + (unsigned long)__va(pfn << PAGE_SHIFT), > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0); > + BUG_ON(rc); > + } > > - target = min(target, > - balloon_stats.current_pages + > - balloon_stats.balloon_low + > - balloon_stats.balloon_high); > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT); > > - return target; > + if (rc) { > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc); > + balloon_stats.target_pages = balloon_stats.current_pages; > + return; > + } > + > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) {Ditto.
Can you do PFN_UP(r->start)?> + mem = find_memory_block(__pfn_to_section(pfn)); > + BUG_ON(!mem); > + BUG_ON(!present_section_nr(mem->phys_index)); > + mutex_lock(&mem->state_mutex); > + mem->state = MEM_ONLINE; > + mutex_unlock(&mem->state_mutex); > + } > + > + balloon_stats.current_pages += r_size >> PAGE_SHIFT; > } > > +static enum bp_state request_additional_memory(long credit) > +{ > + int rc; > + static struct resource *r; > + static unsigned long pages_left; > + > + if ((credit <= 0 || balloon_stats.balloon_low || > + balloon_stats.balloon_high) && !r) > + return BP_DONE; > + > + if (!r) { > + r = allocate_memory_resource(credit); > + > + if (!r) > + return BP_ERROR; > + > + pages_left = credit; > + } > + > + rc = allocate_additional_memory(r, pages_left); > + > + if (rc < 0) { > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS) > + return BP_ERROR; > + > + r = adjust_memory_resource(r, pages_left); > + > + if (!r) > + return BP_ERROR;Say we failed the hypercall completely and got -ENOMEM from the 'allocate_additional_memory'. I presume the adjust_memory_resource at this point would have deleted 'r', which means that (!r) and we return BP_ERROR. But that means that we aren't following the MH_POLICY_STOP_AT_FIRST_ERROR as the balloon_process will retry again and again, and again??> + } else { > + pages_left -= rc; > +So say I request 512 pages (mh_policy is MH_POLICY_STOP_AT_FIRST_ERROR), but only got 256. I adjust the pages_left to be 256 and then> + if (pages_left) > + return BP_HUNGRY;we return BP_HUNGRY. That makes 'balloon_process' retry with 512 pages, and we keep on trying and call "allocate_additional_memory", which fails once more (returns 256), and we end up returning BP_HUNGRY, and retry... and so on.
Would it make sense to have a check here for the MH_POLICY_STOP_AT_FIRST_ERROR and if so call the adjust_memory_resource as well?> + } > + > + hotplug_allocated_memory(r); > + > + r = NULL; > + > + return BP_DONE; > +} > +#else > +static enum bp_state request_additional_memory(long credit) > +{ > + if (balloon_stats.balloon_low && balloon_stats.balloon_high && > + balloon_stats.target_pages > balloon_stats.current_pages) > + balloon_stats.target_pages = balloon_stats.current_pages; > + return BP_DONE; > +} > +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ > + > static enum bp_state increase_reservation(unsigned long nr_pages) > { > enum bp_state state = BP_DONE; > @@ -352,15 +547,17 @@ static enum bp_state decrease_reservation(unsigned long nr_pages) > */ > static void balloon_process(struct work_struct *work) > { > - enum bp_state rc, state = BP_DONE; > + enum bp_state rc, state; > long credit; > > mutex_lock(&balloon_mutex); > > do { > - credit = current_target() - balloon_stats.current_pages; > + credit = balloon_stats.target_pages - balloon_stats.current_pages; > > - if (credit > 0) { > + state = request_additional_memory(credit); > + > + if (credit > 0 && state == BP_DONE) { > rc = increase_reservation(credit); > state = (rc == BP_ERROR) ?
BP_ERROR : state; > } > @@ -450,6 +647,11 @@ static int __init balloon_init(void) > balloon_stats.schedule_delay = 1; > balloon_stats.max_schedule_delay = 32; > > +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG > + balloon_stats.boot_max_pfn = max_pfn; > + balloon_stats.mh_policy = MH_POLICY_STOP_AT_FIRST_ERROR; > +#endif > + > register_balloon(&balloon_sysdev); > > /* > @@ -506,6 +708,10 @@ BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); > static SYSDEV_ULONG_ATTR(schedule_delay, 0644, balloon_stats.schedule_delay); > static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); > > +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG > +static SYSDEV_ULONG_ATTR(memory_hotplug_policy, 0644, balloon_stats.mh_policy); > +#endif > + > static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, > char *buf) > { > @@ -568,7 +774,10 @@ static struct sysdev_attribute *balloon_attrs[] = { > &attr_target_kb, > &attr_target, > &attr_schedule_delay.attr, > - &attr_max_schedule_delay.attr > + &attr_max_schedule_delay.attr, > +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG > + &attr_memory_hotplug_policy.attr > +#endif > }; > > static struct attribute *balloon_info_attrs[] = { > -- > 1.5.6.5 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Feb-10 17:01 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:> On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote: > > On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote: > > > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote: > > > > +static struct resource *allocate_memory_resource(unsigned long nr_pages) > > > > +{ > > > > + resource_size_t r_min, r_size; > > > > + struct resource *r; > > > > + > > > > + /* > > > > + * Look for first unused memory region starting at page > > > > + * boundary. Skip last memory section created at boot time > > > > + * because it may contains unused memory pages with PG_reserved > > > > + * bit not set (online_pages() require PG_reserved bit set). > > > > + */ > > > > > > Could you elaborate on this comment a bit? I think it''s covering both > > > the "PAGE_SIZE" argument to allocate_resource() and something else, but > > > I''m not quite sure. > > > > Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource(). > > Additionally, r_min (calculated below) sets lower limit at which hoplugged > > memory could be installed (due to PG_reserved bit requirment set up by > > online_pages()). Later r_min is put as an argument to allocate_resource() call. > > OK, and you''ll update the comment on that? > > > > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > > > > + > > > > + if (!r) > > > > + return NULL; > > > > + > > > > + r->name = "System RAM"; > > > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > > > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); > > > > > > Did you do this for alignment reasons? It might be a better idea to > > > just make a nice sparsemem function to do alignment. > > > > Please look above. > > You spoke about page alignment up there. Why is this section-aligned? > Should we make an "align to section" function in generic sparsemem code? 
> > > > > + r_size = nr_pages << PAGE_SHIFT; > > > > + > > > > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > > > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > > > > + kfree(r); > > > > + return NULL; > > > > + } > > > > + > > > > + return r; > > > > +} > > > > > > This function should probably be made generic. I bet some more > > > hypervisors come around and want to use this. They generally won''t care > > > where the memory goes, and the kernel can allocate a spot for them. > > > > Yes, you are right. I think about new project in which > > this function will be generic and then I would move it to > > some more generic place. Now, I think it should stay here. > > Please move it to generic code. It doesn''t belong in Xen code. > > > > In other words, I think you can take everything from and including > > > online_pages() down in the function and take it out. Use a udev hotplug > > > rule to online it immediately if that''s what you want. > > > > I agree. I discussed a bit about this problem with Jeremy, too. However, > > there are some problems to implement that solution now. First of all it is > > possible to online hotplugged memory using sysfs interface only in chunks > > called sections. It means that it is not possible online once again section > > which was onlined ealier partialy populated and now it contains new pages > > to online. In this situation sysfs interface emits Invalid argument error. > > In theory it should be possible to offline and then online whole section > > once again, however, if memory from this section was used is not possible > > to do that. It means that those properties does not allow hotplug memory > > in guest in finer granulity than section and sysfs interface is too inflexible > > to be used in that solution. That is why I decided to online hoplugged memory > > using API which does not have those limitations. 
> > Sure, you have to _online_ the whole thing at once, but you don''t have > to actually make the pages available. You also don''t need to hook in to > the memory resource code like you''re doing. It''s sufficient to just try > and add the memory. If you get -EEXIST, then you can''t add it there, so > move up and try again. > > int xen_balloon_add_memory(u64 size) > { > unsigned long top_of_mem = max_pfn; > top_of_mem = section_align_up(top_of_mem); > > while (1) { > int ret = add_memory(nid, top_of_mem, size); > if (ret == -EEXIST) > continue; > // error handling... > break; > } > return...; > } > > As for telling the hypervisor where you''ve mapped things, that should be > done in arch_add_memory(). > > When it comes down to online_page(), you don''t want your pages freed > back in to the buddy allocator, you want them put in to the balloon. > So, take the __free_page() in online_page(), and put a Xen hook in > there. > > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page) > +{ > + __free_page(page); > +}I somehow have a vague recollection that the __weak was frowned upon? The issues were that when you compile a pv-ops kernel it can run as baremetal so the..> > void online_page(struct page *page) > { > unsigned long pfn = page_to_pfn(page); > ... > - __free_page(page); > + arch_free_hotplug_page(page); > } > > Then, have Xen override it: > > void arch_free_hotplug_page(struct page *page) > { > if (xen_need_to_inflate_balloon()) > put_page_in_balloon(page); > else > __free_page(page);call above would get called even on baremetal (and would require the header file arch/x86/include/memory_hotplug.h to pull in header file from the balloon driver). 
If we are going to pursue this it might be prudent to follow what we did for MSI: 1525bf0d8f059a38c6e79353583854e1981b2e67 294ee6f89cfd629e276f632a6003a0fad7785dce b5401a96b59475c1c878439caecb8c521bdfd4ad _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave Hansen
2011-Feb-10 17:16 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Thu, 2011-02-10 at 12:01 -0500, Konrad Rzeszutek Wilk wrote:> On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote: > > On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote: > > > I agree. I discussed a bit about this problem with Jeremy, too. However, > > > there are some problems to implement that solution now. First of all it is > > > possible to online hotplugged memory using sysfs interface only in chunks > > > called sections. It means that it is not possible online once again section > > > which was onlined ealier partialy populated and now it contains new pages > > > to online. In this situation sysfs interface emits Invalid argument error. > > > In theory it should be possible to offline and then online whole section > > > once again, however, if memory from this section was used is not possible > > > to do that. It means that those properties does not allow hotplug memory > > > in guest in finer granulity than section and sysfs interface is too inflexible > > > to be used in that solution. That is why I decided to online hoplugged memory > > > using API which does not have those limitations. > > > > Sure, you have to _online_ the whole thing at once, but you don''t have > > to actually make the pages available. You also don''t need to hook in to > > the memory resource code like you''re doing. It''s sufficient to just try > > and add the memory. If you get -EEXIST, then you can''t add it there, so > > move up and try again. > > > > int xen_balloon_add_memory(u64 size) > > { > > unsigned long top_of_mem = max_pfn; > > top_of_mem = section_align_up(top_of_mem); > > > > while (1) { > > int ret = add_memory(nid, top_of_mem, size); > > if (ret == -EEXIST) > > continue; > > // error handling... > > break; > > } > > return...; > > } > > > > As for telling the hypervisor where you''ve mapped things, that should be > > done in arch_add_memory(). 
> > > > When it comes down to online_page(), you don''t want your pages freed > > back in to the buddy allocator, you want them put in to the balloon. > > So, take the __free_page() in online_page(), and put a Xen hook in > > there. > > > > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page) > > +{ > > + __free_page(page); > > +} > > I somehow have a vague recollection that the __weak was frowned upon? The issues > were that when you compile a pv-ops kernel it can run as baremetal so the..There are a bunch of alternatives to using ''weak''. Any of those would probably be fine as well. Anything that allows us to check whether the page should go back in to the allocator or the balloon.> > void online_page(struct page *page) > > { > > unsigned long pfn = page_to_pfn(page); > > ... > > - __free_page(page); > > + arch_free_hotplug_page(page); > > } > > > > Then, have Xen override it: > > > > void arch_free_hotplug_page(struct page *page) > > { > > if (xen_need_to_inflate_balloon()) > > put_page_in_balloon(page); > > else > > __free_page(page); > > call above would get called even on baremetal (and would require the header > file arch/x86/include/memory_hotplug.h to pull in header file from the balloon > driver). If we are going to persue this it might be prudent to follow what we did for MSI: > > 1525bf0d8f059a38c6e79353583854e1981b2e67 > 294ee6f89cfd629e276f632a6003a0fad7785dce > b5401a96b59475c1c878439caecb8c521bdfd4adThat looks a bit complicated for what I''m trying to do here, but whatever works. -- Dave _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel Kiper
2011-Feb-11 23:13 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote:> On Mon, 2011-02-07 at 15:12 +0100, Daniel Kiper wrote: > > On Thu, Feb 03, 2011 at 10:12:24AM -0800, Dave Hansen wrote: > > > On Thu, 2011-02-03 at 17:30 +0100, Daniel Kiper wrote: > > > > +static struct resource *allocate_memory_resource(unsigned long nr_pages) > > > > +{ > > > > + resource_size_t r_min, r_size; > > > > + struct resource *r; > > > > + > > > > + /* > > > > + * Look for first unused memory region starting at page > > > > + * boundary. Skip last memory section created at boot time > > > > + * because it may contains unused memory pages with PG_reserved > > > > + * bit not set (online_pages() require PG_reserved bit set). > > > > + */ > > > > > > Could you elaborate on this comment a bit? I think it''s covering both > > > the "PAGE_SIZE" argument to allocate_resource() and something else, but > > > I''m not quite sure. > > > > Yes, you are right. Aligment to PAGE_SIZE is done by allocate_resource(). > > Additionally, r_min (calculated below) sets lower limit at which hoplugged > > memory could be installed (due to PG_reserved bit requirment set up by > > online_pages()). Later r_min is put as an argument to allocate_resource() call. > > OK, and you''ll update the comment on that?OK.> > > > + r = kzalloc(sizeof(struct resource), GFP_KERNEL); > > > > + > > > > + if (!r) > > > > + return NULL; > > > > + > > > > + r->name = "System RAM"; > > > > + r->flags = IORESOURCE_MEM | IORESOURCE_BUSY; > > > > + r_min = PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 1)); > > > > > > Did you do this for alignment reasons? It might be a better idea to > > > just make a nice sparsemem function to do alignment. > > > > Please look above. > > You spoke about page alignment up there. Why is this section-aligned? 
> Should we make an "align to section" function in generic sparsemem code?It is done because all pages in relevant section starting from max_pfn to the end of that section do not have PG_reserved bit set. It was tested on Linux Kernel Ver. 2.6.32.x, however, I am going to do some tests on current Linus tree. Currently, I do not expect that "align to section" function is required by others.> > > > + r_size = nr_pages << PAGE_SHIFT; > > > > + > > > > + if (allocate_resource(&iomem_resource, r, r_size, r_min, > > > > + ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) { > > > > + kfree(r); > > > > + return NULL; > > > > + } > > > > + > > > > + return r; > > > > +} > > > > > > This function should probably be made generic. I bet some more > > > hypervisors come around and want to use this. They generally won''t care > > > where the memory goes, and the kernel can allocate a spot for them. > > > > Yes, you are right. I think about new project in which > > this function will be generic and then I would move it to > > some more generic place. Now, I think it should stay here. > > Please move it to generic code. It doesn''t belong in Xen code.OK.> > > In other words, I think you can take everything from and including > > > online_pages() down in the function and take it out. Use a udev hotplug > > > rule to online it immediately if that''s what you want. > > > > I agree. I discussed a bit about this problem with Jeremy, too. However, > > there are some problems to implement that solution now. First of all it is > > possible to online hotplugged memory using sysfs interface only in chunks > > called sections. It means that it is not possible online once again section > > which was onlined ealier partialy populated and now it contains new pages > > to online. In this situation sysfs interface emits Invalid argument error. 
> > In theory it should be possible to offline and then online whole section > > once again, however, if memory from this section was used is not possible > > to do that. It means that those properties does not allow hotplug memory > > in guest in finer granulity than section and sysfs interface is too inflexible > > to be used in that solution. That is why I decided to online hoplugged memory > > using API which does not have those limitations. > > Sure, you have to _online_ the whole thing at once, but you don't have > to actually make the pages available. You also don't need to hook in to > the memory resource code like you're doing. It's sufficient to just try > and add the memory. If you get -EEXIST, then you can't add it there, so > move up and try again. > > int xen_balloon_add_memory(u64 size) > { > unsigned long top_of_mem = max_pfn; > top_of_mem = section_align_up(top_of_mem); > > while (1) { > int ret = add_memory(nid, top_of_mem, size); > if (ret == -EEXIST) > continue; > // error handling... > break; > } > return...; > } > > As for telling the hypervisor where you've mapped things, that should be > done in arch_add_memory(). > > When it comes down to online_page(), you don't want your pages freed > back in to the buddy allocator, you want them put in to the balloon. > So, take the __free_page() in online_page(), and put a Xen hook in > there. > > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page) > +{ > + __free_page(page); > +}I think that this function should be registered dynamically at init stage by specific balloon driver (in this case Xen balloon driver).> void online_page(struct page *page) > { > unsigned long pfn = page_to_pfn(page); > ... > - __free_page(page); > + arch_free_hotplug_page(page); > } > > Then, have Xen override it: > > void arch_free_hotplug_page(struct page *page) > { > if (xen_need_to_inflate_balloon()) > put_page_in_balloon(page); > else > __free_page(page); > }Whole idea looks quite interesting.
I will try to migrate to that solution.> > Additionally, IIRC, add_memory() requires > > that underlying memory is available before its call. > > No, that's not correct. s390's memory isn't available until after it > calls vmem_add_mapping(). See arch/s390/mm/init.cI was right to some extent. First versions of memory hotplug code were written on the base of Linux Kernel Ver. 2.6.32.x. Tests done on those versions showed that add_memory() required that underlying memory should be available before its call. However, after short investigation it came out that there are some issues with some Xen calls. Those issues do not exist in the current Linus tree. Daniel _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Dave Hansen
2011-Feb-11 23:22 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Sat, 2011-02-12 at 00:13 +0100, Daniel Kiper wrote:> On Tue, Feb 08, 2011 at 09:22:26AM -0800, Dave Hansen wrote: > > You spoke about page alignment up there. Why is this section-aligned? > > Should we make an "align to section" function in generic sparsemem code? > > It is done because all pages in relevant section starting from max_pfn > to the end of that section do not have PG_reserved bit set. It was tested > on Linux Kernel Ver. 2.6.32.x, however, I am going to do some tests on > current Linus tree. Currently, I do not expect that "align to section" > function is required by others.It doesn''t matter if it gets used by anybody else. It''s a generic function that fits in well with the other sparsemem code. It should go there. ...> > As for telling the hypervisor where you''ve mapped things, that should be > > done in arch_add_memory(). > > > > When it comes down to online_page(), you don''t want your pages freed > > back in to the buddy allocator, you want them put in to the balloon. > > So, take the __free_page() in online_page(), and put a Xen hook in > > there. > > > > +void __attribute__((weak)) arch_free_hotplug_page(struct page *page) > > +{ > > + __free_page(page); > > +} > > I think that this function should be registered dynamically at init > stage by specific balloon driver (in this case Xen balloon driver).That sounds fine to me. I guess we could use some of the subarch stuff or the pv_ops structure to do it as well. This isn''t exactly a hot path, either, so I''m not worried about it being some kind of conditional. Really, anything that allows us to divert pages over to the Xen balloon driver rather than the buddy allocator is probably just fine.> > > Additionally, IIRC, add_memory() requires > > > that underlying memory is available before its call. > > > > No, that''s not correct. s390''s memory isn''t available until after it > > calls vmem_add_mapping(). See arch/s390/mm/init.c > > I was right to some extent. 
First versions of memory hotplug code were > written on the base of Linux Kernel Ver. 2.6.32.x. Tests done on that > versions showed that add_memory() required that underlying memory should > be available before its call. However, after short investigation it came > out that there are some issues with some Xen calls. Those issues does > not exists in current Linus tree.Sounds good, I''m looking forward to your next patch. -- Dave _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel Kiper
2011-Feb-12 00:45 UTC
[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver
On Thu, Feb 10, 2011 at 11:53:16AM -0500, Konrad Rzeszutek Wilk wrote:> On Thu, Feb 03, 2011 at 05:30:33PM +0100, Daniel Kiper wrote:[...]> > +static struct resource *adjust_memory_resource(struct resource *r, unsigned long nr_pages) > > +{ > > + int rc; > > + > > + if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) { > > Will this actually occur? Say I called ''allocate_additional_memory'' with 512 > and got -ENOMEM (so the hypercall failed complelty). The mh->policy > MH_POLICY_STOP_AT_FIRST_ERROR. So we end up here. Assume the r_min is > 0x100000000, then r->start is 0x100000000 and r->end is 0x100200000. > > So: > 100200001 - (200000) == 0x100000000 ?r->end points always to last byte in currently allocated resource. It means that: r->end == r->start + size - 1> > + rc = release_resource(r); > > + BUG_ON(rc < 0); > > + kfree(r); > > + return NULL; > > + } > > + > > + rc = adjust_resource(r, r->start, r->end + 1 - r->start - > > + (nr_pages << PAGE_SHIFT)); > > If we wanted 512 pages, and only got 256 and want to adjust the region, we > would want it be: > 0x100000000 -> 0x100100000 right? 
> > So with the third argument that comes out to be: > > 0x100200000 + 1 - 0x100000000 - (100000) = 100001 > > which is just one page above what we requested?Please, look above.> > + > > + BUG_ON(rc < 0); > > Can we just do WARN_ON, and return NULL instead (and also release the resource)?I will rethink that once again.> > + > > + return r; > > +} > > + > > +static int allocate_additional_memory(struct resource *r, unsigned long nr_pages) > > +{ > > + int rc; > > + struct xen_memory_reservation reservation = { > > + .address_bits = 0, > > + .extent_order = 0, > > + .domid = DOMID_SELF > > + }; > > + unsigned long flags, i, pfn, pfn_start; > > + > > + if (!nr_pages) > > + return 0; > > + > > + pfn_start = PFN_UP(r->end) - nr_pages; > > + > > + if (nr_pages > ARRAY_SIZE(frame_list)) > > + nr_pages = ARRAY_SIZE(frame_list); > > + > > + for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn) > > + frame_list[i] = pfn; > > + > > + set_xen_guest_handle(reservation.extent_start, frame_list); > > + reservation.nr_extents = nr_pages; > > + > > + spin_lock_irqsave(&xen_reservation_lock, flags); > > + > > + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); > > + > > + if (rc <= 0) > > + return (rc < 0) ? rc : -ENOMEM; > > + > > So if we populated some of them (say we want to 512, but only did 64), > don''t we want to do the loop below? Also you look to be forgetting to > do a spin_unlock_irqrestore if you quit here.Loop which you mentioned is skipped only when HYPERVISOR_memory_op() does not allocate anything (0 pages) or something went wrong and an error code was returned. spin_lock_irqsave()/spin_unlock_irqrestore() should be removed as like it was done in increase_reservation()/decrease_reservation(). I overlooked those calls. 
Thanks.> > + for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) { > > + BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && > > + phys_to_machine_mapping_valid(pfn)); > > + set_phys_to_machine(pfn, frame_list[i]); > > + } > > + > > + spin_unlock_irqrestore(&xen_reservation_lock, flags); > > + > > + return rc; > > +} > > + > > +static void hotplug_allocated_memory(struct resource *r) > > { > > - unsigned long target = balloon_stats.target_pages; > > + int nid, rc; > > + resource_size_t r_size; > > + struct memory_block *mem; > > + unsigned long pfn; > > + > > + r_size = r->end + 1 - r->start; > > Why bump it by one byte?Please, look above.> > + nid = memory_add_physaddr_to_nid(r->start); > > + > > + rc = add_registered_memory(nid, r->start, r_size); > > + > > + if (rc) { > > + pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n", > > + __func__, rc); > > + balloon_stats.target_pages = balloon_stats.current_pages; > > + return; > > + } > > + > > + if (xen_pv_domain()) > > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn) > > I think you the r->start to be PFN_UP just in case the r->start is not page > aligned. 
Thought I am not sure it would even happen anymore, as M A Young > found the culprit that made it possible for us to setup memory regions > non-aligned and that is fixed now (in 2.6.38).r->start is always page aligned because allocate_resource() always returns page aligned resource (it is forced by arguments).> > + if (!PageHighMem(pfn_to_page(pfn))) { > > + rc = HYPERVISOR_update_va_mapping( > > + (unsigned long)__va(pfn << PAGE_SHIFT), > > + mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 0); > > + BUG_ON(rc); > > + } > > > > - target = min(target, > > - balloon_stats.current_pages + > > - balloon_stats.balloon_low + > > - balloon_stats.balloon_high); > > + rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT); > > > > - return target; > > + if (rc) { > > + pr_err("%s: online_pages: Failed: %i\n", __func__, rc); > > + balloon_stats.target_pages = balloon_stats.current_pages; > > + return; > > + } > > + > > + for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += PAGES_PER_SECTION) { > > Ditto. 
Can you do PFN_UP(r->start)?Please, look above.> > + mem = find_memory_block(__pfn_to_section(pfn)); > > + BUG_ON(!mem); > > + BUG_ON(!present_section_nr(mem->phys_index)); > > + mutex_lock(&mem->state_mutex); > > + mem->state = MEM_ONLINE; > > + mutex_unlock(&mem->state_mutex); > > + } > > + > > + balloon_stats.current_pages += r_size >> PAGE_SHIFT; > > } > > > > +static enum bp_state request_additional_memory(long credit) > > +{ > > + int rc; > > + static struct resource *r; > > + static unsigned long pages_left; > > + > > + if ((credit <= 0 || balloon_stats.balloon_low || > > + balloon_stats.balloon_high) && !r) > > + return BP_DONE; > > + > > + if (!r) { > > + r = allocate_memory_resource(credit); > > + > > + if (!r) > > + return BP_ERROR; > > + > > + pages_left = credit; > > + } > > + > > + rc = allocate_additional_memory(r, pages_left); > > + > > + if (rc < 0) { > > + if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS) > > + return BP_ERROR; > > + > > + r = adjust_memory_resource(r, pages_left); > > + > > + if (!r) > > + return BP_ERROR; > > Say we failed the hypercall completly and got -ENOMEM from the ''allocate_additional_memory''. > I presume the adjust_memory_resource at this point would have deleted ''r'', which means > that (!r) and we return BP_ERROR. > > But that means that we aren''t following the MH_POLICY_STOP_AT_FIRST_ERROR as > the balloon_process will retry again and the again, and again??You are right. It should be return BP_DONE.> > + } else { > > + pages_left -= rc; > > + > > So say I request 512 pages (mh_policy is MH_POLICY_STOP_AT_FIRST_ERROR), > but only got 256. I adjust the pages_left to be 256 and then > > + if (pages_left) > > + return BP_HUNGRY; > > we return BP_HUNGRY. That makes ''balloon_process'' retry with 512 pages, and we > keep on trying and call "allocate_additional_memory", which fails once more > (returns 256), and we end up returning BP_HUNGRY, and retry... and so on. 
> > Would it make sense to have a check here for the MH_POLICY_STOP_AT_FIRST_ERROR > and if so call the adjust_memory_resource as well?

Here it is OK. The first time allocate_additional_memory() returns 256, which is OK, and the next time, if more memory is not available, it returns rc < 0, which forces execution of "if (rc < 0) {..." (as was expected).

Daniel

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel