xen/common/page_alloc.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++ xen/include/public/xen.h | 1 + 2 files changed, 89 insertions(+), 0 deletions(-) When a low memory threshold on the Xen heap is reached, we fire a global dom0 virq. If someone's listening, they can free up some more memory. The low threshold is configurable via the command line token "low_mem_virq_limit", and defaults to 64MiB. We define a new virq VIRQ_ENOMEM. Potential listeners include squeezed, xenballoond, or anything else that can be fired through xencommons. We error-check the low mem virq against initial available heap (after dom0 allocation), to avoid firing immediately. Virq issuing is controlled by a hysteresis algorithm: when memory dips below a threshold, the virq is issued and the next virq will fire when memory shrinks another order of magnitude. The virq will not fire again in the current "band" until memory grows over the next higher order of magnitude. Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org> diff -r dd69d9b1aee9 -r da02cb8485de xen/common/page_alloc.c --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -35,6 +35,7 @@ #include <xen/perfc.h> #include <xen/numa.h> #include <xen/nodemask.h> +#include <xen/event.h> #include <xen/tmem.h> #include <xen/tmem_xen.h> #include <public/sysctl.h> @@ -300,6 +301,87 @@ static unsigned long init_node_heap(int return needed; } +/* Default to 64 MiB */ +#define DEFAULT_LOW_MEM_VIRQ_MIB 64 +#define MAX_LOW_MEM_VIRQ_MIB 1024 + +static unsigned long long __read_mostly opt_low_mem_virq = + (DEFAULT_LOW_MEM_VIRQ_MIB << 20); +size_param("low_mem_virq_limit", opt_low_mem_virq); + +/* Thresholds to control hysteresis. In pages */ +/* When memory grows above this threshold, reset hysteresis. + * -1 initially to not reset until at least one virq issued. 
*/ +static unsigned long low_mem_virq_high = -1UL; +/* Threshold at which we issue virq */ +static unsigned long low_mem_virq_th = 0; +/* Original threshold after all checks completed */ +static unsigned long low_mem_virq_orig = 0; +/* Order for current threshold */ +static unsigned int low_mem_virq_th_order = 0; + +/* Perform bootstrapping checks and set bounds */ +static void setup_low_mem_virq(void) +{ + unsigned int order; + unsigned long long threshold; + + /* Dom0 has already been allocated by now. So check we won''t + * be complaining immediately with whatever''s left of the heap. */ + threshold = min(opt_low_mem_virq, (unsigned long long) + (total_avail_pages << PAGE_SHIFT)); + + /* Then, cap to some predefined maximum */ + threshold = min(threshold, (unsigned long long) + (MAX_LOW_MEM_VIRQ_MIB << 20)); + + /* Threshold bytes -> pages */ + low_mem_virq_th = threshold >> PAGE_SHIFT; + + /* Next, round the threshold down to the next order */ + order = get_order_from_pages(low_mem_virq_th); + if ( (1 << order) > low_mem_virq_th ) + order--; + + /* Set bounds, ready to go */ + low_mem_virq_th = low_mem_virq_orig = 1 << order; + low_mem_virq_th_order = order; + + printk("Current low memory virq threshold set at 0x%lx pages.\n", + low_mem_virq_th); +} + +static void check_low_mem_virq(void) +{ + if ( total_avail_pages <= low_mem_virq_th ) + { + send_global_virq(VIRQ_ENOMEM); + + /* Update thresholds. Next warning will be when we drop below + * next order. However, we wait until we grow beyond one + * order above us to complain again at the current order */ + low_mem_virq_high = 1 << (low_mem_virq_th_order + 1); + if ( low_mem_virq_th_order > 0 ) + low_mem_virq_th_order--; + low_mem_virq_th = 1 << low_mem_virq_th_order; + return; + } + + if ( total_avail_pages >= low_mem_virq_high ) + { + /* Reset hysteresis. Bring threshold up one order. + * If we are back where originally set, set high + * threshold to -1 to avoid further growth of + * virq threshold. 
*/ + low_mem_virq_th_order++; + low_mem_virq_th = 1 << low_mem_virq_th_order; + if ( low_mem_virq_th == low_mem_virq_orig ) + low_mem_virq_high = -1UL; + else + low_mem_virq_high = 1 << (low_mem_virq_th_order + 2); + } +} + /* Allocate 2^@order contiguous pages. */ static struct page_info *alloc_heap_pages( unsigned int zone_lo, unsigned int zone_hi, @@ -420,6 +502,8 @@ static struct page_info *alloc_heap_page total_avail_pages -= request; ASSERT(total_avail_pages >= 0); + check_low_mem_virq(); + if ( d != NULL ) d->last_alloc_node = node; @@ -1022,6 +1106,10 @@ void __init scrub_heap_pages(void) } printk("done.\n"); + + /* Now that the heap is initialized, run checks and set bounds + * for the low mem virq algorithm. */ + setup_low_mem_virq(); } diff -r dd69d9b1aee9 -r da02cb8485de xen/include/public/xen.h --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -157,6 +157,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); #define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ +#define VIRQ_ENOMEM 12 /* G. (DOM0) Dangerously low on heap memory */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16
>>> On 23.02.12 at 20:59, Andres Lagar-Cavilla <andres@lagarcavilla.org> wrote: > @@ -300,6 +301,87 @@ static unsigned long init_node_heap(int > return needed; > } > > +/* Default to 64 MiB */ > +#define DEFAULT_LOW_MEM_VIRQ_MIB 64 > +#define MAX_LOW_MEM_VIRQ_MIB 1024 > + > +static unsigned long long __read_mostly opt_low_mem_virq = > + (DEFAULT_LOW_MEM_VIRQ_MIB << 20); > +size_param("low_mem_virq_limit", opt_low_mem_virq); > + > +/* Thresholds to control hysteresis. In pages */ > +/* When memory grows above this threshold, reset hysteresis. > + * -1 initially to not reset until at least one virq issued. */ > +static unsigned long low_mem_virq_high = -1UL; > +/* Threshold at which we issue virq */ > +static unsigned long low_mem_virq_th = 0; > +/* Original threshold after all checks completed */ > +static unsigned long low_mem_virq_orig = 0; > +/* Order for current threshold */ > +static unsigned int low_mem_virq_th_order = 0; > + > +/* Perform bootstrapping checks and set bounds */ > +static void setup_low_mem_virq(void)__init> +{ > + unsigned int order; > + unsigned long long threshold; > + > + /* Dom0 has already been allocated by now. So check we won''t > + * be complaining immediately with whatever''s left of the heap. */ > + threshold = min(opt_low_mem_virq, (unsigned long long) > + (total_avail_pages << PAGE_SHIFT));The cast needs to be on total_avail_pages, not the result of the shift. Also, unsigned long long is the wrong type (paddr_t was invented for this very purpose). Further, the initial threshold should clearly be *below* the currently available amount (e.g. 
at half of it).> + > + /* Then, cap to some predefined maximum */ > + threshold = min(threshold, (unsigned long long) > + (MAX_LOW_MEM_VIRQ_MIB << 20));Same here wrt the cast.> + > + /* Threshold bytes -> pages */ > + low_mem_virq_th = threshold >> PAGE_SHIFT; > + > + /* Next, round the threshold down to the next order */ > + order = get_order_from_pages(low_mem_virq_th); > + if ( (1 << order) > low_mem_virq_th ) > + order--; > + > + /* Set bounds, ready to go */ > + low_mem_virq_th = low_mem_virq_orig = 1 << order;1UL << ...> + low_mem_virq_th_order = order; > + > + printk("Current low memory virq threshold set at 0x%lx pages.\n","Initial ..."> + low_mem_virq_th); > +} > + > +static void check_low_mem_virq(void) > +{ > + if ( total_avail_pages <= low_mem_virq_th ) > + { > + send_global_virq(VIRQ_ENOMEM); > + > + /* Update thresholds. Next warning will be when we drop below > + * next order. However, we wait until we grow beyond one > + * order above us to complain again at the current order */ > + low_mem_virq_high = 1 << (low_mem_virq_th_order + 1);1UL << ...> + if ( low_mem_virq_th_order > 0 ) > + low_mem_virq_th_order--; > + low_mem_virq_th = 1 << low_mem_virq_th_order;Same here.> + return; > + } > + > + if ( total_avail_pages >= low_mem_virq_high ) > + { > + /* Reset hysteresis. Bring threshold up one order. > + * If we are back where originally set, set high > + * threshold to -1 to avoid further growth of > + * virq threshold. */ > + low_mem_virq_th_order++; > + low_mem_virq_th = 1 << low_mem_virq_th_order;And here.> + if ( low_mem_virq_th == low_mem_virq_orig ) > + low_mem_virq_high = -1UL; > + else > + low_mem_virq_high = 1 << (low_mem_virq_th_order + 2);And here.> + } > +} > + > /* Allocate 2^@order contiguous pages. 
*/ > static struct page_info *alloc_heap_pages( > unsigned int zone_lo, unsigned int zone_hi, > @@ -420,6 +502,8 @@ static struct page_info *alloc_heap_page > total_avail_pages -= request; > ASSERT(total_avail_pages >= 0); > > + check_low_mem_virq(); > + > if ( d != NULL ) > d->last_alloc_node = node; > > @@ -1022,6 +1106,10 @@ void __init scrub_heap_pages(void) > } > > printk("done.\n"); > + > + /* Now that the heap is initialized, run checks and set bounds > + * for the low mem virq algorithm. */ > + setup_low_mem_virq(); > } > > > diff -r dd69d9b1aee9 -r da02cb8485de xen/include/public/xen.h > --- a/xen/include/public/xen.h > +++ b/xen/include/public/xen.h > @@ -157,6 +157,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); > #define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ > #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ > #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ > +#define VIRQ_ENOMEM 12 /* G. (DOM0) Dangerously low on heap memory */Either the default threshold ought to be *much* lower (say 64k), or the "dangerously" here is completely misleading.> > /* Architecture-specific VIRQ definitions. */ > #define VIRQ_ARCH_0 16Given that this new vIRQ ought to be handled in user space, do you have an implementation ready to contribute as well? Jan
Andres Lagar-Cavilla
2012-Feb-24 17:18 UTC
Re: [PATCH] Global virq for low memory situations
>>>> On 23.02.12 at 20:59, Andres Lagar-Cavilla <andres@lagarcavilla.org> >>>> wrote: >> @@ -300,6 +301,87 @@ static unsigned long init_node_heap(int >> return needed; >> } >> >> +/* Default to 64 MiB */ >> +#define DEFAULT_LOW_MEM_VIRQ_MIB 64 >> +#define MAX_LOW_MEM_VIRQ_MIB 1024 >> + >> +static unsigned long long __read_mostly opt_low_mem_virq >> + (DEFAULT_LOW_MEM_VIRQ_MIB << >> 20); >> +size_param("low_mem_virq_limit", opt_low_mem_virq); >> + >> +/* Thresholds to control hysteresis. In pages */ >> +/* When memory grows above this threshold, reset hysteresis. >> + * -1 initially to not reset until at least one virq issued. */ >> +static unsigned long low_mem_virq_high = -1UL; >> +/* Threshold at which we issue virq */ >> +static unsigned long low_mem_virq_th = 0; >> +/* Original threshold after all checks completed */ >> +static unsigned long low_mem_virq_orig = 0; >> +/* Order for current threshold */ >> +static unsigned int low_mem_virq_th_order = 0; >> + >> +/* Perform bootstrapping checks and set bounds */ >> +static void setup_low_mem_virq(void) > > __init > >> +{ >> + unsigned int order; >> + unsigned long long threshold; >> + >> + /* Dom0 has already been allocated by now. So check we won''t >> + * be complaining immediately with whatever''s left of the heap. */ >> + threshold = min(opt_low_mem_virq, (unsigned long long) >> + (total_avail_pages << PAGE_SHIFT)); > > The cast needs to be on total_avail_pages, not the result of the > shift. Also, unsigned long long is the wrong type (paddr_t was > invented for this very purpose). > > Further, the initial threshold should clearly be *below* the currently > available amount (e.g. at half of it).That''s debatable. Let''s set aside the semantics of what is "really" or "dangerously" low, and assume the admin knows what he/she is doing when setting the threshold in the command line. 
If the amount of available memory is below that threshold, then the moment an allocation happens we need to warn.> >> + >> + /* Then, cap to some predefined maximum */ >> + threshold = min(threshold, (unsigned long long) >> + (MAX_LOW_MEM_VIRQ_MIB << 20)); > > Same here wrt the cast. > >> + >> + /* Threshold bytes -> pages */ >> + low_mem_virq_th = threshold >> PAGE_SHIFT; >> + >> + /* Next, round the threshold down to the next order */ >> + order = get_order_from_pages(low_mem_virq_th); >> + if ( (1 << order) > low_mem_virq_th ) >> + order--; >> + >> + /* Set bounds, ready to go */ >> + low_mem_virq_th = low_mem_virq_orig = 1 << order; > > 1UL << ... > >> + low_mem_virq_th_order = order; >> + >> + printk("Current low memory virq threshold set at 0x%lx pages.\n", > > "Initial ..." > >> + low_mem_virq_th); >> +} >> + >> +static void check_low_mem_virq(void) >> +{ >> + if ( total_avail_pages <= low_mem_virq_th ) >> + { >> + send_global_virq(VIRQ_ENOMEM); >> + >> + /* Update thresholds. Next warning will be when we drop below >> + * next order. However, we wait until we grow beyond one >> + * order above us to complain again at the current order */ >> + low_mem_virq_high = 1 << (low_mem_virq_th_order + 1); > > 1UL << ... > >> + if ( low_mem_virq_th_order > 0 ) >> + low_mem_virq_th_order--; >> + low_mem_virq_th = 1 << low_mem_virq_th_order; > > Same here. > >> + return; >> + } >> + >> + if ( total_avail_pages >= low_mem_virq_high ) >> + { >> + /* Reset hysteresis. Bring threshold up one order. >> + * If we are back where originally set, set high >> + * threshold to -1 to avoid further growth of >> + * virq threshold. */ >> + low_mem_virq_th_order++; >> + low_mem_virq_th = 1 << low_mem_virq_th_order; > > And here. > >> + if ( low_mem_virq_th == low_mem_virq_orig ) >> + low_mem_virq_high = -1UL; >> + else >> + low_mem_virq_high = 1 << (low_mem_virq_th_order + 2); > > And here.All of these I''ll do, thanks Jan.> >> + } >> +} >> + >> /* Allocate 2^@order contiguous pages. 
*/ >> static struct page_info *alloc_heap_pages( >> unsigned int zone_lo, unsigned int zone_hi, >> @@ -420,6 +502,8 @@ static struct page_info *alloc_heap_page >> total_avail_pages -= request; >> ASSERT(total_avail_pages >= 0); >> >> + check_low_mem_virq(); >> + >> if ( d != NULL ) >> d->last_alloc_node = node; >> >> @@ -1022,6 +1106,10 @@ void __init scrub_heap_pages(void) >> } >> >> printk("done.\n"); >> + >> + /* Now that the heap is initialized, run checks and set bounds >> + * for the low mem virq algorithm. */ >> + setup_low_mem_virq(); >> } >> >> >> diff -r dd69d9b1aee9 -r da02cb8485de xen/include/public/xen.h >> --- a/xen/include/public/xen.h >> +++ b/xen/include/public/xen.h >> @@ -157,6 +157,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); >> #define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed >> */ >> #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured >> */ >> #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient >> */ >> +#define VIRQ_ENOMEM 12 /* G. (DOM0) Dangerously low on heap memory >> */ > > Either the default threshold ought to be *much* lower (say 64k), or > the "dangerously" here is completely misleading. > >> >> /* Architecture-specific VIRQ definitions. */ >> #define VIRQ_ARCH_0 16 > > Given that this new vIRQ ought to be handled in user space, do you > have an implementation ready to contribute as well?I have my little daemon that I use for this. It''s nothing you would not expect: listen for the virq, balloon dom0 down a bit. I can throw it into the tree, although I don''t know where. Or I can post it to the list for posterity. Ideally all "memory management daemons" out there that find it useful would pick it up -- certainly the hope is for squeezed to do that, but I view that as a separate effort. Andres> > Jan >
>>> On 24.02.12 at 18:18, "Andres Lagar-Cavilla" <andres@lagarcavilla.org> wrote: >>>>> On 23.02.12 at 20:59, Andres Lagar-Cavilla <andres@lagarcavilla.org> wrote: >>> +{ >>> + unsigned int order; >>> + unsigned long long threshold; >>> + >>> + /* Dom0 has already been allocated by now. So check we won't >>> + * be complaining immediately with whatever's left of the heap. */ >>> + threshold = min(opt_low_mem_virq, (unsigned long long) >>> + (total_avail_pages << PAGE_SHIFT)); >> >> The cast needs to be on total_avail_pages, not the result of the >> shift. Also, unsigned long long is the wrong type (paddr_t was >> invented for this very purpose). >> >> Further, the initial threshold should clearly be *below* the currently >> available amount (e.g. at half of it). > > That's debatable. Let's set aside the semantics of what is "really" or > "dangerously" low, and assume the admin knows what he/she is doing when > setting the threshold in the command line. If the amount of available > memory is below that threshold, then the moment an allocation happens we > need to warn. My comment wasn't about command line specified values - those certainly should be obeyed. But the threshold chosen when there was nothing said on the command line should imo not result in an immediate warning. >>> /* Architecture-specific VIRQ definitions. */ >>> #define VIRQ_ARCH_0 16 >> >> Given that this new vIRQ ought to be handled in user space, do you >> have an implementation ready to contribute as well? > > I have my little daemon that I use for this. It's nothing you would not > expect: listen for the virq, balloon dom0 down a bit. I can throw it into > the tree, although I don't know where. Or I can post it to the list for > posterity. 
Ideally all "memory management daemons" out there that find it > useful would pick it up -- certainly the hope is for squeezed to do that, > but I view that as a separate effort. At least posting it would be nice, so people could make suggestions as to where it might go in the tree, and in what shape. Jan