These patches fix that by making the kernel work like the classic Xen kernel. arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/p2m.c | 104 +++++++++++++++++++++++--------------- arch/x86/xen/setup.c | 106 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 164 insertions(+), 47 deletions(-) Konrad Rzeszutek Wilk (7): xen/p2m: Move code around to allow for better re-usage. xen/p2m: Allow alloc_p2m_middle to call reserve_brk depending on argument xen/p2m: Collapse early_alloc_p2m_middle redundant checks. xen/p2m: An early bootup variant of set_phys_to_machine xen/setup: Transfer MFNs from non-RAM E820 entries and gaps to E820 RAM xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels. xen/setup: Only print "Freeing XXX-YYY pfn range: Z pages freed" if Z > 0
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 1/7] xen/p2m: Move code around to allow for better re-usage.
We are going to be using the early_alloc_p2m (and early_alloc_p2m_middle) code in follow up patches which are not related to setting identity pages. Hence lets move the code out in its own function and rename them as appropiate. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/p2m.c | 62 ++++++++++++++++++++++++++++----------------------- 1 files changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 1b267e7..3cc3afe 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init __early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -541,6 +541,36 @@ static bool __init __early_alloc_p2m(unsigned long pfn) } return idx != 0; } + +static bool __init early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. 
*/ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don''t set mid_mfn_p[midix] here, + * look in early_alloc_p2m_middle */ + } + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { @@ -559,35 +589,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; - unsigned long **mid; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don''t set mid_mfn_p[midix] here, - * look in __early_alloc_p2m */ - } + WARN_ON(!early_alloc_p2m(pfn)); } - __early_alloc_p2m(pfn_s); - __early_alloc_p2m(pfn_e); + early_alloc_p2m_middle(pfn_s); + early_alloc_p2m_middle(pfn_e); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 2/7] xen/p2m: Allow alloc_p2m_middle to call reserve_brk depending on argument
For identity cases we want to call reserve_brk only on the boundary conditions of the middle P2M (so P2M[x][y][0] = extend_brk). This is to work around identity regions (PCI spaces, gaps in E820) which are not aligned on 2MB regions. However, for the case where we want to allocate P2M middle leafs at the early bootup stage, regardless of this alignment check, we need some means of doing that. For that we provide the new argument. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/p2m.c | 10 +++++----- 1 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 3cc3afe..8b3a395 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -499,7 +499,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init early_alloc_p2m_middle(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; @@ -508,7 +508,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) idx = p2m_index(pfn); /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx) + if (!idx && check_boundary) return false; WARN(p2m_top[topidx][mididx] == p2m_identity, @@ -531,7 +531,7 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) p2m_top[topidx][mididx] = p2m; /* For save/restore we need to MFN of the P2M saved */ - + mid_mfn_p = p2m_top_mfn_p[topidx]; WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", @@ -592,8 +592,8 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, WARN_ON(!early_alloc_p2m(pfn)); } - early_alloc_p2m_middle(pfn_s); - early_alloc_p2m_middle(pfn_e); + early_alloc_p2m_middle(pfn_s, true); + early_alloc_p2m_middle(pfn_e, true); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 3/7] xen/p2m: Collapse early_alloc_p2m_middle redundant checks.
At the start of the function we were checking for idx == 0 and bailing out, and later calling extend_brk only if idx != 0. That is unnecessary, so remove those checks. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/p2m.c | 25 ++++++++++++------------- 1 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b3a395..952edef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -502,6 +502,8 @@ static bool alloc_p2m(unsigned long pfn) static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; + unsigned long *p2m; + unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -522,24 +524,21 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary return false; /* Boundary cross-over for the edges: */ - if (idx) { - unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - unsigned long *mid_mfn_p; + p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m); + p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; + p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ + /* For save/restore we need to MFN of the P2M saved */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); - } - return idx != 0; + return true; } static bool __init early_alloc_p2m(unsigned long pfn) -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 4/7] xen/p2m: An early bootup variant of set_phys_to_machine
During early bootup we can't use alloc_page, so to allocate leaf pages in the P2M we need to use extend_brk. For that we are utilizing the early_alloc_p2m and early_alloc_p2m_middle functions to do the job for us. The new early_set_phys_to_machine function follows the same logic as set_phys_to_machine. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/p2m.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c34f96c..93971e8 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -44,6 +44,7 @@ extern unsigned long machine_to_phys_nr; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 952edef..ffd08c4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -570,6 +570,21 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!early_alloc_p2m(pfn)) + return false; + + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; +} unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 5/7] xen/setup: Transfer MFNs from non-RAM E820 entries and gaps to E820 RAM
When the Xen hypervisor boots a PV kernel it hands it two pieces of information: nr_pages and a made up E820 entry. The nr_pages value defines the range from zero to nr_pages of PFNs which have a valid Machine Frame Number (MFN) underneath it. The E820 mirrors that (with the VGA hole): BIOS-provided physical RAM map: Xen: 0000000000000000 - 00000000000a0000 (usable) Xen: 00000000000a0000 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000080800000 (usable) The fun comes when a PV guest that is run with a system E820 - that can either be the initial domain or a PCI PV guest, where the E820 looks like the normal thing: BIOS-provided physical RAM map: Xen: 0000000000000000 - 000000000009e000 (usable) Xen: 000000000009ec00 - 0000000000100000 (reserved) Xen: 0000000000100000 - 0000000020000000 (usable) Xen: 0000000020000000 - 0000000020200000 (reserved) Xen: 0000000020200000 - 0000000040000000 (usable) Xen: 0000000040000000 - 0000000040200000 (reserved) Xen: 0000000040200000 - 00000000bad80000 (usable) Xen: 00000000bad80000 - 00000000badc9000 (ACPI NVS) .. With that overlaying the nr_pages directly on the E820 does not work as there are gaps and non-RAM regions that won''t be used by the memory allocator. The ''xen_release_chunk'' helps with that by punching holes in the P2M (PFN to MFN lookup tree) for those regions and tells us that: Freeing 20000-20200 pfn range: 512 pages freed Freeing 40000-40200 pfn range: 512 pages freed Freeing bad80-badf4 pfn range: 116 pages freed Freeing badf6-bae7f pfn range: 137 pages freed Freeing bb000-100000 pfn range: 282624 pages freed Released 283999 pages of unused memory Those 283999 pages are subtracted from the nr_pages and are returned to the hypervisor. The end result is that the initial domain boots with 1GB less memory as the nr_pages has been subtraced by the amount of pages residing within the PCI hole. 
It can balloon up to that if desired using ''xl mem-set 0 8092'', but the balloon driver is not always compiled in for the initial domain. The ''xen_exchange_chunk'' solves this by transfering the MFNs that would have been freed to the E820_RAM entries that are past the nr_pages by using the early_set_phys_to_machine mechanism that allows the P2M tree to allocate new leafs during early bootup. It does that by copying the MFNs to the E820_RAM that has not been used and setting the old PFNs to INVALID_P2M_ENTRY. The end result is that the kernel can now boot with the nr_pages without having to subtract the 283999 pages. We will now get: -Released 283999 pages of unused memory +Exchanged 283999 pages .. snip.. -Memory: 6487732k/9208688k available (5817k kernel code, 1136060k absent, 1584896k reserved, 2900k data, 692k init) +Memory: 6503888k/8072692k available (5817k kernel code, 1136060k absent, 432744k reserved, 2900k data, 692k init) which is more in line with classic XenOLinux. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/setup.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 82 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1ba8dff..2a12143 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -120,12 +120,89 @@ static unsigned long __init xen_release_chunk(unsigned long start, return len; } +static unsigned long __init xen_exchange_chunk(unsigned long start_pfn, + unsigned long end_pfn, unsigned long nr_pages, unsigned long exchanged, + unsigned long *pages_left, const struct e820entry *list, + size_t map_size) +{ + const struct e820entry *entry; + unsigned int i; + unsigned long credits = (end_pfn - start_pfn) + *pages_left; + unsigned long done = 0; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + unsigned long s_pfn; + unsigned long e_pfn; + unsigned long pfn; + unsigned long dest_pfn; + long nr; + + if (credits == 0) + break; + + 
if (entry->type != E820_RAM) + continue; + + e_pfn = PFN_UP(entry->addr + entry->size); + + /* We only care about E820 _after_ the xen_start_info->nr_pages */ + if (e_pfn <= nr_pages) + continue; + + s_pfn = PFN_DOWN(entry->addr); + /* If the E820 falls within the nr_pages, we want to start + * at the nr_pages PFN (plus whatever we already had exchanged) + * If that would mean going past the E820 entry, skip it + */ + if (s_pfn <= nr_pages) { + nr = e_pfn - exchanged - nr_pages; + dest_pfn = nr_pages + exchanged; + } else { + nr = e_pfn - exchanged - s_pfn; + dest_pfn = s_pfn + exchanged; + } + /* If we had filled this E820_RAM entry, go to the next one. */ + if (nr <= 0) + continue; + + pr_debug("[%lx->%lx] (starting at %lx and have space for %ld pages) will move %ld pages from [%lx->%lx]\n", + s_pfn, e_pfn, dest_pfn, nr, credits, start_pfn, end_pfn); + + for (pfn = start_pfn; pfn < start_pfn + nr; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + break; + + if (!early_set_phys_to_machine(dest_pfn, mfn)) + break; + + /* You would think we should do HYPERVISOR_update_va_mapping + * but we don''t need to as the hypervisor only sets up the + * initial pagetables up to nr_pages, and we stick the MFNs + * past that. 
+ */ + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + ++dest_pfn; + ++done; + if (--credits == 0) + break; + } + } + if (done) + printk(KERN_INFO "Transfered from %lx->%lx range %ld pages\n", start_pfn, end_pfn, done); + /* How many left on the next iteration */ + *pages_left = credits; + return done; +} static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { phys_addr_t start = 0; unsigned long released = 0; unsigned long identity = 0; + unsigned long exchanged = 0; + unsigned long credits = 0; const struct e820entry *entry; int i; @@ -151,17 +228,19 @@ static unsigned long __init xen_set_identity_and_release( end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) { - if (start_pfn < nr_pages) + exchanged += xen_exchange_chunk(start_pfn, end_pfn, nr_pages, + exchanged, &credits, list, map_size); + if (start_pfn < nr_pages) { released += xen_release_chunk( start_pfn, min(end_pfn, nr_pages)); - + } identity += set_phys_range_identity( start_pfn, end_pfn); } start = end; } } - + printk(KERN_INFO "Exchanged %lu pages\n", exchanged); printk(KERN_INFO "Released %lu pages of unused memory\n", released); printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
Meaning that we will allocate up to XGB and not consider the rest of the memory as a possible balloon goal. This results in /proc/meminfo reporting: -MemTotal: 2845024 kB -MemFree: 2497716 kB +MemTotal: 2927192 kB +MemFree: 2458952 kB ... -DirectMap4k: 8304640 kB +DirectMap4k: 3063808 kB DirectMap2M: 0 kB on a 8GB machine with ''dom0_mem=3GB'' on the Xen hypervisor line. Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/setup.c | 16 ++++++++++++++++ 1 files changed, 16 insertions(+), 0 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2a12143..4e4aa8e 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -261,11 +261,27 @@ static unsigned long __init xen_get_max_pages(void) * the current maximum rather than the static maximum. In this * case the e820 map provided to us will cover the static * maximum region. + * + * The dom0_mem=min:X,max:Y tweaks options differently depending + * on the version, but in general this is what we get: + * | XENMEM_maximum_reser | nr_pages + * --------------++-----------------------+------------------- + * no dom0_mem | INT_MAX | the max_phys_pfn + * =3G | INT_MAX | 786432 + * =max:3G | 786432 | 786432 + * =min:1G,max:3G| 262144 | 786432 + * + * The =3G is often used and it lead to us initially setting + * 786432 and allowing dom0 to balloon up to the max_physical_pfn. + * This is at odd with the classic XenOClassic so lets emulate + * the classic behavior. */ if (xen_initial_domain()) { ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); if (ret > 0) max_pages = ret; + if (ret == -1UL) + max_pages = xen_start_info->nr_pages; } return min(max_pages, MAX_DOMAIN_PAGES); -- 1.7.7.5
Konrad Rzeszutek Wilk
2012-Mar-30 20:37 UTC
[PATCH 7/7] xen/setup: Only print "Freeing XXX-YYY pfn range: Z pages freed" if Z > 0
Otherwise we can get meaningless messages such as: Freeing bad80-badf4 pfn range: 0 pages freed Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> --- arch/x86/xen/setup.c | 5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4e4aa8e..42d5005 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -114,8 +114,9 @@ static unsigned long __init xen_release_chunk(unsigned long start, len++; } } - printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", - start, end, len); + if (len) + printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", + start, end, len); return len; } -- 1.7.7.5
David Vrabel
2012-Apr-03 08:48 UTC
Re: [Xen-devel] [PATCH 5/7] xen/setup: Transfer MFNs from non-RAM E820 entries and gaps to E820 RAM
On 30/03/12 21:37, Konrad Rzeszutek Wilk wrote:> We will now get: > > -Released 283999 pages of unused memory > +Exchanged 283999 pages > .. snip.. > -Memory: 6487732k/9208688k available (5817k kernel code, 1136060k absent, 1584896k reserved, 2900k data, 692k init) > +Memory: 6503888k/8072692k available (5817k kernel code, 1136060k absent, 432744k reserved, 2900k data, 692k init)This isn''t correct. You''ve have lost ~1 GB of memory which are the pages that were supposed to be moved. The additional 1GB of reserved memory in the old case is the balloon. In xen_memory_setup() where it loops through the e820 to clip the RAM regions you need to factor in the additional memory you''ve moved. In this loop you may need to count the pages in the RAM region instead of the simple (addr < mem_end) test. Take care with RAM regions with partial pages and the like.> which is more in line with classic XenOLinux. > > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> > --- > arch/x86/xen/setup.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 82 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c > index 1ba8dff..2a12143 100644 > --- a/arch/x86/xen/setup.c > +++ b/arch/x86/xen/setup.c > @@ -120,12 +120,89 @@ static unsigned long __init xen_release_chunk(unsigned long start, > return len; > } > > +static unsigned long __init xen_exchange_chunk(unsigned long start_pfn, > + unsigned long end_pfn, unsigned long nr_pages, unsigned long exchanged, > + unsigned long *pages_left, const struct e820entry *list, > + size_t map_size) > +{[...]> + > + for (pfn = start_pfn; pfn < start_pfn + nr; pfn++) { > + unsigned long mfn = pfn_to_mfn(pfn); > + > + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) > + break; > + > + if (!early_set_phys_to_machine(dest_pfn, mfn)) > + break; > + > + /* You would think we should do HYPERVISOR_update_va_mapping > + * but we don''t need to as the hypervisor only sets up the > + * 
initial pagetables up to nr_pages, and we stick the MFNs > + * past that. > + */Hmmm. Are you sure this is safe? What happens if Linux tries to use these pages before creating new page tables? e.g., via some early boot allocator before the final page tables are setup? (This might not be a problem, I haven''t checked). You''ve may have gotten away with it for now because the moved MFNs are marked as unusable. David
David Vrabel
2012-Apr-03 08:58 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
On 30/03/12 21:37, Konrad Rzeszutek Wilk wrote:> Meaning that we will allocate up to XGB and not consider the > rest of the memory as a possible balloon goal.I agree with Jan when he commented on the equivalent Xen patch for this behaviour. The current behaviour is better than the classic one. With your new behaviour it will no longer possible to specify an unlimited balloon but a limited number of initial pages. This is behaviour that Jan said he used. This problem is better solved by improving the documentation. A review of the xen.org wiki where dom0_mem is mentioned would be a good start, and an update to the recently added section for distro developers. David> This results in /proc/meminfo reporting: > > -MemTotal: 2845024 kB > -MemFree: 2497716 kB > +MemTotal: 2927192 kB > +MemFree: 2458952 kB > ... > -DirectMap4k: 8304640 kB > +DirectMap4k: 3063808 kB > DirectMap2M: 0 kB > > on a 8GB machine with ''dom0_mem=3GB'' on the Xen hypervisor line. > > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> > --- > arch/x86/xen/setup.c | 16 ++++++++++++++++ > 1 files changed, 16 insertions(+), 0 deletions(-) > > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c > index 2a12143..4e4aa8e 100644 > --- a/arch/x86/xen/setup.c > +++ b/arch/x86/xen/setup.c > @@ -261,11 +261,27 @@ static unsigned long __init xen_get_max_pages(void) > * the current maximum rather than the static maximum. In this > * case the e820 map provided to us will cover the static > * maximum region. 
> + * > + * The dom0_mem=min:X,max:Y tweaks options differently depending > + * on the version, but in general this is what we get: > + * | XENMEM_maximum_reser | nr_pages > + * --------------++-----------------------+------------------- > + * no dom0_mem | INT_MAX | the max_phys_pfn > + * =3G | INT_MAX | 786432 > + * =max:3G | 786432 | 786432 > + * =min:1G,max:3G| 262144 | 786432 > + * > + * The =3G is often used and it lead to us initially setting > + * 786432 and allowing dom0 to balloon up to the max_physical_pfn. > + * This is at odd with the classic XenOClassic so lets emulate > + * the classic behavior. > */ > if (xen_initial_domain()) { > ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); > if (ret > 0) > max_pages = ret; > + if (ret == -1UL) > + max_pages = xen_start_info->nr_pages; > } > > return min(max_pages, MAX_DOMAIN_PAGES);
Jan Beulich
2012-Apr-03 09:46 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
>>> On 03.04.12 at 10:58, David Vrabel <david.vrabel@citrix.com> wrote: > With your new behaviour it will no longer possible to specify an > unlimited balloon but a limited number of initial pages. This is > behaviour that Jan said he used. An unlimited balloon was never possible afaict (as that would have implied setting up an "infinite" number of struct page instances at boot time). What I'm using is "dom0_mem=-<num>M" together with the kernel option "mem=<num>G", such that max-balloon > initial alloc (usually I set max-balloon to approximately the amount of memory in the system, so the upper limit is "infinite" in the sense that I can't go higher anyway, but it's not truly infinity). Jan
Konrad Rzeszutek Wilk
2012-Apr-03 13:13 UTC
Re: [Xen-devel] [PATCH 5/7] xen/setup: Transfer MFNs from non-RAM E820 entries and gaps to E820 RAM
On Tue, Apr 03, 2012 at 09:48:43AM +0100, David Vrabel wrote:> On 30/03/12 21:37, Konrad Rzeszutek Wilk wrote: > > We will now get: > > > > -Released 283999 pages of unused memory > > +Exchanged 283999 pages > > .. snip.. > > -Memory: 6487732k/9208688k available (5817k kernel code, 1136060k absent, 1584896k reserved, 2900k data, 692k init) > > +Memory: 6503888k/8072692k available (5817k kernel code, 1136060k absent, 432744k reserved, 2900k data, 692k init) > > This isn''t correct. You''ve have lost ~1 GB of memory which are the > pages that were supposed to be moved. The additional 1GB of reserved > memory in the old case is the balloon.Whoops.> > In xen_memory_setup() where it loops through the e820 to clip the RAM > regions you need to factor in the additional memory you''ve moved. In > this loop you may need to count the pages in the RAM region instead of > the simple (addr < mem_end) test. Take care with RAM regions with > partial pages and the like.<nods> I did some more exhaustive testing and hit some issues> > > which is more in line with classic XenOLinux. > > > > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> > > --- > > arch/x86/xen/setup.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-- > > 1 files changed, 82 insertions(+), 3 deletions(-) > > > > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c > > index 1ba8dff..2a12143 100644 > > --- a/arch/x86/xen/setup.c > > +++ b/arch/x86/xen/setup.c > > @@ -120,12 +120,89 @@ static unsigned long __init xen_release_chunk(unsigned long start, > > return len; > > } > > > > +static unsigned long __init xen_exchange_chunk(unsigned long start_pfn, > > + unsigned long end_pfn, unsigned long nr_pages, unsigned long exchanged, > > + unsigned long *pages_left, const struct e820entry *list, > > + size_t map_size) > > +{ > [...] 
> > + > > + for (pfn = start_pfn; pfn < start_pfn + nr; pfn++) { > > + unsigned long mfn = pfn_to_mfn(pfn); > > + > > + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) > > + break; > > + > > + if (!early_set_phys_to_machine(dest_pfn, mfn)) > > + break; > > + > > + /* You would think we should do HYPERVISOR_update_va_mapping > > + * but we don''t need to as the hypervisor only sets up the > > + * initial pagetables up to nr_pages, and we stick the MFNs > > + * past that. > > + */ > > Hmmm. Are you sure this is safe? What happens if Linux tries to use > these pages before creating new page tables? e.g., via some early boot > allocator before the final page tables are setup? (This might not be a > problem, I haven''t checked).I think this is what I am hitting actually, but not entirely sure.> > You''ve may have gotten away with it for now because the moved MFNs are > marked as unusable.Right, and they should be marked ''usuable''. There is a forthcoming patch that does that but it isn''t ready yet.> > David > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Konrad Rzeszutek Wilk
2012-Apr-06 20:59 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
On Tue, Apr 03, 2012 at 09:58:44AM +0100, David Vrabel wrote: > On 30/03/12 21:37, Konrad Rzeszutek Wilk wrote: > > Meaning that we will allocate up to XGB and not consider the > > rest of the memory as a possible balloon goal. > > I agree with Jan when he commented on the equivalent Xen patch for this > behaviour. The current behaviour is better than the classic one. Current behavior in the hypervisor or with current Linux kernel? > > With your new behaviour it will no longer possible to specify an > unlimited balloon but a limited number of initial pages. This is > behaviour that Jan said he used. I am not sure I see the problem - I mean if one uses: dom0_mem=min:8G,max:16G I understand that we want to start at 8GB and if the user chooses to - balloon up to 16GB. But doing this: dom0_mem=8G and allocating pagetables up to .. say 32GB, seems counter-intuitive as the effect is similar to having no 'dom0_mem' except that the initial size is smaller. > > This problem is better solved by improving the documentation. A review > of the xen.org wiki where dom0_mem is mentioned would be a good start, > and an update to the recently added section for distro developers. > > David > > > This results in /proc/meminfo reporting: > > > > -MemTotal: 2845024 kB > > -MemFree: 2497716 kB > > +MemTotal: 2927192 kB > > +MemFree: 2458952 kB > > ... > > -DirectMap4k: 8304640 kB > > +DirectMap4k: 3063808 kB > > DirectMap2M: 0 kB > > > > on a 8GB machine with 'dom0_mem=3GB' on the Xen hypervisor line. > > > > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> > > --- > > arch/x86/xen/setup.c | 16 ++++++++++++++++ > > 1 files changed, 16 insertions(+), 0 deletions(-) > > > > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c > > index 2a12143..4e4aa8e 100644 > > --- a/arch/x86/xen/setup.c > > +++ b/arch/x86/xen/setup.c > > @@ -261,11 +261,27 @@ static unsigned long __init xen_get_max_pages(void) > > * the current maximum rather than the static maximum.
In this > > * case the e820 map provided to us will cover the static > > * maximum region. > > + * > > + * The dom0_mem=min:X,max:Y tweaks options differently depending > > + * on the version, but in general this is what we get: > > + * | XENMEM_maximum_reser | nr_pages > > + * --------------++-----------------------+------------------- > > + * no dom0_mem | INT_MAX | the max_phys_pfn > > + * =3G | INT_MAX | 786432 > > + * =max:3G | 786432 | 786432 > > + * =min:1G,max:3G| 262144 | 786432 > > + * > > + * The =3G is often used and it lead to us initially setting > > + * 786432 and allowing dom0 to balloon up to the max_physical_pfn. > > + * This is at odd with the classic XenOClassic so lets emulate > > + * the classic behavior. > > */ > > if (xen_initial_domain()) { > > ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); > > if (ret > 0) > > max_pages = ret; > > + if (ret == -1UL) > > + max_pages = xen_start_info->nr_pages; > > } > > > > return min(max_pages, MAX_DOMAIN_PAGES); > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xen.org > http://lists.xen.org/xen-devel
Konrad Rzeszutek Wilk
2012-Apr-06 21:01 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
On Tue, Apr 03, 2012 at 10:46:41AM +0100, Jan Beulich wrote: > >>> On 03.04.12 at 10:58, David Vrabel <david.vrabel@citrix.com> wrote: > > With your new behaviour it will no longer possible to specify an > > unlimited balloon but a limited number of initial pages. This is > > behaviour that Jan said he used. > > An unlimited balloon was never possible afaict (as that would have > implied setting up an "infinite" number of struct page instances at > boot time. > > What I'm using is "dom0_mem=-<num>M" together with the kernel > option "mem=<num>G", such that max-balloon > initial alloc (usually > I set max-balloon to approximately the amount of memory in the > system, so the upper limit is "infinite" in the sense that I can't go > higher anyway, but it's not truly infinity). Couldn't you do the same thing with 'dom0_mem=X,max:Y'? > > Jan
Konrad Rzeszutek Wilk
2012-Apr-06 21:02 UTC
Re: [Xen-devel] [PATCH 5/7] xen/setup: Transfer MFNs from non-RAM E820 entries and gaps to E820 RAM
On Tue, Apr 03, 2012 at 09:13:44AM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Apr 03, 2012 at 09:48:43AM +0100, David Vrabel wrote: > > On 30/03/12 21:37, Konrad Rzeszutek Wilk wrote: > > > We will now get: > > > > > > -Released 283999 pages of unused memory > > > +Exchanged 283999 pages > > > .. snip.. > > > -Memory: 6487732k/9208688k available (5817k kernel code, 1136060k absent, 1584896k reserved, 2900k data, 692k init) > > > +Memory: 6503888k/8072692k available (5817k kernel code, 1136060k absent, 432744k reserved, 2900k data, 692k init) > > > > This isn't correct. You've have lost ~1 GB of memory which are the > > pages that were supposed to be moved. The additional 1GB of reserved > > memory in the old case is the balloon. > > Whoops. > > > > In xen_memory_setup() where it loops through the e820 to clip the RAM > > regions you need to factor in the additional memory you've moved. In > > this loop you may need to count the pages in the RAM region instead of > > the simple (addr < mem_end) test. Take care with RAM regions with > > partial pages and the like. > > <nods>

I did some more exhaustive testing and hit some issues.. which is that moving the MFNs in the P2M is fine from the Linux kernel perspective but the changes won't be reflected in the M2P. To make the M2P have the new PFNs I have to use the populate_physmap hypercall. A new-ish version will be posted soon.
Jan Beulich
2012-Apr-09 16:39 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
>>> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 04/06/12 11:06 PM >>>
>On Tue, Apr 03, 2012 at 10:46:41AM +0100, Jan Beulich wrote: >> >>> On 03.04.12 at 10:58, David Vrabel <david.vrabel@citrix.com> wrote: >> > With your new behaviour it will no longer possible to specify an >> > unlimited balloon but a limited number of initial pages. This is >> > behaviour that Jan said he used. >> >> An unlimited balloon was never possible afaict (as that would have >> implied setting up an "infinite" number of struct page instances at >> boot time. >> >> What I'm using is "dom0_mem=-<num>M" together with the kernel >> option "mem=<num>G", such that max-balloon > initial alloc (usually >> I set max-balloon to approximately the amount of memory in the >> system, so the upper limit is "infinite" in the sense that I can't go >> higher anyway, but it's not truly infinity). > >Couldn't you do the same thing with 'dom0_mem=X,max:Y'

That would be possible (albeit not exactly identical in behavior). But my main point in the discussion was to not modify existing behavior without actual need to (including the desire to not have to modify dozens of command lines).

Jan
Jan Beulich
2012-Apr-09 16:56 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
>>> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 04/06/12 11:04 PM >>>
>> With your new behaviour it will no longer possible to specify an >> unlimited balloon but a limited number of initial pages. This is >> behaviour that Jan said he used. > >I am not sure I see the problem - I mean if one uses: > >dom0_mem=min:8G,max:16G > >I understand that we want to start at 8GB and if the user >choose to - balloon up to 16GB. > >But doing this: > >dom0_mem=8G > >and allocating pagetables up to .. say 32GB, seems counter-intuive >as the effect is similar to having no 'dom0_mem' except that the initial >size is smaller.

What's counter intuitive here? There may not be a need - from the perspective of the kernel - for a hard upper limit enforced by Xen (i.e. the pseudo infinity we have right now may be quite fine).

Anyway, as said in the other reply already - unless this is to address a bug, I don't see the point in changing behavior that has been that way for a pretty long time.

Jan
Konrad Rzeszutek Wilk
2012-Apr-09 21:33 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
On Mon, Apr 09, 2012 at 05:39:35PM +0100, Jan Beulich wrote:
> >>> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 04/06/12 11:06 PM >>> > >On Tue, Apr 03, 2012 at 10:46:41AM +0100, Jan Beulich wrote: > >> >>> On 03.04.12 at 10:58, David Vrabel <david.vrabel@citrix.com> wrote: > >> > With your new behaviour it will no longer possible to specify an > >> > unlimited balloon but a limited number of initial pages. This is > >> > behaviour that Jan said he used. > >> > >> An unlimited balloon was never possible afaict (as that would have > >> implied setting up an "infinite" number of struct page instances at > >> boot time. > >> > >> What I'm using is "dom0_mem=-<num>M" together with the kernel > >> option "mem=<num>G", such that max-balloon > initial alloc (usually > >> I set max-balloon to approximately the amount of memory in the > >> system, so the upper limit is "infinite" in the sense that I can't go > >> higher anyway, but it's not truly infinity). > > > >Couldn't you do the same thing with 'dom0_mem=X,max:Y' > > That would be possible (albeit not exactly identical in behavior). But my > main point in the discussion was to not modify existing behavior without > actual need to (including the desire to not have to modify dozens of > command lines).

Right. I don't want to modify the hypervisor - my goal is to bring the pvops kernel in line with how XenOLinux does it.
Konrad Rzeszutek Wilk
2012-Apr-09 21:49 UTC
Re: [Xen-devel] [PATCH 6/7] xen/setup: Make dom0_mem=XGB behavior be similar to classic Xen kernels.
On Mon, Apr 09, 2012 at 05:56:11PM +0100, Jan Beulich wrote:
> >>> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> 04/06/12 11:04 PM >>> > >> With your new behaviour it will no longer possible to specify an > >> unlimited balloon but a limited number of initial pages. This is > >> behaviour that Jan said he used. > > > >I am not sure I see the problem - I mean if one uses: > > > >dom0_mem=min:8G,max:16G > > > >I understand that we want to start at 8GB and if the user > >choose to - balloon up to 16GB. > > > >But doing this: > > > >dom0_mem=8G > > > >and allocating pagetables up to .. say 32GB, seems counter-intuive > >as the effect is similar to having no 'dom0_mem' except that the initial > >size is smaller. > > What's counter intuitive here? There may not be a need - from the perspective > of the kernel - for a hard upper limit enforced by Xen (i.e. the pseudo infinity > we have right now may be quite fine).

Counter intuitive in that when one uses 'dom0_mem=8G' it implies some clipping. And with the pvops kernel we don't do any clipping - we allocate pagetables up to the limit of the E820 space. So on a 32GB box, we end up with pagetables addressing 32GB, of which 24GB are balloon space.

> > Anyway, as said in the other reply already - unless this is to address a bug, I > don't see the point in changing behavior that has been that way for a pretty > long time.

The bug here is that if you say 'dom0_mem=max:4G' the amount of memory that dom0 boots with is not the same. It actually is smaller (by about 1GB, since that is the amount of memory that gets ballooned out from the E820 gaps and E820 RESERVED/ACPI PFN spots). The first set of patches did this a bit ineptly, but the next version populates the P2M and M2P so you actually end up with 4GB of memory in dom0 instead of the 3GB.
This is what we end up with without any dom0_mem argument:

2.6.32 SLES:
Memory: 7538688k/8079432k available (3971k kernel code, 8192k absent, 532300k reserved, 2491k data, 348k init)
MemTotal: 8063140 kB
MemFree: 7421504 kB
DirectMap4k: 8071240 kB
Domain-0 0 7873 4 r----- 20.3

3.3:
Memory: 6486452k/9208688k available (5825k kernel code, 1136060k absent, 1586176k reserved, 2890k data, 692k init)
MemTotal: 6716156 kB
MemFree: 6365696 kB
DirectMap4k: 8078192 kB
Domain-0 0 6774 4 r----- 26.0

3.3+patches:
Memory: 7621460k/9208688k available (5817k kernel code, 1136060k absent, 451168k reserved, 2899k data, 692k init)
MemTotal: 7849924 kB
MemFree: 7500748 kB
DirectMap4k: 8078192 kB
Domain-0 0 7883 4 r----- 11.9

and .. hm, I lost the outputs I had with dom0_mem=X, but this is what I get with 3.3 and 3.3+this patch:

dom0_mem=1G
-Memory: 610884k/9435136k available (5817k kernel code, 1136060k absent, 7688192k reserved, 2899k data, 696k init)
+Memory: 724184k/1053064k available (5817k kernel code, 4552k absent, 324328k reserved, 2899k data, 696k init)

I think the SLES kernel has the same behavior, but will have to wait until next week when I am back to be double sure.

When it comes to "infinite" balloon - I think the work that Daniel did on using the memory hotplug mechanism to add memory is preferable. That way pagetables are put in the newly added memory space.

> > Jan