David Vrabel
2013-Nov-06 14:49 UTC
[PATCH 4/9] kexec: extend hypercall with improved load/unload ops
From: David Vrabel <david.vrabel@citrix.com> In the existing kexec hypercall, the load and unload ops depend on internals of the Linux kernel (the page list and code page provided by the kernel). The code page is used to transition between Xen context and the image, so using kernel code doesn't make sense and will not work for PVH guests. Add replacement KEXEC_CMD_kexec_load and KEXEC_CMD_kexec_unload ops that no longer require a code page to be provided by the guest -- Xen now provides the code for calling the image directly. The new load op looks similar to the Linux kexec_load system call and allows the guest to provide the image data to be loaded. The guest specifies the architecture of the image, which may be a 32-bit subarch of the hypervisor's architecture (e.g., an EM_386 image on an EM_X86_64 hypervisor). The toolstack can now load images without kernel involvement. This is required for supporting kexec when using a dom0 with an upstream kernel. Crash images are copied directly into the crash region on load. Default images are copied into domheap pages and a list of source and destination machine addresses is created. This list is used in kexec_reloc() to relocate the image to its destination. The old load and unload sub-ops are still available (as KEXEC_CMD_kexec_load_v1 and KEXEC_CMD_kexec_unload_v1) and are implemented on top of the new infrastructure. 
Signed-off-by: David Vrabel <david.vrabel@citrix.com> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> --- xen/arch/x86/machine_kexec.c | 192 +++++++++++------ xen/arch/x86/x86_64/Makefile | 2 +- xen/arch/x86/x86_64/compat_kexec.S | 187 ---------------- xen/arch/x86/x86_64/kexec_reloc.S | 198 +++++++++++++++++ xen/common/kexec.c | 398 +++++++++++++++++++++++++++++------ xen/common/kimage.c | 122 +++++++++++- xen/include/asm-x86/fixmap.h | 3 - xen/include/asm-x86/machine_kexec.h | 16 ++ xen/include/xen/kexec.h | 16 +- xen/include/xen/kimage.h | 6 + 10 files changed, 804 insertions(+), 336 deletions(-) delete mode 100644 xen/arch/x86/x86_64/compat_kexec.S create mode 100644 xen/arch/x86/x86_64/kexec_reloc.S create mode 100644 xen/include/asm-x86/machine_kexec.h diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c index 68b9705..b70d5a6 100644 --- a/xen/arch/x86/machine_kexec.c +++ b/xen/arch/x86/machine_kexec.c @@ -1,9 +1,18 @@ /****************************************************************************** * machine_kexec.c * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * Portions derived from Linux''s arch/x86/kernel/machine_kexec_64.c. + * + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * * Xen port written by: * - Simon ''Horms'' Horman <horms@verge.net.au> * - Magnus Damm <magnus@valinux.co.jp> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. */ #include <xen/types.h> @@ -11,63 +20,124 @@ #include <xen/guest_access.h> #include <asm/fixmap.h> #include <asm/hpet.h> +#include <asm/page.h> +#include <asm/machine_kexec.h> -typedef void (*relocate_new_kernel_t)( - unsigned long indirection_page, - unsigned long *page_list, - unsigned long start_address, - unsigned int preserve_context); - -int machine_kexec_load(int type, int slot, xen_kexec_image_t *image) +/* + * Add a mapping for a page to the page tables used during kexec. 
+ */ +int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, + unsigned long maddr) { - unsigned long prev_ma = 0; - int fix_base = FIX_KEXEC_BASE_0 + (slot * (KEXEC_XEN_NO_PAGES >> 1)); - int k; + struct page_info *l4_page; + struct page_info *l3_page; + struct page_info *l2_page; + struct page_info *l1_page; + l4_pgentry_t *l4 = NULL; + l3_pgentry_t *l3 = NULL; + l2_pgentry_t *l2 = NULL; + l1_pgentry_t *l1 = NULL; + int ret = -ENOMEM; + + l4_page = image->aux_page; + if ( !l4_page ) + { + l4_page = kimage_alloc_control_page(image, 0); + if ( !l4_page ) + goto out; + image->aux_page = l4_page; + } - /* setup fixmap to point to our pages and record the virtual address - * in every odd index in page_list[]. - */ + l4 = __map_domain_page(l4_page); + l4 += l4_table_offset(vaddr); + if ( !(l4e_get_flags(*l4) & _PAGE_PRESENT) ) + { + l3_page = kimage_alloc_control_page(image, 0); + if ( !l3_page ) + goto out; + l4e_write(l4, l4e_from_page(l3_page, __PAGE_HYPERVISOR)); + } + else + l3_page = l4e_get_page(*l4); + + l3 = __map_domain_page(l3_page); + l3 += l3_table_offset(vaddr); + if ( !(l3e_get_flags(*l3) & _PAGE_PRESENT) ) + { + l2_page = kimage_alloc_control_page(image, 0); + if ( !l2_page ) + goto out; + l3e_write(l3, l3e_from_page(l2_page, __PAGE_HYPERVISOR)); + } + else + l2_page = l3e_get_page(*l3); + + l2 = __map_domain_page(l2_page); + l2 += l2_table_offset(vaddr); + if ( !(l2e_get_flags(*l2) & _PAGE_PRESENT) ) + { + l1_page = kimage_alloc_control_page(image, 0); + if ( !l1_page ) + goto out; + l2e_write(l2, l2e_from_page(l1_page, __PAGE_HYPERVISOR)); + } + else + l1_page = l2e_get_page(*l2); + + l1 = __map_domain_page(l1_page); + l1 += l1_table_offset(vaddr); + l1e_write(l1, l1e_from_pfn(maddr >> PAGE_SHIFT, __PAGE_HYPERVISOR)); + + ret = 0; +out: + if ( l1 ) + unmap_domain_page(l1); + if ( l2 ) + unmap_domain_page(l2); + if ( l3 ) + unmap_domain_page(l3); + if ( l4 ) + unmap_domain_page(l4); + return ret; +} - for ( k = 0; k < 
KEXEC_XEN_NO_PAGES; k++ ) +int machine_kexec_load(struct kexec_image *image) +{ + void *code_page; + int ret; + + switch ( image->arch ) { - if ( (k & 1) == 0 ) - { - /* Even pages: machine address. */ - prev_ma = image->page_list[k]; - } - else - { - /* Odd pages: va for previous ma. */ - if ( is_pv_32on64_domain(dom0) ) - { - /* - * The compatability bounce code sets up a page table - * with a 1-1 mapping of the first 1G of memory so - * VA==PA here. - * - * This Linux purgatory code still sets up separate - * high and low mappings on the control page (entries - * 0 and 1) but it is harmless if they are equal since - * that PT is not live at the time. - */ - image->page_list[k] = prev_ma; - } - else - { - set_fixmap(fix_base + (k >> 1), prev_ma); - image->page_list[k] = fix_to_virt(fix_base + (k >> 1)); - } - } + case EM_386: + case EM_X86_64: + break; + default: + return -EINVAL; } + code_page = __map_domain_page(image->control_code_page); + memcpy(code_page, kexec_reloc, kexec_reloc_size); + unmap_domain_page(code_page); + + /* + * Add a mapping for the control code page to the same virtual + * address as kexec_reloc. This allows us to keep running after + * these page tables are loaded in kexec_reloc. + */ + ret = machine_kexec_add_page(image, (unsigned long)kexec_reloc, + page_to_maddr(image->control_code_page)); + if ( ret < 0 ) + return ret; + return 0; } -void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image) +void machine_kexec_unload(struct kexec_image *image) { + /* no-op. kimage_free() frees all control pages. 
*/ } -void machine_reboot_kexec(xen_kexec_image_t *image) +void machine_reboot_kexec(struct kexec_image *image) { BUG_ON(smp_processor_id() != 0); smp_send_stop(); @@ -75,13 +145,10 @@ void machine_reboot_kexec(xen_kexec_image_t *image) BUG(); } -void machine_kexec(xen_kexec_image_t *image) +void machine_kexec(struct kexec_image *image) { - struct desc_ptr gdt_desc = { - .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY), - .limit = LAST_RESERVED_GDT_BYTE - }; int i; + unsigned long reloc_flags = 0; /* We are about to permenantly jump out of the Xen context into the kexec * purgatory code. We really dont want to be still servicing interupts. @@ -109,29 +176,12 @@ void machine_kexec(xen_kexec_image_t *image) * not like running with NMIs disabled. */ enable_nmis(); - /* - * compat_machine_kexec() returns to idle pagetables, which requires us - * to be running on a static GDT mapping (idle pagetables have no GDT - * mappings in their per-domain mapping area). - */ - asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + if ( image->arch == EM_386 ) + reloc_flags |= KEXEC_RELOC_FLAG_COMPAT; - if ( is_pv_32on64_domain(dom0) ) - { - compat_machine_kexec(image->page_list[1], - image->indirection_page, - image->page_list, - image->start_address); - } - else - { - relocate_new_kernel_t rnk; - - rnk = (relocate_new_kernel_t) image->page_list[1]; - (*rnk)(image->indirection_page, image->page_list, - image->start_address, - 0 /* preserve_context */); - } + kexec_reloc(page_to_maddr(image->control_code_page), + page_to_maddr(image->aux_page), + image->head, image->entry_maddr, reloc_flags); } int machine_kexec_get(xen_kexec_range_t *range) diff --git a/xen/arch/x86/x86_64/Makefile b/xen/arch/x86/x86_64/Makefile index d56e12d..7f8fb3d 100644 --- a/xen/arch/x86/x86_64/Makefile +++ b/xen/arch/x86/x86_64/Makefile @@ -11,11 +11,11 @@ obj-y += mmconf-fam10h.o obj-y += mmconfig_64.o obj-y += mmconfig-shared.o obj-y += compat.o -obj-bin-y += compat_kexec.o obj-y += 
domain.o obj-y += physdev.o obj-y += platform_hypercall.o obj-y += cpu_idle.o obj-y += cpufreq.o +obj-bin-y += kexec_reloc.o obj-$(crash_debug) += gdbstub.o diff --git a/xen/arch/x86/x86_64/compat_kexec.S b/xen/arch/x86/x86_64/compat_kexec.S deleted file mode 100644 index fc92af9..0000000 --- a/xen/arch/x86/x86_64/compat_kexec.S +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Compatibility kexec handler. - */ - -/* - * NOTE: We rely on Xen not relocating itself above the 4G boundary. This is - * currently true but if it ever changes then compat_pg_table will - * need to be moved back below 4G at run time. - */ - -#include <xen/config.h> - -#include <asm/asm_defns.h> -#include <asm/msr.h> -#include <asm/page.h> - -/* The unrelocated physical address of a symbol. */ -#define SYM_PHYS(sym) ((sym) - __XEN_VIRT_START) - -/* Load physical address of symbol into register and relocate it. */ -#define RELOCATE_SYM(sym,reg) mov $SYM_PHYS(sym), reg ; \ - add xen_phys_start(%rip), reg - -/* - * Relocate a physical address in memory. Size of temporary register - * determines size of the value to relocate. - */ -#define RELOCATE_MEM(addr,reg) mov addr(%rip), reg ; \ - add xen_phys_start(%rip), reg ; \ - mov reg, addr(%rip) - - .text - - .code64 - -ENTRY(compat_machine_kexec) - /* x86/64 x86/32 */ - /* %rdi - relocate_new_kernel_t CALL */ - /* %rsi - indirection page 4(%esp) */ - /* %rdx - page_list 8(%esp) */ - /* %rcx - start address 12(%esp) */ - /* cpu has pae 16(%esp) */ - - /* Shim the 64 bit page_list into a 32 bit page_list. */ - mov $12,%r9 - lea compat_page_list(%rip), %rbx -1: dec %r9 - movl (%rdx,%r9,8),%eax - movl %eax,(%rbx,%r9,4) - test %r9,%r9 - jnz 1b - - RELOCATE_SYM(compat_page_list,%rdx) - - /* Relocate compatibility mode entry point address. */ - RELOCATE_MEM(compatibility_mode_far,%eax) - - /* Relocate compat_pg_table. 
*/ - RELOCATE_MEM(compat_pg_table, %rax) - RELOCATE_MEM(compat_pg_table+0x8, %rax) - RELOCATE_MEM(compat_pg_table+0x10,%rax) - RELOCATE_MEM(compat_pg_table+0x18,%rax) - - /* - * Setup an identity mapped region in PML4[0] of idle page - * table. - */ - RELOCATE_SYM(l3_identmap,%rax) - or $0x63,%rax - mov %rax, idle_pg_table(%rip) - - /* Switch to idle page table. */ - RELOCATE_SYM(idle_pg_table,%rax) - movq %rax, %cr3 - - /* Switch to identity mapped compatibility stack. */ - RELOCATE_SYM(compat_stack,%rax) - movq %rax, %rsp - - /* Save xen_phys_start for 32 bit code. */ - movq xen_phys_start(%rip), %rbx - - /* Jump to low identity mapping in compatibility mode. */ - ljmp *compatibility_mode_far(%rip) - ud2 - -compatibility_mode_far: - .long SYM_PHYS(compatibility_mode) - .long __HYPERVISOR_CS32 - - /* - * We use 5 words of stack for the arguments passed to the kernel. The - * kernel only uses 1 word before switching to its own stack. Allocate - * 16 words to give "plenty" of room. - */ - .fill 16,4,0 -compat_stack: - - .code32 - -#undef RELOCATE_SYM -#undef RELOCATE_MEM - -/* - * Load physical address of symbol into register and relocate it. %rbx - * contains xen_phys_start(%rip) saved before jump to compatibility - * mode. - */ -#define RELOCATE_SYM(sym,reg) mov $SYM_PHYS(sym), reg ; \ - add %ebx, reg - -compatibility_mode: - /* Setup some sane segments. */ - movl $__HYPERVISOR_DS32, %eax - movl %eax, %ds - movl %eax, %es - movl %eax, %fs - movl %eax, %gs - movl %eax, %ss - - /* Push arguments onto stack. */ - pushl $0 /* 20(%esp) - preserve context */ - pushl $1 /* 16(%esp) - cpu has pae */ - pushl %ecx /* 12(%esp) - start address */ - pushl %edx /* 8(%esp) - page list */ - pushl %esi /* 4(%esp) - indirection page */ - pushl %edi /* 0(%esp) - CALL */ - - /* Disable paging and therefore leave 64 bit mode. */ - movl %cr0, %eax - andl $~X86_CR0_PG, %eax - movl %eax, %cr0 - - /* Switch to 32 bit page table. 
*/ - RELOCATE_SYM(compat_pg_table, %eax) - movl %eax, %cr3 - - /* Clear MSR_EFER[LME], disabling long mode */ - movl $MSR_EFER,%ecx - rdmsr - btcl $_EFER_LME,%eax - wrmsr - - /* Re-enable paging, but only 32 bit mode now. */ - movl %cr0, %eax - orl $X86_CR0_PG, %eax - movl %eax, %cr0 - jmp 1f -1: - - popl %eax - call *%eax - ud2 - - .data - .align 4 -compat_page_list: - .fill 12,4,0 - - .align 32,0 - - /* - * These compat page tables contain an identity mapping of the - * first 4G of the physical address space. - */ -compat_pg_table: - .long SYM_PHYS(compat_pg_table_l2) + 0*PAGE_SIZE + 0x01, 0 - .long SYM_PHYS(compat_pg_table_l2) + 1*PAGE_SIZE + 0x01, 0 - .long SYM_PHYS(compat_pg_table_l2) + 2*PAGE_SIZE + 0x01, 0 - .long SYM_PHYS(compat_pg_table_l2) + 3*PAGE_SIZE + 0x01, 0 - - .section .data.page_aligned, "aw", @progbits - .align PAGE_SIZE,0 -compat_pg_table_l2: - .macro identmap from=0, count=512 - .if \count-1 - identmap "(\from+0)","(\count/2)" - identmap "(\from+(0x200000*(\count/2)))","(\count/2)" - .else - .quad 0x00000000000000e3 + \from - .endif - .endm - - identmap 0x00000000 - identmap 0x40000000 - identmap 0x80000000 - identmap 0xc0000000 diff --git a/xen/arch/x86/x86_64/kexec_reloc.S b/xen/arch/x86/x86_64/kexec_reloc.S new file mode 100644 index 0000000..7a16c85 --- /dev/null +++ b/xen/arch/x86/x86_64/kexec_reloc.S @@ -0,0 +1,198 @@ +/* + * Relocate a kexec_image to its destination and call it. + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * Portions derived from Linux''s arch/x86/kernel/relocate_kernel_64.S. + * + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ +#include <xen/config.h> +#include <xen/kimage.h> + +#include <asm/asm_defns.h> +#include <asm/msr.h> +#include <asm/page.h> +#include <asm/machine_kexec.h> + + .text + .align PAGE_SIZE + .code64 + +ENTRY(kexec_reloc) + /* %rdi - code page maddr */ + /* %rsi - page table maddr */ + /* %rdx - indirection page maddr */ + /* %rcx - entry maddr (%rbp) */ + /* %r8 - flags */ + + movq %rcx, %rbp + + /* Setup stack. */ + leaq (reloc_stack - kexec_reloc)(%rdi), %rsp + + /* Load reloc page table. */ + movq %rsi, %cr3 + + /* Jump to identity mapped code. */ + leaq (identity_mapped - kexec_reloc)(%rdi), %rax + jmpq *%rax + +identity_mapped: + /* + * Set cr0 to a known state: + * - Paging enabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don''t do FP software emulation. + * - Protected mode enabled + */ + movq %cr0, %rax + andl $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movq %rax, %cr0 + + /* + * Set cr4 to a known state: + * - physical address extension enabled + */ + movl $X86_CR4_PAE, %eax + movq %rax, %cr4 + + movq %rdx, %rdi + call relocate_pages + + /* Need to switch to 32-bit mode? */ + testq $KEXEC_RELOC_FLAG_COMPAT, %r8 + jnz call_32_bit + +call_64_bit: + /* Call the image entry point. This should never return. */ + callq *%rbp + ud2 + +call_32_bit: + /* Setup IDT. */ + lidt compat_mode_idt(%rip) + + /* Load compat GDT. */ + leaq compat_mode_gdt(%rip), %rax + movq %rax, (compat_mode_gdt_desc + 2)(%rip) + lgdt compat_mode_gdt_desc(%rip) + + /* Relocate compatibility mode entry point address. */ + leal compatibility_mode(%rip), %eax + movl %eax, compatibility_mode_far(%rip) + + /* Enter compatibility mode. 
*/ + ljmp *compatibility_mode_far(%rip) + +relocate_pages: + /* %rdi - indirection page maddr */ + pushq %rbx + + cld + movq %rdi, %rbx + xorl %edi, %edi + xorl %esi, %esi + +next_entry: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +is_dest: + testb $IND_DESTINATION, %cl + jz is_ind + movq %rcx, %rdi + andq $PAGE_MASK, %rdi + jmp next_entry +is_ind: + testb $IND_INDIRECTION, %cl + jz is_done + movq %rcx, %rbx + andq $PAGE_MASK, %rbx + jmp next_entry +is_done: + testb $IND_DONE, %cl + jnz done +is_source: + testb $IND_SOURCE, %cl + jz is_zero + movq %rcx, %rsi /* For every source page do a copy */ + andq $PAGE_MASK, %rsi + movl $(PAGE_SIZE / 8), %ecx + rep movsq + jmp next_entry +is_zero: + testb $IND_ZERO, %cl + jz next_entry + movl $(PAGE_SIZE / 8), %ecx /* Zero the destination page. */ + xorl %eax, %eax + rep stosq + jmp next_entry +done: + popq %rbx + ret + + .code32 + +compatibility_mode: + /* Setup some sane segments. */ + movl $0x0008, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + + /* Disable paging and therefore leave 64 bit mode. */ + movl %cr0, %eax + andl $~X86_CR0_PG, %eax + movl %eax, %cr0 + + /* Disable long mode */ + movl $MSR_EFER, %ecx + rdmsr + andl $~EFER_LME, %eax + wrmsr + + /* Clear cr4 to disable PAE. */ + xorl %eax, %eax + movl %eax, %cr4 + + /* Call the image entry point. This should never return. */ + call *%ebp + ud2 + + .align 4 +compatibility_mode_far: + .long 0x00000000 /* set in call_32_bit above */ + .word 0x0010 + +compat_mode_gdt_desc: + .word (3*8)-1 + .quad 0x0000000000000000 /* set in call_32_bit above */ + + .align 8 +compat_mode_gdt: + .quad 0x0000000000000000 /* null */ + .quad 0x00cf92000000ffff /* 0x0008 ring 0 data */ + .quad 0x00cf9a000000ffff /* 0x0010 ring 0 code, compatibility */ + +compat_mode_idt: + .word 0 /* limit */ + .long 0 /* base */ + + /* + * 16 words of stack are more than enough. 
+ */ + .fill 16,8,0 +reloc_stack: + + .globl kexec_reloc_size +kexec_reloc_size: + .long . - kexec_reloc diff --git a/xen/common/kexec.c b/xen/common/kexec.c index 7b23df0..c5450ba 100644 --- a/xen/common/kexec.c +++ b/xen/common/kexec.c @@ -25,6 +25,7 @@ #include <xen/version.h> #include <xen/console.h> #include <xen/kexec.h> +#include <xen/kimage.h> #include <public/elfnote.h> #include <xsm/xsm.h> #include <xen/cpu.h> @@ -47,7 +48,7 @@ static Elf_Note *xen_crash_note; static cpumask_t crash_saved_cpus; -static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR]; +static struct kexec_image *kexec_image[KEXEC_IMAGE_NR]; #define KEXEC_FLAG_DEFAULT_POS (KEXEC_IMAGE_NR + 0) #define KEXEC_FLAG_CRASH_POS (KEXEC_IMAGE_NR + 1) @@ -55,8 +56,6 @@ static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR]; static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */ -static spinlock_t kexec_lock = SPIN_LOCK_UNLOCKED; - static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; static size_t vmcoreinfo_size = 0; @@ -311,14 +310,14 @@ void kexec_crash(void) kexec_common_shutdown(); kexec_crash_save_cpu(); machine_crash_shutdown(); - machine_kexec(&kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]); + machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]); BUG(); } static long kexec_reboot(void *_image) { - xen_kexec_image_t *image = _image; + struct kexec_image *image = _image; kexecing = TRUE; @@ -734,63 +733,264 @@ static void crash_save_vmcoreinfo(void) #endif } -static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_v1_t *load) +static void kexec_unload_image(struct kexec_image *image) { - xen_kexec_image_t *image; + if ( !image ) + return; + + machine_kexec_unload(image); + kimage_free(image); +} + +static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg) +{ + xen_kexec_exec_t exec; + struct kexec_image *image; + int base, bit, pos, ret = -EINVAL; + + if ( unlikely(copy_from_guest(&exec, uarg, 1)) ) + return -EFAULT; + + if ( 
kexec_load_get_bits(exec.type, &base, &bit) ) + return -EINVAL; + + pos = (test_bit(bit, &kexec_flags) != 0); + + /* Only allow kexec/kdump into loaded images */ + if ( !test_bit(base + pos, &kexec_flags) ) + return -ENOENT; + + switch (exec.type) + { + case KEXEC_TYPE_DEFAULT: + image = kexec_image[base + pos]; + ret = continue_hypercall_on_cpu(0, kexec_reboot, image); + break; + case KEXEC_TYPE_CRASH: + kexec_crash(); /* Does not return */ + break; + } + + return -EINVAL; /* never reached */ +} + +static int kexec_swap_images(int type, struct kexec_image *new, + struct kexec_image **old) +{ + static DEFINE_SPINLOCK(kexec_lock); int base, bit, pos; - int ret = 0; + int new_slot, old_slot; + + *old = NULL; + + spin_lock(&kexec_lock); + + if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) ) + { + spin_unlock(&kexec_lock); + return -EBUSY; + } - if ( kexec_load_get_bits(load->type, &base, &bit) ) + if ( kexec_load_get_bits(type, &base, &bit) ) return -EINVAL; pos = (test_bit(bit, &kexec_flags) != 0); + old_slot = base + pos; + new_slot = base + !pos; - /* Load the user data into an unused image */ - if ( op == KEXEC_CMD_kexec_load ) + if ( new ) { - image = &kexec_image[base + !pos]; + kexec_image[new_slot] = new; + set_bit(new_slot, &kexec_flags); + } + change_bit(bit, &kexec_flags); - BUG_ON(test_bit((base + !pos), &kexec_flags)); /* must be free */ + clear_bit(old_slot, &kexec_flags); + *old = kexec_image[old_slot]; - memcpy(image, &load->image, sizeof(*image)); + spin_unlock(&kexec_lock); - if ( !(ret = machine_kexec_load(load->type, base + !pos, image)) ) - { - /* Set image present bit */ - set_bit((base + !pos), &kexec_flags); + return 0; +} - /* Make new image the active one */ - change_bit(bit, &kexec_flags); - } +static int kexec_load_slot(struct kexec_image *kimage) +{ + struct kexec_image *old_kimage; + int ret = -ENOMEM; + + ret = machine_kexec_load(kimage); + if ( ret < 0 ) + return ret; + + crash_save_vmcoreinfo(); + + ret = 
kexec_swap_images(kimage->type, kimage, &old_kimage); + if ( ret < 0 ) + return ret; + + kexec_unload_image(old_kimage); + + return 0; +} + +static uint16_t kexec_load_v1_arch(void) +{ +#ifdef CONFIG_X86 + return is_pv_32on64_domain(dom0) ? EM_386 : EM_X86_64; +#else + return EM_NONE; +#endif +} - crash_save_vmcoreinfo(); +static int kexec_segments_add_segment( + unsigned int *nr_segments, xen_kexec_segment_t *segments, + unsigned long mfn) +{ + paddr_t maddr = (paddr_t)mfn << PAGE_SHIFT; + unsigned int n = *nr_segments; + + /* Need a new segment? */ + if ( n == 0 + || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr ) + { + n++; + if ( n > KEXEC_SEGMENT_MAX ) + return -EINVAL; + *nr_segments = n; + + set_xen_guest_handle(segments[n-1].buf.h, NULL); + segments[n-1].buf_size = 0; + segments[n-1].dest_maddr = maddr; + segments[n-1].dest_size = 0; } - /* Unload the old image if present and load successful */ - if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) ) + return 0; +} + +static int kexec_segments_from_ind_page(unsigned long mfn, + unsigned int *nr_segments, + xen_kexec_segment_t *segments, + bool_t compat) +{ + void *page; + kimage_entry_t *entry; + int ret = 0; + + page = map_domain_page(mfn); + + /* + * Walk the indirection page list, adding destination pages to the + * segments. 
+ */ + for ( entry = page; ; ) { - if ( test_and_clear_bit((base + pos), &kexec_flags) ) + unsigned long ind; + + ind = kimage_entry_ind(entry, compat); + mfn = kimage_entry_mfn(entry, compat); + + switch ( ind ) { - image = &kexec_image[base + pos]; - machine_kexec_unload(load->type, base + pos, image); + case IND_DESTINATION: + ret = kexec_segments_add_segment(nr_segments, segments, mfn); + if ( ret < 0 ) + goto done; + break; + case IND_INDIRECTION: + unmap_domain_page(page); + entry = page = map_domain_page(mfn); + continue; + case IND_DONE: + goto done; + case IND_SOURCE: + if ( *nr_segments == 0 ) + { + ret = -EINVAL; + goto done; + } + segments[*nr_segments-1].dest_size += PAGE_SIZE; + break; + default: + ret = -EINVAL; + goto done; } + entry = kimage_entry_next(entry, compat); } +done: + unmap_domain_page(page); + return ret; +} +static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat) +{ + struct kexec_image *kimage = NULL; + xen_kexec_segment_t *segments; + uint16_t arch; + unsigned int nr_segments = 0; + unsigned long ind_mfn = load->image.indirection_page >> PAGE_SHIFT; + int ret; + + arch = kexec_load_v1_arch(); + if ( arch == EM_NONE ) + return -ENOSYS; + + segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX); + if ( segments == NULL ) + return -ENOMEM; + + /* + * Work out the image segments (destination only) from the + * indirection pages. + * + * This is needed so we don''t allocate pages that will overlap + * with the destination when building the new set of indirection + * pages below. + */ + ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat); + if ( ret < 0 ) + goto error; + + ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address, + nr_segments, segments); + if ( ret < 0 ) + goto error; + + /* + * Build a new set of indirection pages in the native format. + * + * This walks the guest provided indirection pages a second time. 
+ * The guest could have altered then, invalidating the segment + * information constructed above. This will only result in the + * resulting image being potentially unrelocatable. + */ + ret = kimage_build_ind(kimage, ind_mfn, compat); + if ( ret < 0 ) + goto error; + + ret = kexec_load_slot(kimage); + if ( ret < 0 ) + goto error; + + return 0; + +error: + if ( !kimage ) + xfree(segments); + kimage_free(kimage); return ret; } -static int kexec_load_unload(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg) +static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) { xen_kexec_load_v1_t load; if ( unlikely(copy_from_guest(&load, uarg, 1)) ) return -EFAULT; - return kexec_load_unload_internal(op, &load); + return kexec_do_load_v1(&load, 0); } -static int kexec_load_unload_compat(unsigned long op, - XEN_GUEST_HANDLE_PARAM(void) uarg) +static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) { #ifdef CONFIG_COMPAT compat_kexec_load_v1_t compat_load; @@ -809,49 +1009,113 @@ static int kexec_load_unload_compat(unsigned long op, load.type = compat_load.type; XLAT_kexec_image(&load.image, &compat_load.image); - return kexec_load_unload_internal(op, &load); -#else /* CONFIG_COMPAT */ + return kexec_do_load_v1(&load, 1); +#else return 0; -#endif /* CONFIG_COMPAT */ +#endif } -static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg) +static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg) { - xen_kexec_exec_t exec; - xen_kexec_image_t *image; - int base, bit, pos, ret = -EINVAL; + xen_kexec_load_t load; + xen_kexec_segment_t *segments; + struct kexec_image *kimage = NULL; + int ret; - if ( unlikely(copy_from_guest(&exec, uarg, 1)) ) + if ( copy_from_guest(&load, uarg, 1) ) return -EFAULT; - if ( kexec_load_get_bits(exec.type, &base, &bit) ) + if ( load.nr_segments >= KEXEC_SEGMENT_MAX ) return -EINVAL; - pos = (test_bit(bit, &kexec_flags) != 0); - - /* Only allow kexec/kdump into loaded images */ - if ( !test_bit(base + pos, &kexec_flags) ) - return -ENOENT; + 
segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments); + if ( segments == NULL ) + return -ENOMEM; - switch (exec.type) + if ( copy_from_guest(segments, load.segments.h, load.nr_segments) ) { - case KEXEC_TYPE_DEFAULT: - image = &kexec_image[base + pos]; - ret = continue_hypercall_on_cpu(0, kexec_reboot, image); - break; - case KEXEC_TYPE_CRASH: - kexec_crash(); /* Does not return */ - break; + ret = -EFAULT; + goto error; } - return -EINVAL; /* never reached */ + ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr, + load.nr_segments, segments); + if ( ret < 0 ) + goto error; + + ret = kimage_load_segments(kimage); + if ( ret < 0 ) + goto error; + + ret = kexec_load_slot(kimage); + if ( ret < 0 ) + goto error; + + return 0; + +error: + if ( ! kimage ) + xfree(segments); + kimage_free(kimage); + return ret; +} + +static int kexec_do_unload(xen_kexec_unload_t *unload) +{ + struct kexec_image *old_kimage; + int ret; + + ret = kexec_swap_images(unload->type, NULL, &old_kimage); + if ( ret < 0 ) + return ret; + + kexec_unload_image(old_kimage); + + return 0; +} + +static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) +{ + xen_kexec_load_v1_t load; + xen_kexec_unload_t unload; + + if ( copy_from_guest(&load, uarg, 1) ) + return -EFAULT; + + unload.type = load.type; + return kexec_do_unload(&unload); +} + +static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) +{ +#ifdef CONFIG_COMPAT + compat_kexec_load_v1_t compat_load; + xen_kexec_unload_t unload; + + if ( copy_from_guest(&compat_load, uarg, 1) ) + return -EFAULT; + + unload.type = compat_load.type; + return kexec_do_unload(&unload); +#else + return 0; +#endif +} + +static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg) +{ + xen_kexec_unload_t unload; + + if ( unlikely(copy_from_guest(&unload, uarg, 1)) ) + return -EFAULT; + + return kexec_do_unload(&unload); } static int do_kexec_op_internal(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg, bool_t compat) { - 
unsigned long flags; int ret = -EINVAL; ret = xsm_kexec(XSM_PRIV); @@ -867,20 +1131,26 @@ static int do_kexec_op_internal(unsigned long op, ret = kexec_get_range(uarg); break; case KEXEC_CMD_kexec_load_v1: + if ( compat ) + ret = kexec_load_v1_compat(uarg); + else + ret = kexec_load_v1(uarg); + break; case KEXEC_CMD_kexec_unload_v1: - spin_lock_irqsave(&kexec_lock, flags); - if (!test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags)) - { - if (compat) - ret = kexec_load_unload_compat(op, uarg); - else - ret = kexec_load_unload(op, uarg); - } - spin_unlock_irqrestore(&kexec_lock, flags); + if ( compat ) + ret = kexec_unload_v1_compat(uarg); + else + ret = kexec_unload_v1(uarg); break; case KEXEC_CMD_kexec: ret = kexec_exec(uarg); break; + case KEXEC_CMD_kexec_load: + ret = kexec_load(uarg); + break; + case KEXEC_CMD_kexec_unload: + ret = kexec_unload(uarg); + break; } return ret; diff --git a/xen/common/kimage.c b/xen/common/kimage.c index 02ee37e..10fb785 100644 --- a/xen/common/kimage.c +++ b/xen/common/kimage.c @@ -175,11 +175,20 @@ static int do_kimage_alloc(struct kexec_image **rimage, paddr_t entry, image->control_code_page = kimage_alloc_control_page(image, MEMF_bits(32)); if ( !image->control_code_page ) goto out; + result = machine_kexec_add_page(image, + page_to_maddr(image->control_code_page), + page_to_maddr(image->control_code_page)); + if ( result < 0 ) + goto out; /* Add an empty indirection page. 
*/ image->entry_page = kimage_alloc_control_page(image, 0); if ( !image->entry_page ) goto out; + result = machine_kexec_add_page(image, page_to_maddr(image->entry_page), + page_to_maddr(image->entry_page)); + if ( result < 0 ) + goto out; image->head = page_to_maddr(image->entry_page); @@ -595,7 +604,7 @@ static struct page_info *kimage_alloc_page(struct kexec_image *image, if ( addr == destination ) { page_list_del(page, &image->dest_pages); - return page; + goto found; } } page = NULL; @@ -647,6 +656,8 @@ static struct page_info *kimage_alloc_page(struct kexec_image *image, page_list_add(page, &image->dest_pages); } } +found: + machine_kexec_add_page(image, page_to_maddr(page), page_to_maddr(page)); return page; } @@ -753,6 +764,7 @@ static int kimage_load_crash_segment(struct kexec_image *image, static int kimage_load_segment(struct kexec_image *image, xen_kexec_segment_t *segment) { int result = -ENOMEM; + paddr_t addr; if ( !guest_handle_is_null(segment->buf.h) ) { @@ -767,6 +779,14 @@ static int kimage_load_segment(struct kexec_image *image, xen_kexec_segment_t *s } } + for ( addr = segment->dest_maddr & PAGE_MASK; + addr < segment->dest_maddr + segment->dest_size; addr += PAGE_SIZE ) + { + result = machine_kexec_add_page(image, addr, addr); + if ( result < 0 ) + break; + } + return result; } @@ -810,6 +830,106 @@ int kimage_load_segments(struct kexec_image *image) return 0; } +kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat) +{ + if ( compat ) + return (kimage_entry_t *)((uint32_t *)entry + 1); + return entry + 1; +} + +unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat) +{ + if ( compat ) + return *(uint32_t *)entry >> PAGE_SHIFT; + return *entry >> PAGE_SHIFT; +} + +unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat) +{ + if ( compat ) + return *(uint32_t *)entry & 0xf; + return *entry & 0xf; +} + +int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, + bool_t compat) +{ + void 
*page; + kimage_entry_t *entry; + int ret = 0; + paddr_t dest = KIMAGE_NO_DEST; + + page = map_domain_page(ind_mfn); + if ( !page ) + return -ENOMEM; + + /* + * Walk the guest-supplied indirection pages, adding entries to + * the image''s indirection pages. + */ + for ( entry = page; ; ) + { + unsigned long ind; + unsigned long mfn; + + ind = kimage_entry_ind(entry, compat); + mfn = kimage_entry_mfn(entry, compat); + + switch ( ind ) + { + case IND_DESTINATION: + dest = (paddr_t)mfn << PAGE_SHIFT; + ret = kimage_set_destination(image, dest); + if ( ret < 0 ) + goto done; + break; + case IND_INDIRECTION: + unmap_domain_page(page); + page = map_domain_page(mfn); + entry = page; + continue; + case IND_DONE: + kimage_terminate(image); + goto done; + case IND_SOURCE: + { + struct page_info *guest_page, *xen_page; + + guest_page = mfn_to_page(mfn); + if ( !get_page(guest_page, current->domain) ) + { + ret = -EFAULT; + goto done; + } + + xen_page = kimage_alloc_page(image, dest); + if ( !xen_page ) + { + put_page(guest_page); + ret = -ENOMEM; + goto done; + } + + copy_domain_page(page_to_mfn(xen_page), mfn); + put_page(guest_page); + + ret = kimage_add_page(image, page_to_maddr(xen_page)); + if ( ret < 0 ) + goto done; + dest += PAGE_SIZE; + break; + } + default: + ret = -EINVAL; + goto done; + } + entry = kimage_entry_next(entry, compat); + } +done: + unmap_domain_page(page); + return ret; +} + /* * Local variables: * mode: C diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h index 8b4266d..48c5676 100644 --- a/xen/include/asm-x86/fixmap.h +++ b/xen/include/asm-x86/fixmap.h @@ -56,9 +56,6 @@ enum fixed_addresses { FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, FIX_HPET_BASE, - FIX_KEXEC_BASE_0, - FIX_KEXEC_BASE_END = FIX_KEXEC_BASE_0 \ - + ((KEXEC_XEN_NO_PAGES >> 1) * KEXEC_IMAGE_NR) - 1, FIX_TBOOT_SHARED_BASE, FIX_MSIX_IO_RESERV_BASE, FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1, diff --git 
a/xen/include/asm-x86/machine_kexec.h b/xen/include/asm-x86/machine_kexec.h new file mode 100644 index 0000000..ba0d469 --- /dev/null +++ b/xen/include/asm-x86/machine_kexec.h @@ -0,0 +1,16 @@ +#ifndef __X86_MACHINE_KEXEC_H__ +#define __X86_MACHINE_KEXEC_H__ + +#define KEXEC_RELOC_FLAG_COMPAT 0x1 /* 32-bit image */ + +#ifndef __ASSEMBLY__ + +extern void kexec_reloc(unsigned long reloc_code, unsigned long reloc_pt, + unsigned long ind_maddr, unsigned long entry_maddr, + unsigned long flags); + +extern unsigned int kexec_reloc_size; + +#endif + +#endif /* __X86_MACHINE_KEXEC_H__ */ diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h index 1a5dda1..bd17747 100644 --- a/xen/include/xen/kexec.h +++ b/xen/include/xen/kexec.h @@ -6,6 +6,7 @@ #include <public/kexec.h> #include <asm/percpu.h> #include <xen/elfcore.h> +#include <xen/kimage.h> typedef struct xen_kexec_reserve { unsigned long size; @@ -40,11 +41,13 @@ extern enum low_crashinfo low_crashinfo_mode; extern paddr_t crashinfo_maxaddr_bits; void kexec_early_calculations(void); -int machine_kexec_load(int type, int slot, xen_kexec_image_t *image); -void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image); +int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, + unsigned long maddr); +int machine_kexec_load(struct kexec_image *image); +void machine_kexec_unload(struct kexec_image *image); void machine_kexec_reserved(xen_kexec_reserve_t *reservation); -void machine_reboot_kexec(xen_kexec_image_t *image); -void machine_kexec(xen_kexec_image_t *image); +void machine_reboot_kexec(struct kexec_image *image); +void machine_kexec(struct kexec_image *image); void kexec_crash(void); void kexec_crash_save_cpu(void); crash_xen_info_t *kexec_crash_save_info(void); @@ -52,11 +55,6 @@ void machine_crash_shutdown(void); int machine_kexec_get(xen_kexec_range_t *range); int machine_kexec_get_xen(xen_kexec_range_t *range); -void compat_machine_kexec(unsigned long rnk, - unsigned long 
indirection_page, - unsigned long *page_list, - unsigned long start_address); - /* vmcoreinfo stuff */ #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" diff --git a/xen/include/xen/kimage.h b/xen/include/xen/kimage.h index 0ebd37a..d10ebf7 100644 --- a/xen/include/xen/kimage.h +++ b/xen/include/xen/kimage.h @@ -47,6 +47,12 @@ int kimage_load_segments(struct kexec_image *image); struct page_info *kimage_alloc_control_page(struct kexec_image *image, unsigned memflags); +kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat); +unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat); +unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat); +int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, + bool_t compat); + #endif /* __ASSEMBLY__ */ #endif /* __XEN_KIMAGE_H__ */ -- 1.7.2.5
Don Slutz
2013-Nov-07 20:56 UTC
Re: [PATCH 4/9] kexec: extend hypercall with improved load/unload ops
For what it is worth. Reviewed-by: Don Slutz <dslutz@verizon.com> -Don Slutz On 11/06/13 09:49, David Vrabel wrote: > From: David Vrabel <david.vrabel@citrix.com> > > In the existing kexec hypercall, the load and unload ops depend on > internals of the Linux kernel (the page list and code page provided by > the kernel). The code page is used to transition between Xen context > and the image so using kernel code doesn't make sense and will not > work for PVH guests. > > Add replacement KEXEC_CMD_kexec_load and KEXEC_CMD_kexec_unload ops > that no longer require a code page to be provided by the guest -- Xen > now provides the code for calling the image directly. > > The new load op looks similar to the Linux kexec_load system call and > allows the guest to provide the image data to be loaded. The guest > specifies the architecture of the image which may be a 32-bit subarch > of the hypervisor's architecture (i.e., an EM_386 image on an > EM_X86_64 hypervisor). > > The toolstack can now load images without kernel involvement. This is > required for supporting kexec when using a dom0 with an upstream > kernel. > > Crash images are copied directly into the crash region on load. > Default images are copied into domheap pages and a list of source and > destination machine addresses is created. This list is used in > kexec_reloc() to relocate the image to its destination. > > The old load and unload sub-ops are still available (as > KEXEC_CMD_load_v1 and KEXEC_CMD_unload_v1) and are implemented on top > of the new infrastructure. 
> > Signed-off-by: David Vrabel <david.vrabel@citrix.com> > Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com> > --- > xen/arch/x86/machine_kexec.c | 192 +++++++++++------ > xen/arch/x86/x86_64/Makefile | 2 +- > xen/arch/x86/x86_64/compat_kexec.S | 187 ---------------- > xen/arch/x86/x86_64/kexec_reloc.S | 198 +++++++++++++++++ > xen/common/kexec.c | 398 +++++++++++++++++++++++++++++------ > xen/common/kimage.c | 122 +++++++++++- > xen/include/asm-x86/fixmap.h | 3 - > xen/include/asm-x86/machine_kexec.h | 16 ++ > xen/include/xen/kexec.h | 16 +- > xen/include/xen/kimage.h | 6 + > 10 files changed, 804 insertions(+), 336 deletions(-) > delete mode 100644 xen/arch/x86/x86_64/compat_kexec.S > create mode 100644 xen/arch/x86/x86_64/kexec_reloc.S > create mode 100644 xen/include/asm-x86/machine_kexec.h > > diff --git a/xen/arch/x86/machine_kexec.c b/xen/arch/x86/machine_kexec.c > index 68b9705..b70d5a6 100644 > --- a/xen/arch/x86/machine_kexec.c > +++ b/xen/arch/x86/machine_kexec.c > @@ -1,9 +1,18 @@ > /****************************************************************************** > * machine_kexec.c > * > + * Copyright (C) 2013 Citrix Systems R&D Ltd. > + * > + * Portions derived from Linux''s arch/x86/kernel/machine_kexec_64.c. > + * > + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> > + * > * Xen port written by: > * - Simon ''Horms'' Horman <horms@verge.net.au> > * - Magnus Damm <magnus@valinux.co.jp> > + * > + * This source code is licensed under the GNU General Public License, > + * Version 2. See the file COPYING for more details. 
> */ > > #include <xen/types.h> > @@ -11,63 +20,124 @@ > #include <xen/guest_access.h> > #include <asm/fixmap.h> > #include <asm/hpet.h> > +#include <asm/page.h> > +#include <asm/machine_kexec.h> > > -typedef void (*relocate_new_kernel_t)( > - unsigned long indirection_page, > - unsigned long *page_list, > - unsigned long start_address, > - unsigned int preserve_context); > - > -int machine_kexec_load(int type, int slot, xen_kexec_image_t *image) > +/* > + * Add a mapping for a page to the page tables used during kexec. > + */ > +int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, > + unsigned long maddr) > { > - unsigned long prev_ma = 0; > - int fix_base = FIX_KEXEC_BASE_0 + (slot * (KEXEC_XEN_NO_PAGES >> 1)); > - int k; > + struct page_info *l4_page; > + struct page_info *l3_page; > + struct page_info *l2_page; > + struct page_info *l1_page; > + l4_pgentry_t *l4 = NULL; > + l3_pgentry_t *l3 = NULL; > + l2_pgentry_t *l2 = NULL; > + l1_pgentry_t *l1 = NULL; > + int ret = -ENOMEM; > + > + l4_page = image->aux_page; > + if ( !l4_page ) > + { > + l4_page = kimage_alloc_control_page(image, 0); > + if ( !l4_page ) > + goto out; > + image->aux_page = l4_page; > + } > > - /* setup fixmap to point to our pages and record the virtual address > - * in every odd index in page_list[]. 
> - */ > + l4 = __map_domain_page(l4_page); > + l4 += l4_table_offset(vaddr); > + if ( !(l4e_get_flags(*l4) & _PAGE_PRESENT) ) > + { > + l3_page = kimage_alloc_control_page(image, 0); > + if ( !l3_page ) > + goto out; > + l4e_write(l4, l4e_from_page(l3_page, __PAGE_HYPERVISOR)); > + } > + else > + l3_page = l4e_get_page(*l4); > + > + l3 = __map_domain_page(l3_page); > + l3 += l3_table_offset(vaddr); > + if ( !(l3e_get_flags(*l3) & _PAGE_PRESENT) ) > + { > + l2_page = kimage_alloc_control_page(image, 0); > + if ( !l2_page ) > + goto out; > + l3e_write(l3, l3e_from_page(l2_page, __PAGE_HYPERVISOR)); > + } > + else > + l2_page = l3e_get_page(*l3); > + > + l2 = __map_domain_page(l2_page); > + l2 += l2_table_offset(vaddr); > + if ( !(l2e_get_flags(*l2) & _PAGE_PRESENT) ) > + { > + l1_page = kimage_alloc_control_page(image, 0); > + if ( !l1_page ) > + goto out; > + l2e_write(l2, l2e_from_page(l1_page, __PAGE_HYPERVISOR)); > + } > + else > + l1_page = l2e_get_page(*l2); > + > + l1 = __map_domain_page(l1_page); > + l1 += l1_table_offset(vaddr); > + l1e_write(l1, l1e_from_pfn(maddr >> PAGE_SHIFT, __PAGE_HYPERVISOR)); > + > + ret = 0; > +out: > + if ( l1 ) > + unmap_domain_page(l1); > + if ( l2 ) > + unmap_domain_page(l2); > + if ( l3 ) > + unmap_domain_page(l3); > + if ( l4 ) > + unmap_domain_page(l4); > + return ret; > +} > > - for ( k = 0; k < KEXEC_XEN_NO_PAGES; k++ ) > +int machine_kexec_load(struct kexec_image *image) > +{ > + void *code_page; > + int ret; > + > + switch ( image->arch ) > { > - if ( (k & 1) == 0 ) > - { > - /* Even pages: machine address. */ > - prev_ma = image->page_list[k]; > - } > - else > - { > - /* Odd pages: va for previous ma. */ > - if ( is_pv_32on64_domain(dom0) ) > - { > - /* > - * The compatability bounce code sets up a page table > - * with a 1-1 mapping of the first 1G of memory so > - * VA==PA here. 
> - * > - * This Linux purgatory code still sets up separate > - * high and low mappings on the control page (entries > - * 0 and 1) but it is harmless if they are equal since > - * that PT is not live at the time. > - */ > - image->page_list[k] = prev_ma; > - } > - else > - { > - set_fixmap(fix_base + (k >> 1), prev_ma); > - image->page_list[k] = fix_to_virt(fix_base + (k >> 1)); > - } > - } > + case EM_386: > + case EM_X86_64: > + break; > + default: > + return -EINVAL; > } > > + code_page = __map_domain_page(image->control_code_page); > + memcpy(code_page, kexec_reloc, kexec_reloc_size); > + unmap_domain_page(code_page); > + > + /* > + * Add a mapping for the control code page to the same virtual > + * address as kexec_reloc. This allows us to keep running after > + * these page tables are loaded in kexec_reloc. > + */ > + ret = machine_kexec_add_page(image, (unsigned long)kexec_reloc, > + page_to_maddr(image->control_code_page)); > + if ( ret < 0 ) > + return ret; > + > return 0; > } > > -void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image) > +void machine_kexec_unload(struct kexec_image *image) > { > + /* no-op. kimage_free() frees all control pages. */ > } > > -void machine_reboot_kexec(xen_kexec_image_t *image) > +void machine_reboot_kexec(struct kexec_image *image) > { > BUG_ON(smp_processor_id() != 0); > smp_send_stop(); > @@ -75,13 +145,10 @@ void machine_reboot_kexec(xen_kexec_image_t *image) > BUG(); > } > > -void machine_kexec(xen_kexec_image_t *image) > +void machine_kexec(struct kexec_image *image) > { > - struct desc_ptr gdt_desc = { > - .base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY), > - .limit = LAST_RESERVED_GDT_BYTE > - }; > int i; > + unsigned long reloc_flags = 0; > > /* We are about to permenantly jump out of the Xen context into the kexec > * purgatory code. We really dont want to be still servicing interupts. 
> @@ -109,29 +176,12 @@ void machine_kexec(xen_kexec_image_t *image) > * not like running with NMIs disabled. */ > enable_nmis(); > > - /* > - * compat_machine_kexec() returns to idle pagetables, which requires us > - * to be running on a static GDT mapping (idle pagetables have no GDT > - * mappings in their per-domain mapping area). > - */ > - asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); > + if ( image->arch == EM_386 ) > + reloc_flags |= KEXEC_RELOC_FLAG_COMPAT; > > - if ( is_pv_32on64_domain(dom0) ) > - { > - compat_machine_kexec(image->page_list[1], > - image->indirection_page, > - image->page_list, > - image->start_address); > - } > - else > - { > - relocate_new_kernel_t rnk; > - > - rnk = (relocate_new_kernel_t) image->page_list[1]; > - (*rnk)(image->indirection_page, image->page_list, > - image->start_address, > - 0 /* preserve_context */); > - } > + kexec_reloc(page_to_maddr(image->control_code_page), > + page_to_maddr(image->aux_page), > + image->head, image->entry_maddr, reloc_flags); > } > > int machine_kexec_get(xen_kexec_range_t *range) > diff --git a/xen/arch/x86/x86_64/Makefile b/xen/arch/x86/x86_64/Makefile > index d56e12d..7f8fb3d 100644 > --- a/xen/arch/x86/x86_64/Makefile > +++ b/xen/arch/x86/x86_64/Makefile > @@ -11,11 +11,11 @@ obj-y += mmconf-fam10h.o > obj-y += mmconfig_64.o > obj-y += mmconfig-shared.o > obj-y += compat.o > -obj-bin-y += compat_kexec.o > obj-y += domain.o > obj-y += physdev.o > obj-y += platform_hypercall.o > obj-y += cpu_idle.o > obj-y += cpufreq.o > +obj-bin-y += kexec_reloc.o > > obj-$(crash_debug) += gdbstub.o > diff --git a/xen/arch/x86/x86_64/compat_kexec.S b/xen/arch/x86/x86_64/compat_kexec.S > deleted file mode 100644 > index fc92af9..0000000 > --- a/xen/arch/x86/x86_64/compat_kexec.S > +++ /dev/null > @@ -1,187 +0,0 @@ > -/* > - * Compatibility kexec handler. > - */ > - > -/* > - * NOTE: We rely on Xen not relocating itself above the 4G boundary. 
This is > - * currently true but if it ever changes then compat_pg_table will > - * need to be moved back below 4G at run time. > - */ > - > -#include <xen/config.h> > - > -#include <asm/asm_defns.h> > -#include <asm/msr.h> > -#include <asm/page.h> > - > -/* The unrelocated physical address of a symbol. */ > -#define SYM_PHYS(sym) ((sym) - __XEN_VIRT_START) > - > -/* Load physical address of symbol into register and relocate it. */ > -#define RELOCATE_SYM(sym,reg) mov $SYM_PHYS(sym), reg ; \ > - add xen_phys_start(%rip), reg > - > -/* > - * Relocate a physical address in memory. Size of temporary register > - * determines size of the value to relocate. > - */ > -#define RELOCATE_MEM(addr,reg) mov addr(%rip), reg ; \ > - add xen_phys_start(%rip), reg ; \ > - mov reg, addr(%rip) > - > - .text > - > - .code64 > - > -ENTRY(compat_machine_kexec) > - /* x86/64 x86/32 */ > - /* %rdi - relocate_new_kernel_t CALL */ > - /* %rsi - indirection page 4(%esp) */ > - /* %rdx - page_list 8(%esp) */ > - /* %rcx - start address 12(%esp) */ > - /* cpu has pae 16(%esp) */ > - > - /* Shim the 64 bit page_list into a 32 bit page_list. */ > - mov $12,%r9 > - lea compat_page_list(%rip), %rbx > -1: dec %r9 > - movl (%rdx,%r9,8),%eax > - movl %eax,(%rbx,%r9,4) > - test %r9,%r9 > - jnz 1b > - > - RELOCATE_SYM(compat_page_list,%rdx) > - > - /* Relocate compatibility mode entry point address. */ > - RELOCATE_MEM(compatibility_mode_far,%eax) > - > - /* Relocate compat_pg_table. */ > - RELOCATE_MEM(compat_pg_table, %rax) > - RELOCATE_MEM(compat_pg_table+0x8, %rax) > - RELOCATE_MEM(compat_pg_table+0x10,%rax) > - RELOCATE_MEM(compat_pg_table+0x18,%rax) > - > - /* > - * Setup an identity mapped region in PML4[0] of idle page > - * table. > - */ > - RELOCATE_SYM(l3_identmap,%rax) > - or $0x63,%rax > - mov %rax, idle_pg_table(%rip) > - > - /* Switch to idle page table. */ > - RELOCATE_SYM(idle_pg_table,%rax) > - movq %rax, %cr3 > - > - /* Switch to identity mapped compatibility stack. 
*/ > - RELOCATE_SYM(compat_stack,%rax) > - movq %rax, %rsp > - > - /* Save xen_phys_start for 32 bit code. */ > - movq xen_phys_start(%rip), %rbx > - > - /* Jump to low identity mapping in compatibility mode. */ > - ljmp *compatibility_mode_far(%rip) > - ud2 > - > -compatibility_mode_far: > - .long SYM_PHYS(compatibility_mode) > - .long __HYPERVISOR_CS32 > - > - /* > - * We use 5 words of stack for the arguments passed to the kernel. The > - * kernel only uses 1 word before switching to its own stack. Allocate > - * 16 words to give "plenty" of room. > - */ > - .fill 16,4,0 > -compat_stack: > - > - .code32 > - > -#undef RELOCATE_SYM > -#undef RELOCATE_MEM > - > -/* > - * Load physical address of symbol into register and relocate it. %rbx > - * contains xen_phys_start(%rip) saved before jump to compatibility > - * mode. > - */ > -#define RELOCATE_SYM(sym,reg) mov $SYM_PHYS(sym), reg ; \ > - add %ebx, reg > - > -compatibility_mode: > - /* Setup some sane segments. */ > - movl $__HYPERVISOR_DS32, %eax > - movl %eax, %ds > - movl %eax, %es > - movl %eax, %fs > - movl %eax, %gs > - movl %eax, %ss > - > - /* Push arguments onto stack. */ > - pushl $0 /* 20(%esp) - preserve context */ > - pushl $1 /* 16(%esp) - cpu has pae */ > - pushl %ecx /* 12(%esp) - start address */ > - pushl %edx /* 8(%esp) - page list */ > - pushl %esi /* 4(%esp) - indirection page */ > - pushl %edi /* 0(%esp) - CALL */ > - > - /* Disable paging and therefore leave 64 bit mode. */ > - movl %cr0, %eax > - andl $~X86_CR0_PG, %eax > - movl %eax, %cr0 > - > - /* Switch to 32 bit page table. */ > - RELOCATE_SYM(compat_pg_table, %eax) > - movl %eax, %cr3 > - > - /* Clear MSR_EFER[LME], disabling long mode */ > - movl $MSR_EFER,%ecx > - rdmsr > - btcl $_EFER_LME,%eax > - wrmsr > - > - /* Re-enable paging, but only 32 bit mode now. 
*/ > - movl %cr0, %eax > - orl $X86_CR0_PG, %eax > - movl %eax, %cr0 > - jmp 1f > -1: > - > - popl %eax > - call *%eax > - ud2 > - > - .data > - .align 4 > -compat_page_list: > - .fill 12,4,0 > - > - .align 32,0 > - > - /* > - * These compat page tables contain an identity mapping of the > - * first 4G of the physical address space. > - */ > -compat_pg_table: > - .long SYM_PHYS(compat_pg_table_l2) + 0*PAGE_SIZE + 0x01, 0 > - .long SYM_PHYS(compat_pg_table_l2) + 1*PAGE_SIZE + 0x01, 0 > - .long SYM_PHYS(compat_pg_table_l2) + 2*PAGE_SIZE + 0x01, 0 > - .long SYM_PHYS(compat_pg_table_l2) + 3*PAGE_SIZE + 0x01, 0 > - > - .section .data.page_aligned, "aw", @progbits > - .align PAGE_SIZE,0 > -compat_pg_table_l2: > - .macro identmap from=0, count=512 > - .if \count-1 > - identmap "(\from+0)","(\count/2)" > - identmap "(\from+(0x200000*(\count/2)))","(\count/2)" > - .else > - .quad 0x00000000000000e3 + \from > - .endif > - .endm > - > - identmap 0x00000000 > - identmap 0x40000000 > - identmap 0x80000000 > - identmap 0xc0000000 > diff --git a/xen/arch/x86/x86_64/kexec_reloc.S b/xen/arch/x86/x86_64/kexec_reloc.S > new file mode 100644 > index 0000000..7a16c85 > --- /dev/null > +++ b/xen/arch/x86/x86_64/kexec_reloc.S > @@ -0,0 +1,198 @@ > +/* > + * Relocate a kexec_image to its destination and call it. > + * > + * Copyright (C) 2013 Citrix Systems R&D Ltd. > + * > + * Portions derived from Linux''s arch/x86/kernel/relocate_kernel_64.S. > + * > + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> > + * > + * This source code is licensed under the GNU General Public License, > + * Version 2. See the file COPYING for more details. 
> + */ > +#include <xen/config.h> > +#include <xen/kimage.h> > + > +#include <asm/asm_defns.h> > +#include <asm/msr.h> > +#include <asm/page.h> > +#include <asm/machine_kexec.h> > + > + .text > + .align PAGE_SIZE > + .code64 > + > +ENTRY(kexec_reloc) > + /* %rdi - code page maddr */ > + /* %rsi - page table maddr */ > + /* %rdx - indirection page maddr */ > + /* %rcx - entry maddr (%rbp) */ > + /* %r8 - flags */ > + > + movq %rcx, %rbp > + > + /* Setup stack. */ > + leaq (reloc_stack - kexec_reloc)(%rdi), %rsp > + > + /* Load reloc page table. */ > + movq %rsi, %cr3 > + > + /* Jump to identity mapped code. */ > + leaq (identity_mapped - kexec_reloc)(%rdi), %rax > + jmpq *%rax > + > +identity_mapped: > + /* > + * Set cr0 to a known state: > + * - Paging enabled > + * - Alignment check disabled > + * - Write protect disabled > + * - No task switch > + * - Don''t do FP software emulation. > + * - Protected mode enabled > + */ > + movq %cr0, %rax > + andl $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax > + orl $(X86_CR0_PG | X86_CR0_PE), %eax > + movq %rax, %cr0 > + > + /* > + * Set cr4 to a known state: > + * - physical address extension enabled > + */ > + movl $X86_CR4_PAE, %eax > + movq %rax, %cr4 > + > + movq %rdx, %rdi > + call relocate_pages > + > + /* Need to switch to 32-bit mode? */ > + testq $KEXEC_RELOC_FLAG_COMPAT, %r8 > + jnz call_32_bit > + > +call_64_bit: > + /* Call the image entry point. This should never return. */ > + callq *%rbp > + ud2 > + > +call_32_bit: > + /* Setup IDT. */ > + lidt compat_mode_idt(%rip) > + > + /* Load compat GDT. */ > + leaq compat_mode_gdt(%rip), %rax > + movq %rax, (compat_mode_gdt_desc + 2)(%rip) > + lgdt compat_mode_gdt_desc(%rip) > + > + /* Relocate compatibility mode entry point address. */ > + leal compatibility_mode(%rip), %eax > + movl %eax, compatibility_mode_far(%rip) > + > + /* Enter compatibility mode. 
*/ > + ljmp *compatibility_mode_far(%rip) > + > +relocate_pages: > + /* %rdi - indirection page maddr */ > + pushq %rbx > + > + cld > + movq %rdi, %rbx > + xorl %edi, %edi > + xorl %esi, %esi > + > +next_entry: /* top, read another word for the indirection page */ > + > + movq (%rbx), %rcx > + addq $8, %rbx > +is_dest: > + testb $IND_DESTINATION, %cl > + jz is_ind > + movq %rcx, %rdi > + andq $PAGE_MASK, %rdi > + jmp next_entry > +is_ind: > + testb $IND_INDIRECTION, %cl > + jz is_done > + movq %rcx, %rbx > + andq $PAGE_MASK, %rbx > + jmp next_entry > +is_done: > + testb $IND_DONE, %cl > + jnz done > +is_source: > + testb $IND_SOURCE, %cl > + jz is_zero > + movq %rcx, %rsi /* For every source page do a copy */ > + andq $PAGE_MASK, %rsi > + movl $(PAGE_SIZE / 8), %ecx > + rep movsq > + jmp next_entry > +is_zero: > + testb $IND_ZERO, %cl > + jz next_entry > + movl $(PAGE_SIZE / 8), %ecx /* Zero the destination page. */ > + xorl %eax, %eax > + rep stosq > + jmp next_entry > +done: > + popq %rbx > + ret > + > + .code32 > + > +compatibility_mode: > + /* Setup some sane segments. */ > + movl $0x0008, %eax > + movl %eax, %ds > + movl %eax, %es > + movl %eax, %fs > + movl %eax, %gs > + movl %eax, %ss > + > + /* Disable paging and therefore leave 64 bit mode. */ > + movl %cr0, %eax > + andl $~X86_CR0_PG, %eax > + movl %eax, %cr0 > + > + /* Disable long mode */ > + movl $MSR_EFER, %ecx > + rdmsr > + andl $~EFER_LME, %eax > + wrmsr > + > + /* Clear cr4 to disable PAE. */ > + xorl %eax, %eax > + movl %eax, %cr4 > + > + /* Call the image entry point. This should never return. 
*/ > + call *%ebp > + ud2 > + > + .align 4 > +compatibility_mode_far: > + .long 0x00000000 /* set in call_32_bit above */ > + .word 0x0010 > + > +compat_mode_gdt_desc: > + .word (3*8)-1 > + .quad 0x0000000000000000 /* set in call_32_bit above */ > + > + .align 8 > +compat_mode_gdt: > + .quad 0x0000000000000000 /* null */ > + .quad 0x00cf92000000ffff /* 0x0008 ring 0 data */ > + .quad 0x00cf9a000000ffff /* 0x0010 ring 0 code, compatibility */ > + > +compat_mode_idt: > + .word 0 /* limit */ > + .long 0 /* base */ > + > + /* > + * 16 words of stack are more than enough. > + */ > + .fill 16,8,0 > +reloc_stack: > + > + .globl kexec_reloc_size > +kexec_reloc_size: > + .long . - kexec_reloc > diff --git a/xen/common/kexec.c b/xen/common/kexec.c > index 7b23df0..c5450ba 100644 > --- a/xen/common/kexec.c > +++ b/xen/common/kexec.c > @@ -25,6 +25,7 @@ > #include <xen/version.h> > #include <xen/console.h> > #include <xen/kexec.h> > +#include <xen/kimage.h> > #include <public/elfnote.h> > #include <xsm/xsm.h> > #include <xen/cpu.h> > @@ -47,7 +48,7 @@ static Elf_Note *xen_crash_note; > > static cpumask_t crash_saved_cpus; > > -static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR]; > +static struct kexec_image *kexec_image[KEXEC_IMAGE_NR]; > > #define KEXEC_FLAG_DEFAULT_POS (KEXEC_IMAGE_NR + 0) > #define KEXEC_FLAG_CRASH_POS (KEXEC_IMAGE_NR + 1) > @@ -55,8 +56,6 @@ static xen_kexec_image_t kexec_image[KEXEC_IMAGE_NR]; > > static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... 
*/ > > -static spinlock_t kexec_lock = SPIN_LOCK_UNLOCKED; > - > static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; > static size_t vmcoreinfo_size = 0; > > @@ -311,14 +310,14 @@ void kexec_crash(void) > kexec_common_shutdown(); > kexec_crash_save_cpu(); > machine_crash_shutdown(); > - machine_kexec(&kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]); > + machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]); > > BUG(); > } > > static long kexec_reboot(void *_image) > { > - xen_kexec_image_t *image = _image; > + struct kexec_image *image = _image; > > kexecing = TRUE; > > @@ -734,63 +733,264 @@ static void crash_save_vmcoreinfo(void) > #endif > } > > -static int kexec_load_unload_internal(unsigned long op, xen_kexec_load_v1_t *load) > +static void kexec_unload_image(struct kexec_image *image) > { > - xen_kexec_image_t *image; > + if ( !image ) > + return; > + > + machine_kexec_unload(image); > + kimage_free(image); > +} > + > +static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg) > +{ > + xen_kexec_exec_t exec; > + struct kexec_image *image; > + int base, bit, pos, ret = -EINVAL; > + > + if ( unlikely(copy_from_guest(&exec, uarg, 1)) ) > + return -EFAULT; > + > + if ( kexec_load_get_bits(exec.type, &base, &bit) ) > + return -EINVAL; > + > + pos = (test_bit(bit, &kexec_flags) != 0); > + > + /* Only allow kexec/kdump into loaded images */ > + if ( !test_bit(base + pos, &kexec_flags) ) > + return -ENOENT; > + > + switch (exec.type) > + { > + case KEXEC_TYPE_DEFAULT: > + image = kexec_image[base + pos]; > + ret = continue_hypercall_on_cpu(0, kexec_reboot, image); > + break; > + case KEXEC_TYPE_CRASH: > + kexec_crash(); /* Does not return */ > + break; > + } > + > + return -EINVAL; /* never reached */ > +} > + > +static int kexec_swap_images(int type, struct kexec_image *new, > + struct kexec_image **old) > +{ > + static DEFINE_SPINLOCK(kexec_lock); > int base, bit, pos; > - int ret = 0; > + int new_slot, old_slot; > + > + *old = NULL; > + > + spin_lock(&kexec_lock); 
> + > + if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) ) > + { > + spin_unlock(&kexec_lock); > + return -EBUSY; > + } > > - if ( kexec_load_get_bits(load->type, &base, &bit) ) > + if ( kexec_load_get_bits(type, &base, &bit) ) > return -EINVAL; > > pos = (test_bit(bit, &kexec_flags) != 0); > + old_slot = base + pos; > + new_slot = base + !pos; > > - /* Load the user data into an unused image */ > - if ( op == KEXEC_CMD_kexec_load ) > + if ( new ) > { > - image = &kexec_image[base + !pos]; > + kexec_image[new_slot] = new; > + set_bit(new_slot, &kexec_flags); > + } > + change_bit(bit, &kexec_flags); > > - BUG_ON(test_bit((base + !pos), &kexec_flags)); /* must be free */ > + clear_bit(old_slot, &kexec_flags); > + *old = kexec_image[old_slot]; > > - memcpy(image, &load->image, sizeof(*image)); > + spin_unlock(&kexec_lock); > > - if ( !(ret = machine_kexec_load(load->type, base + !pos, image)) ) > - { > - /* Set image present bit */ > - set_bit((base + !pos), &kexec_flags); > + return 0; > +} > > - /* Make new image the active one */ > - change_bit(bit, &kexec_flags); > - } > +static int kexec_load_slot(struct kexec_image *kimage) > +{ > + struct kexec_image *old_kimage; > + int ret = -ENOMEM; > + > + ret = machine_kexec_load(kimage); > + if ( ret < 0 ) > + return ret; > + > + crash_save_vmcoreinfo(); > + > + ret = kexec_swap_images(kimage->type, kimage, &old_kimage); > + if ( ret < 0 ) > + return ret; > + > + kexec_unload_image(old_kimage); > + > + return 0; > +} > + > +static uint16_t kexec_load_v1_arch(void) > +{ > +#ifdef CONFIG_X86 > + return is_pv_32on64_domain(dom0) ? EM_386 : EM_X86_64; > +#else > + return EM_NONE; > +#endif > +} > > - crash_save_vmcoreinfo(); > +static int kexec_segments_add_segment( > + unsigned int *nr_segments, xen_kexec_segment_t *segments, > + unsigned long mfn) > +{ > + paddr_t maddr = (paddr_t)mfn << PAGE_SHIFT; > + unsigned int n = *nr_segments; > + > + /* Need a new segment? 
*/ > + if ( n == 0 > + || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr ) > + { > + n++; > + if ( n > KEXEC_SEGMENT_MAX ) > + return -EINVAL; > + *nr_segments = n; > + > + set_xen_guest_handle(segments[n-1].buf.h, NULL); > + segments[n-1].buf_size = 0; > + segments[n-1].dest_maddr = maddr; > + segments[n-1].dest_size = 0; > } > > - /* Unload the old image if present and load successful */ > - if ( ret == 0 && !test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) ) > + return 0; > +} > + > +static int kexec_segments_from_ind_page(unsigned long mfn, > + unsigned int *nr_segments, > + xen_kexec_segment_t *segments, > + bool_t compat) > +{ > + void *page; > + kimage_entry_t *entry; > + int ret = 0; > + > + page = map_domain_page(mfn); > + > + /* > + * Walk the indirection page list, adding destination pages to the > + * segments. > + */ > + for ( entry = page; ; ) > { > - if ( test_and_clear_bit((base + pos), &kexec_flags) ) > + unsigned long ind; > + > + ind = kimage_entry_ind(entry, compat); > + mfn = kimage_entry_mfn(entry, compat); > + > + switch ( ind ) > { > - image = &kexec_image[base + pos]; > - machine_kexec_unload(load->type, base + pos, image); > + case IND_DESTINATION: > + ret = kexec_segments_add_segment(nr_segments, segments, mfn); > + if ( ret < 0 ) > + goto done; > + break; > + case IND_INDIRECTION: > + unmap_domain_page(page); > + entry = page = map_domain_page(mfn); > + continue; > + case IND_DONE: > + goto done; > + case IND_SOURCE: > + if ( *nr_segments == 0 ) > + { > + ret = -EINVAL; > + goto done; > + } > + segments[*nr_segments-1].dest_size += PAGE_SIZE; > + break; > + default: > + ret = -EINVAL; > + goto done; > } > + entry = kimage_entry_next(entry, compat); > } > +done: > + unmap_domain_page(page); > + return ret; > +} > > +static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat) > +{ > + struct kexec_image *kimage = NULL; > + xen_kexec_segment_t *segments; > + uint16_t arch; > + unsigned int nr_segments = 0; > + unsigned 
long ind_mfn = load->image.indirection_page >> PAGE_SHIFT; > + int ret; > + > + arch = kexec_load_v1_arch(); > + if ( arch == EM_NONE ) > + return -ENOSYS; > + > + segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX); > + if ( segments == NULL ) > + return -ENOMEM; > + > + /* > + * Work out the image segments (destination only) from the > + * indirection pages. > + * > + * This is needed so we don't allocate pages that will overlap > + * with the destination when building the new set of indirection > + * pages below. > + */ > + ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat); > + if ( ret < 0 ) > + goto error; > + > + ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address, > + nr_segments, segments); > + if ( ret < 0 ) > + goto error; > + > + /* > + * Build a new set of indirection pages in the native format. > + * > + * This walks the guest provided indirection pages a second time. > + * The guest could have altered them, invalidating the segment > + * information constructed above. This will only result in the > + * resulting image being potentially unrelocatable. 
> + */ > + ret = kimage_build_ind(kimage, ind_mfn, compat); > + if ( ret < 0 ) > + goto error; > + > + ret = kexec_load_slot(kimage); > + if ( ret < 0 ) > + goto error; > + > + return 0; > + > +error: > + if ( !kimage ) > + xfree(segments); > + kimage_free(kimage); > return ret; > } > > -static int kexec_load_unload(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg) > +static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) > { > xen_kexec_load_v1_t load; > > if ( unlikely(copy_from_guest(&load, uarg, 1)) ) > return -EFAULT; > > - return kexec_load_unload_internal(op, &load); > + return kexec_do_load_v1(&load, 0); > } > > -static int kexec_load_unload_compat(unsigned long op, > - XEN_GUEST_HANDLE_PARAM(void) uarg) > +static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) > { > #ifdef CONFIG_COMPAT > compat_kexec_load_v1_t compat_load; > @@ -809,49 +1009,113 @@ static int kexec_load_unload_compat(unsigned long op, > load.type = compat_load.type; > XLAT_kexec_image(&load.image, &compat_load.image); > > - return kexec_load_unload_internal(op, &load); > -#else /* CONFIG_COMPAT */ > + return kexec_do_load_v1(&load, 1); > +#else > return 0; > -#endif /* CONFIG_COMPAT */ > +#endif > } > > -static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg) > +static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg) > { > - xen_kexec_exec_t exec; > - xen_kexec_image_t *image; > - int base, bit, pos, ret = -EINVAL; > + xen_kexec_load_t load; > + xen_kexec_segment_t *segments; > + struct kexec_image *kimage = NULL; > + int ret; > > - if ( unlikely(copy_from_guest(&exec, uarg, 1)) ) > + if ( copy_from_guest(&load, uarg, 1) ) > return -EFAULT; > > - if ( kexec_load_get_bits(exec.type, &base, &bit) ) > + if ( load.nr_segments >= KEXEC_SEGMENT_MAX ) > return -EINVAL; > > - pos = (test_bit(bit, &kexec_flags) != 0); > - > - /* Only allow kexec/kdump into loaded images */ > - if ( !test_bit(base + pos, &kexec_flags) ) > - return -ENOENT; > + segments = 
xmalloc_array(xen_kexec_segment_t, load.nr_segments); > + if ( segments == NULL ) > + return -ENOMEM; > > - switch (exec.type) > + if ( copy_from_guest(segments, load.segments.h, load.nr_segments) ) > { > - case KEXEC_TYPE_DEFAULT: > - image = &kexec_image[base + pos]; > - ret = continue_hypercall_on_cpu(0, kexec_reboot, image); > - break; > - case KEXEC_TYPE_CRASH: > - kexec_crash(); /* Does not return */ > - break; > + ret = -EFAULT; > + goto error; > } > > - return -EINVAL; /* never reached */ > + ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr, > + load.nr_segments, segments); > + if ( ret < 0 ) > + goto error; > + > + ret = kimage_load_segments(kimage); > + if ( ret < 0 ) > + goto error; > + > + ret = kexec_load_slot(kimage); > + if ( ret < 0 ) > + goto error; > + > + return 0; > + > +error: > + if ( ! kimage ) > + xfree(segments); > + kimage_free(kimage); > + return ret; > +} > + > +static int kexec_do_unload(xen_kexec_unload_t *unload) > +{ > + struct kexec_image *old_kimage; > + int ret; > + > + ret = kexec_swap_images(unload->type, NULL, &old_kimage); > + if ( ret < 0 ) > + return ret; > + > + kexec_unload_image(old_kimage); > + > + return 0; > +} > + > +static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg) > +{ > + xen_kexec_load_v1_t load; > + xen_kexec_unload_t unload; > + > + if ( copy_from_guest(&load, uarg, 1) ) > + return -EFAULT; > + > + unload.type = load.type; > + return kexec_do_unload(&unload); > +} > + > +static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg) > +{ > +#ifdef CONFIG_COMPAT > + compat_kexec_load_v1_t compat_load; > + xen_kexec_unload_t unload; > + > + if ( copy_from_guest(&compat_load, uarg, 1) ) > + return -EFAULT; > + > + unload.type = compat_load.type; > + return kexec_do_unload(&unload); > +#else > + return 0; > +#endif > +} > + > +static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg) > +{ > + xen_kexec_unload_t unload; > + > + if ( unlikely(copy_from_guest(&unload, uarg, 1)) ) > 
+ return -EFAULT; > + > + return kexec_do_unload(&unload); > } > > static int do_kexec_op_internal(unsigned long op, > XEN_GUEST_HANDLE_PARAM(void) uarg, > bool_t compat) > { > - unsigned long flags; > int ret = -EINVAL; > > ret = xsm_kexec(XSM_PRIV); > @@ -867,20 +1131,26 @@ static int do_kexec_op_internal(unsigned long op, > ret = kexec_get_range(uarg); > break; > case KEXEC_CMD_kexec_load_v1: > + if ( compat ) > + ret = kexec_load_v1_compat(uarg); > + else > + ret = kexec_load_v1(uarg); > + break; > case KEXEC_CMD_kexec_unload_v1: > - spin_lock_irqsave(&kexec_lock, flags); > - if (!test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags)) > - { > - if (compat) > - ret = kexec_load_unload_compat(op, uarg); > - else > - ret = kexec_load_unload(op, uarg); > - } > - spin_unlock_irqrestore(&kexec_lock, flags); > + if ( compat ) > + ret = kexec_unload_v1_compat(uarg); > + else > + ret = kexec_unload_v1(uarg); > break; > case KEXEC_CMD_kexec: > ret = kexec_exec(uarg); > break; > + case KEXEC_CMD_kexec_load: > + ret = kexec_load(uarg); > + break; > + case KEXEC_CMD_kexec_unload: > + ret = kexec_unload(uarg); > + break; > } > > return ret; > diff --git a/xen/common/kimage.c b/xen/common/kimage.c > index 02ee37e..10fb785 100644 > --- a/xen/common/kimage.c > +++ b/xen/common/kimage.c > @@ -175,11 +175,20 @@ static int do_kimage_alloc(struct kexec_image **rimage, paddr_t entry, > image->control_code_page = kimage_alloc_control_page(image, MEMF_bits(32)); > if ( !image->control_code_page ) > goto out; > + result = machine_kexec_add_page(image, > + page_to_maddr(image->control_code_page), > + page_to_maddr(image->control_code_page)); > + if ( result < 0 ) > + goto out; > > /* Add an empty indirection page. 
*/ > image->entry_page = kimage_alloc_control_page(image, 0); > if ( !image->entry_page ) > goto out; > + result = machine_kexec_add_page(image, page_to_maddr(image->entry_page), > + page_to_maddr(image->entry_page)); > + if ( result < 0 ) > + goto out; > > image->head = page_to_maddr(image->entry_page); > > @@ -595,7 +604,7 @@ static struct page_info *kimage_alloc_page(struct kexec_image *image, > if ( addr == destination ) > { > page_list_del(page, &image->dest_pages); > - return page; > + goto found; > } > } > page = NULL; > @@ -647,6 +656,8 @@ static struct page_info *kimage_alloc_page(struct kexec_image *image, > page_list_add(page, &image->dest_pages); > } > } > +found: > + machine_kexec_add_page(image, page_to_maddr(page), page_to_maddr(page)); > return page; > } > > @@ -753,6 +764,7 @@ static int kimage_load_crash_segment(struct kexec_image *image, > static int kimage_load_segment(struct kexec_image *image, xen_kexec_segment_t *segment) > { > int result = -ENOMEM; > + paddr_t addr; > > if ( !guest_handle_is_null(segment->buf.h) ) > { > @@ -767,6 +779,14 @@ static int kimage_load_segment(struct kexec_image *image, xen_kexec_segment_t *s > } > } > > + for ( addr = segment->dest_maddr & PAGE_MASK; > + addr < segment->dest_maddr + segment->dest_size; addr += PAGE_SIZE ) > + { > + result = machine_kexec_add_page(image, addr, addr); > + if ( result < 0 ) > + break; > + } > + > return result; > } > > @@ -810,6 +830,106 @@ int kimage_load_segments(struct kexec_image *image) > return 0; > } > > +kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat) > +{ > + if ( compat ) > + return (kimage_entry_t *)((uint32_t *)entry + 1); > + return entry + 1; > +} > + > +unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat) > +{ > + if ( compat ) > + return *(uint32_t *)entry >> PAGE_SHIFT; > + return *entry >> PAGE_SHIFT; > +} > + > +unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat) > +{ > + if ( compat ) > + return *(uint32_t 
*)entry & 0xf; > + return *entry & 0xf; > +} > + > +int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, > + bool_t compat) > +{ > + void *page; > + kimage_entry_t *entry; > + int ret = 0; > + paddr_t dest = KIMAGE_NO_DEST; > + > + page = map_domain_page(ind_mfn); > + if ( !page ) > + return -ENOMEM; > + > + /* > + * Walk the guest-supplied indirection pages, adding entries to > + * the image's indirection pages. > + */ > + for ( entry = page; ; ) > + { > + unsigned long ind; > + unsigned long mfn; > + > + ind = kimage_entry_ind(entry, compat); > + mfn = kimage_entry_mfn(entry, compat); > + > + switch ( ind ) > + { > + case IND_DESTINATION: > + dest = (paddr_t)mfn << PAGE_SHIFT; > + ret = kimage_set_destination(image, dest); > + if ( ret < 0 ) > + goto done; > + break; > + case IND_INDIRECTION: > + unmap_domain_page(page); > + page = map_domain_page(mfn); > + entry = page; > + continue; > + case IND_DONE: > + kimage_terminate(image); > + goto done; > + case IND_SOURCE: > + { > + struct page_info *guest_page, *xen_page; > + > + guest_page = mfn_to_page(mfn); > + if ( !get_page(guest_page, current->domain) ) > + { > + ret = -EFAULT; > + goto done; > + } > + > + xen_page = kimage_alloc_page(image, dest); > + if ( !xen_page ) > + { > + put_page(guest_page); > + ret = -ENOMEM; > + goto done; > + } > + > + copy_domain_page(page_to_mfn(xen_page), mfn); > + put_page(guest_page); > + > + ret = kimage_add_page(image, page_to_maddr(xen_page)); > + if ( ret < 0 ) > + goto done; > + dest += PAGE_SIZE; > + break; > + } > + default: > + ret = -EINVAL; > + goto done; > + } > + entry = kimage_entry_next(entry, compat); > + } > +done: > + unmap_domain_page(page); > + return ret; > +} > + > /* > * Local variables: > * mode: C > diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h > index 8b4266d..48c5676 100644 > --- a/xen/include/asm-x86/fixmap.h > +++ b/xen/include/asm-x86/fixmap.h > @@ -56,9 +56,6 @@ enum fixed_addresses { > FIX_ACPI_BEGIN, > 
FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, > FIX_HPET_BASE, > - FIX_KEXEC_BASE_0, > - FIX_KEXEC_BASE_END = FIX_KEXEC_BASE_0 \ > - + ((KEXEC_XEN_NO_PAGES >> 1) * KEXEC_IMAGE_NR) - 1, > FIX_TBOOT_SHARED_BASE, > FIX_MSIX_IO_RESERV_BASE, > FIX_MSIX_IO_RESERV_END = FIX_MSIX_IO_RESERV_BASE + FIX_MSIX_MAX_PAGES -1, > diff --git a/xen/include/asm-x86/machine_kexec.h b/xen/include/asm-x86/machine_kexec.h > new file mode 100644 > index 0000000..ba0d469 > --- /dev/null > +++ b/xen/include/asm-x86/machine_kexec.h > @@ -0,0 +1,16 @@ > +#ifndef __X86_MACHINE_KEXEC_H__ > +#define __X86_MACHINE_KEXEC_H__ > + > +#define KEXEC_RELOC_FLAG_COMPAT 0x1 /* 32-bit image */ > + > +#ifndef __ASSEMBLY__ > + > +extern void kexec_reloc(unsigned long reloc_code, unsigned long reloc_pt, > + unsigned long ind_maddr, unsigned long entry_maddr, > + unsigned long flags); > + > +extern unsigned int kexec_reloc_size; > + > +#endif > + > +#endif /* __X86_MACHINE_KEXEC_H__ */ > diff --git a/xen/include/xen/kexec.h b/xen/include/xen/kexec.h > index 1a5dda1..bd17747 100644 > --- a/xen/include/xen/kexec.h > +++ b/xen/include/xen/kexec.h > @@ -6,6 +6,7 @@ > #include <public/kexec.h> > #include <asm/percpu.h> > #include <xen/elfcore.h> > +#include <xen/kimage.h> > > typedef struct xen_kexec_reserve { > unsigned long size; > @@ -40,11 +41,13 @@ extern enum low_crashinfo low_crashinfo_mode; > extern paddr_t crashinfo_maxaddr_bits; > void kexec_early_calculations(void); > > -int machine_kexec_load(int type, int slot, xen_kexec_image_t *image); > -void machine_kexec_unload(int type, int slot, xen_kexec_image_t *image); > +int machine_kexec_add_page(struct kexec_image *image, unsigned long vaddr, > + unsigned long maddr); > +int machine_kexec_load(struct kexec_image *image); > +void machine_kexec_unload(struct kexec_image *image); > void machine_kexec_reserved(xen_kexec_reserve_t *reservation); > -void machine_reboot_kexec(xen_kexec_image_t *image); > -void machine_kexec(xen_kexec_image_t *image); > 
+void machine_reboot_kexec(struct kexec_image *image); > +void machine_kexec(struct kexec_image *image); > void kexec_crash(void); > void kexec_crash_save_cpu(void); > crash_xen_info_t *kexec_crash_save_info(void); > @@ -52,11 +55,6 @@ void machine_crash_shutdown(void); > int machine_kexec_get(xen_kexec_range_t *range); > int machine_kexec_get_xen(xen_kexec_range_t *range); > > -void compat_machine_kexec(unsigned long rnk, > - unsigned long indirection_page, > - unsigned long *page_list, > - unsigned long start_address); > - > /* vmcoreinfo stuff */ > #define VMCOREINFO_BYTES (4096) > #define VMCOREINFO_NOTE_NAME "VMCOREINFO_XEN" > diff --git a/xen/include/xen/kimage.h b/xen/include/xen/kimage.h > index 0ebd37a..d10ebf7 100644 > --- a/xen/include/xen/kimage.h > +++ b/xen/include/xen/kimage.h > @@ -47,6 +47,12 @@ int kimage_load_segments(struct kexec_image *image); > struct page_info *kimage_alloc_control_page(struct kexec_image *image, > unsigned memflags); > > +kimage_entry_t *kimage_entry_next(kimage_entry_t *entry, bool_t compat); > +unsigned long kimage_entry_mfn(kimage_entry_t *entry, bool_t compat); > +unsigned long kimage_entry_ind(kimage_entry_t *entry, bool_t compat); > +int kimage_build_ind(struct kexec_image *image, unsigned long ind_mfn, > + bool_t compat); > + > #endif /* __ASSEMBLY__ */ > > #endif /* __XEN_KIMAGE_H__ */