diff --git a/arch/x86/coco/sev/Makefile b/arch/x86/coco/sev/Makefile index 4e375e7305ac..08de37559307 100644 --- a/arch/x86/coco/sev/Makefile +++ b/arch/x86/coco/sev/Makefile @@ -13,3 +13,6 @@ KCOV_INSTRUMENT_core.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n + +# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining +UBSAN_SANITIZE := n diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 65d676c0f7bc..4dbdb9561019 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -9,6 +9,8 @@ #define pr_fmt(fmt) "SEV: " fmt +#define DISABLE_BRANCH_PROFILING + #include /* For show_regs() */ #include #include @@ -787,15 +789,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr, val = sev_es_rd_ghcb_msr(); - if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, - "Wrong PSC response code: 0x%x\n", - (unsigned int)GHCB_RESP_CODE(val))) + if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) goto e_term; - if (WARN(GHCB_MSR_PSC_RESP_VAL(val), - "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", - op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", - paddr, GHCB_MSR_PSC_RESP_VAL(val))) + if (GHCB_MSR_PSC_RESP_VAL(val)) goto e_term; /* Page validation must be performed after changing to private */ @@ -831,7 +828,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); } -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, +void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned long npages) { /* @@ -2423,7 +2420,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info) call.rcx = pa; ret = svsm_perform_call_protocol(&call); if (ret) - panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL); RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa; RIP_REL_REF(boot_svsm_caa_pa) = pa; diff --git a/arch/x86/coco/sev/shared.c b/arch/x86/coco/sev/shared.c index 4386f37bd31d..2e4122f8aa6b 100644 --- a/arch/x86/coco/sev/shared.c +++ b/arch/x86/coco/sev/shared.c @@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void) * * Return: XSAVE area size on success, 0 otherwise. */ -static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) { const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); u64 xfeatures_found = 0; @@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); } -static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) +static int __head +snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, + struct cpuid_leaf *leaf) { struct cpuid_leaf leaf_hv = *leaf; @@ -1253,7 +1254,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs __pval_terminate(pfn, action, page_size, ret, svsm_ret); } -static void svsm_pval_4k_page(unsigned long paddr, bool validate) +static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) { struct svsm_pvalidate_call *pc; struct svsm_call call = {}; @@ -1285,12 +1286,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate) ret = svsm_perform_call_protocol(&call); if (ret) - svsm_pval_terminate(pc, ret, call.rax_out); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); native_local_irq_restore(flags); } -static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate) +static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, + bool validate) { int ret; @@ -1303,7 +1305,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val } else { ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); if (ret) - __pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0); + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); } } diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 14d72727d7ee..0e82ebc5d1e1 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_INIT_H #define _ASM_X86_INIT_H -#define __head __section(".head.text") +#define __head __section(".head.text") __no_sanitize_undefined struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index ae5482a2f0ca..8ad187462b68 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -8,14 +8,9 @@ # define PA_PGD 2 # define PA_SWAP_PAGE 3 # define PAGES_NR 4 -#else -# define PA_CONTROL_PAGE 0 -# define VA_CONTROL_PAGE 1 -# define PA_TABLE_PAGE 2 -# define PA_SWAP_PAGE 3 -# define PAGES_NR 4 #endif +# define KEXEC_CONTROL_PAGE_SIZE 4096 # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 #ifndef __ASSEMBLY__ @@ -43,7 +38,6 @@ struct kimage; /* Maximum address we can use for the control code buffer */ # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE -# define KEXEC_CONTROL_PAGE_SIZE 4096 /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_386 @@ -58,11 +52,12 @@ struct kimage; /* Maximum address we can use for the control pages */ # define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1) -/* Allocate one page for the pdp and the second for the code */ -# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL) - /* The native architecture */ # define KEXEC_ARCH KEXEC_ARCH_X86_64 + +extern unsigned long kexec_va_control_page; +extern unsigned long kexec_pa_table_page; +extern unsigned long kexec_pa_swap_page; #endif /* @@ -116,21 +111,21 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } #ifdef CONFIG_X86_32 -asmlinkage unsigned long -relocate_kernel(unsigned long indirection_page, - unsigned long control_page, - unsigned long start_address, - unsigned int has_pae, - unsigned int preserve_context); +typedef asmlinkage unsigned long +relocate_kernel_fn(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); #else -unsigned long -relocate_kernel(unsigned long indirection_page, - unsigned long page_list, - unsigned long start_address, - unsigned int preserve_context, - unsigned int host_mem_enc_active); +typedef unsigned long +relocate_kernel_fn(unsigned long indirection_page, + unsigned long pa_control_page, + unsigned long start_address, + unsigned int preserve_context, + unsigned int host_mem_enc_active); #endif - +extern relocate_kernel_fn relocate_kernel; #define ARCH_HAS_KIMAGE_ARCH #ifdef CONFIG_X86_32 @@ -145,6 +140,19 @@ struct kimage_arch { }; #else struct kimage_arch { + /* + * This is a kimage control page, as it must not overlap with either + * source or destination address ranges. + */ + pgd_t *pgd; + /* + * The virtual mapping of the control code page itself is used only + * during the transition, while the current kernel's pages are all + * in place. Thus the intermediate page table pages used to map it + * are not control pages, but instead just normal pages obtained + * with get_zeroed_page(). And have to be tracked (below) so that + * they can be freed. + */ p4d_t *p4d; pud_t *pud; pmd_t *pmd; diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 3fa87e5e11ab..30e8ee7006f9 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -5,6 +5,7 @@ #include #include +extern char __relocate_kernel_start[], __relocate_kernel_end[]; extern char __brk_base[], __brk_limit[]; extern char __end_rodata_aligned[]; diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 0667b2a88614..85f4fde3515c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -49,7 +49,7 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); -extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); +extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp); extern void startup_64_setup_gdt_idt(void); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 6ef92432a5ce..dcbccdb280f9 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -207,6 +207,7 @@ struct snp_psc_desc { #define GHCB_TERM_SVSM_VMPL0 8 /* SVSM is present but has set VMPL to 0 */ #define GHCB_TERM_SVSM_CAA 9 /* SVSM is present but CAA is not page aligned */ #define GHCB_TERM_SECURE_TSC 10 /* Secure TSC initialization failed */ +#define GHCB_TERM_SVSM_CA_REMAP_FAIL 11 /* SVSM is present but CA could not be remapped */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index f17d16607882..8418a892d195 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -139,9 +139,15 @@ static bool skip_addr(void *dest) return true; #endif #ifdef CONFIG_KEXEC_CORE +# ifdef CONFIG_X86_64 + if (dest >= (void *)__relocate_kernel_start && + dest < (void *)__relocate_kernel_end) + return true; +# else if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; +# endif #endif return false; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4b9d4557fc94..22c9ba305ac1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -91,9 +91,11 @@ static inline bool check_la57_support(void) return true; } -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) { - unsigned long vaddr, vaddr_end; + unsigned long paddr, paddr_end; int i; /* Encrypt the kernel and related (if SME is active) */ @@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * attribute. */ if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; + paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); + paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + for (; paddr < paddr_end; paddr += PMD_SIZE) { /* * On SNP, transition the page to shared in the RMP table so that * it is consistent with the page table attribute change. @@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * mapping (kernel .text). PVALIDATE, by way of * early_snp_set_memory_shared(), requires a valid virtual * address but the kernel is currently running off of the identity - * mapping so use __pa() to get a *currently* valid virtual address. + * mapping so use the PA to get a *currently* valid virtual address. */ - early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - i = pmd_index(vaddr); + i = pmd_index(paddr - p2v_offset); pmd[i] -= sme_get_me_mask(); } } @@ -138,12 +140,15 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). + * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined + * by subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long physaddr, +unsigned long __head __startup_64(unsigned long p2v_offset, struct boot_params *bp) { pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); + unsigned long va_text, va_end; unsigned long pgtable_flags; unsigned long load_delta; pgdval_t *pgd; @@ -163,13 +168,16 @@ unsigned long __head __startup_64(unsigned long physaddr, * Compute the delta between the address I am compiled to run at * and the address I am actually running at. */ - load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + load_delta = __START_KERNEL_map + p2v_offset; RIP_REL_REF(phys_base) = load_delta; /* Is the address not 2M aligned? */ if (load_delta & ~PMD_MASK) for (;;); + va_text = physaddr - p2v_offset; + va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; + /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); @@ -178,7 +186,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pgd = &RIP_REL_REF(early_top_pgt)->pgd; pgd[pgd_index(__START_KERNEL_map)] += load_delta; - if (la57) { + if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); p4d[MAX_PTRS_PER_P4D - 1] += load_delta; @@ -230,7 +238,7 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; - for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { int idx = i + (physaddr >> PMD_SHIFT); pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; @@ -255,11 +263,11 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index((unsigned long)_text); i++) + for (i = 0; i < pmd_index(va_text); i++) pmd[i] &= ~_PAGE_PRESENT; /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index((unsigned long)_end); i++) + for (; i <= pmd_index(va_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; @@ -267,7 +275,7 @@ unsigned long __head __startup_64(unsigned long physaddr, for (; i < PTRS_PER_PMD; i++) pmd[i] &= ~_PAGE_PRESENT; - return sme_postprocess_startup(bp, pmd); + return sme_postprocess_startup(bp, pmd, p2v_offset); } /* Wipe all early page tables except for the kernel symbol map */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 56163e2124cf..31345e0ba006 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -94,13 +94,19 @@ SYM_CODE_START_NOALIGN(startup_64) /* Sanitize CPU configuration */ call verify_cpu + /* + * Derive the kernel's physical-to-virtual offset from the physical and + * virtual addresses of common_startup_64(). + */ + leaq common_startup_64(%rip), %rdi + subq .Lcommon_startup_64(%rip), %rdi + /* * Perform pagetable fixups. Additionally, if SME is active, encrypt * the kernel and retrieve the modifier (SME encryption mask if SME * is active) to be added to the initial pgdir entry that will be * programmed into CR3. */ - leaq _text(%rip), %rdi movq %r15, %rsi call __startup_64 @@ -128,11 +134,11 @@ SYM_CODE_START_NOALIGN(startup_64) /* Branch to the common startup code at its kernel virtual address */ ANNOTATE_RETPOLINE_SAFE - jmp *0f(%rip) + jmp *.Lcommon_startup_64(%rip) SYM_CODE_END(startup_64) __INITRODATA -0: .quad common_startup_64 +SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64) .text SYM_CODE_START(secondary_startup_64) diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 257892fcefa7..b68d4be9464e 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj, static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, (void *)&boot_params + off, count); return count; } -static struct bin_attribute boot_params_data_attr = { +static const struct bin_attribute boot_params_data_attr = { .attr = { .name = "data", .mode = S_IRUGO, }, - .read = boot_params_data_read, + .read_new = boot_params_data_read, .size = sizeof(boot_params), }; @@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = { NULL, }; -static struct bin_attribute *boot_params_data_attrs[] = { +static const struct bin_attribute *const boot_params_data_attrs[] = { &boot_params_data_attr, NULL, }; static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, - .bin_attrs = boot_params_data_attrs, + .bin_attrs_new = boot_params_data_attrs, }; static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) @@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj, static ssize_t setup_data_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = { .name = "data", .mode = S_IRUGO, }, - .read = setup_data_data_read, + .read_new = setup_data_data_read, }; static struct attribute *setup_data_type_attrs[] = { @@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = { NULL, }; -static struct bin_attribute *setup_data_data_attrs[] = { +static const struct bin_attribute *const setup_data_data_attrs[] = { &data_attr, NULL, }; static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, - .bin_attrs = setup_data_data_attrs, + .bin_attrs_new = setup_data_data_attrs, }; static int __init create_setup_data_node(struct kobject *parent, diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1b373d79cedc..80265162aeff 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -160,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image) */ void machine_kexec(struct kimage *image) { + relocate_kernel_fn *relocate_kernel_ptr; unsigned long page_list[PAGES_NR]; void *control_page; int save_ftrace_enabled; - asmlinkage unsigned long - (*relocate_kernel_ptr)(unsigned long indirection_page, - unsigned long control_page, - unsigned long start_address, - unsigned int has_pae, - unsigned int preserve_context); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9c9ac606893e..a68f5a0a9f37 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image) image->arch.pte = NULL; } -static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, + unsigned long control_page) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; @@ -156,8 +157,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) pmd_t *pmd; pte_t *pte; - vaddr = (unsigned long)relocate_kernel; - paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + /* + * For the transition to the identity mapped page tables, the control + * code page also needs to be mapped at the virtual address it starts + * off running from. + */ + vaddr = (unsigned long)__va(control_page); + paddr = control_page; pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -216,7 +222,7 @@ static void *alloc_pgt_page(void *data) return p; } -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +static int init_pgtable(struct kimage *image, unsigned long control_page) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, @@ -225,12 +231,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; - pgd_t *level4p; int result; int i; - level4p = (pgd_t *)__va(start_pgtable); - clear_page(level4p); + image->arch.pgd = alloc_pgt_page(image); + if (!image->arch.pgd) + return -ENOMEM; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; @@ -244,8 +250,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; } @@ -260,8 +266,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; @@ -271,15 +277,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ - result = map_efi_systab(&info, level4p); + result = map_efi_systab(&info, image->arch.pgd); if (result) return result; - result = map_acpi_tables(&info, level4p); + result = map_acpi_tables(&info, image->arch.pgd); if (result) return result; - return init_transition_pgtable(image, level4p); + /* + * This must be last because the intermediate page table pages it + * allocates will not be control pages and may overlap the image. + */ + return init_transition_pgtable(image, image->arch.pgd, control_page); } static void load_segments(void) @@ -296,22 +306,35 @@ static void load_segments(void) int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable; + void *control_page = page_address(image->control_code_page); + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); + result = init_pgtable(image, __pa(control_page)); if (result) return result; + kexec_va_control_page = (unsigned long)control_page; + kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); + + set_memory_rox((unsigned long)control_page, 1); return 0; } void machine_kexec_cleanup(struct kimage *image) { + void *control_page = page_address(image->control_code_page); + + set_memory_nx((unsigned long)control_page, 1); + set_memory_rw((unsigned long)control_page, 1); + free_transition_pgtable(image); } @@ -319,9 +342,10 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -void machine_kexec(struct kimage *image) +void __nocfi machine_kexec(struct kimage *image) { - unsigned long page_list[PAGES_NR]; + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + relocate_kernel_fn *relocate_kernel_ptr; unsigned int host_mem_enc_active; int save_ftrace_enabled; void *control_page; @@ -357,17 +381,13 @@ void machine_kexec(struct kimage *image) #endif } - control_page = page_address(image->control_code_page) + PAGE_SIZE; - __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + control_page = page_address(image->control_code_page); - page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_TABLE_PAGE] = - (unsigned long)__pa(page_address(image->control_code_page)); - - if (image->type == KEXEC_TYPE_DEFAULT) - page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) - << PAGE_SHIFT); + /* + * Allow for the possibility that relocate_kernel might not be at + * the very start of the page. + */ + relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start; /* * The segment registers are funny things, they have both a @@ -388,11 +408,11 @@ void machine_kexec(struct kimage *image) native_gdt_invalidate(); /* now call it */ - image->start = relocate_kernel((unsigned long)image->head, - (unsigned long)page_list, - image->start, - image->preserve_context, - host_mem_enc_active); + image->start = relocate_kernel_ptr((unsigned long)image->head, + virt_to_phys(control_page), + image->start, + image->preserve_context, + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -573,8 +593,7 @@ static void kexec_mark_crashkres(bool protect) /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); - /* Control code page is located in the 2nd page. */ - kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + kexec_mark_range(crashk_res.start, control - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 540443d699e3..b44d8863e57f 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -24,33 +24,30 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* - * control_page + KEXEC_CONTROL_CODE_MAX_SIZE - * ~ control_page + PAGE_SIZE are used as data storage and stack for - * jumping back + * The .text..relocate_kernel and .data..relocate_kernel sections are copied + * into the control page, and the remainder of the page is used as the stack. */ -#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + .section .data..relocate_kernel,"a"; /* Minimal CPU state */ -#define RSP DATA(0x0) -#define CR0 DATA(0x8) -#define CR3 DATA(0x10) -#define CR4 DATA(0x18) +SYM_DATA_LOCAL(saved_rsp, .quad 0) +SYM_DATA_LOCAL(saved_cr0, .quad 0) +SYM_DATA_LOCAL(saved_cr3, .quad 0) +SYM_DATA_LOCAL(saved_cr4, .quad 0) + /* other data */ +SYM_DATA(kexec_va_control_page, .quad 0) +SYM_DATA(kexec_pa_table_page, .quad 0) +SYM_DATA(kexec_pa_swap_page, .quad 0) +SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) -/* other data */ -#define CP_PA_TABLE_PAGE DATA(0x20) -#define CP_PA_SWAP_PAGE DATA(0x28) -#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) - - .text - .align PAGE_SIZE + .section .text..relocate_kernel,"ax"; .code64 -SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* * %rdi indirection_page - * %rsi page_list + * %rsi pa_control_page * %rdx start address * %rcx preserve_context * %r8 host_mem_enc_active @@ -65,60 +62,57 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 - movq %rsp, RSP(%r11) - movq %cr0, %rax - movq %rax, CR0(%r11) - movq %cr3, %rax - movq %rax, CR3(%r11) - movq %cr4, %rax - movq %rax, CR4(%r11) - - /* Save CR4. Required to enable the right paging mode later. */ - movq %rax, %r13 - /* zero out flags, and disable interrupts */ pushq $0 popfq - /* Save SME active flag */ - movq %r8, %r12 - - /* - * get physical address of control page now - * this is impossible after page table switch - */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 - - /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %r9 - - /* get physical address of swap page now */ - movq PTR(PA_SWAP_PAGE)(%rsi), %r10 - - /* save some information for jumping back */ - movq %r9, CP_PA_TABLE_PAGE(%r11) - movq %r10, CP_PA_SWAP_PAGE(%r11) - movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) - /* Switch to the identity mapped page tables */ + movq %cr3, %rax + movq kexec_pa_table_page(%rip), %r9 movq %r9, %cr3 + /* Leave CR4 in %r13 to enable the right paging mode later. */ + movq %cr4, %r13 + + /* Disable global pages immediately to ensure this mapping is RWX */ + movq %r13, %r12 + andq $~(X86_CR4_PGE), %r12 + movq %r12, %cr4 + + /* Save %rsp and CRs. */ + movq %r13, saved_cr4(%rip) + movq %rsp, saved_rsp(%rip) + movq %rax, saved_cr3(%rip) + movq %cr0, %rax + movq %rax, saved_cr0(%rip) + + /* save indirection list for jumping back */ + movq %rdi, pa_backup_pages_map(%rip) + + /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ + movq %rcx, %r11 + /* setup a new stack at the end of the physical control page */ - lea PAGE_SIZE(%r8), %rsp + lea PAGE_SIZE(%rsi), %rsp /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %r8 - pushq %r8 - ANNOTATE_UNRET_SAFE - ret - int3 +0: addq $identity_mapped - 0b, %rsi + subq $__relocate_kernel_start - 0b, %rsi + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) UNWIND_HINT_END_OF_STACK - /* set return address to 0 if not preserving context */ - pushq $0 + /* + * %rdi indirection page + * %rdx start address + * %r8 host_mem_enc_active + * %r9 page table page + * %r11 preserve_context + * %r13 original CR4 when relocate_kernel() was invoked + */ + /* store the start address on the stack */ pushq %rdx @@ -166,13 +160,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * entries that will conflict with the now unencrypted memory * used by kexec. Flush the caches before copying the kernel. */ - testq %r12, %r12 + testq %r8, %r8 jz .Lsme_off wbinvd .Lsme_off: - /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ - movq %rcx, %r11 call swap_pages /* @@ -184,13 +176,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %cr3, %rax movq %rax, %cr3 + testq %r11, %r11 /* preserve_context */ + jnz .Lrelocate + /* * set all of the registers to known values * leave %rsp alone */ - testq %r11, %r11 - jnz .Lrelocate xorl %eax, %eax xorl %ebx, %ebx xorl %ecx, %ecx @@ -213,20 +206,34 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) .Lrelocate: popq %rdx + + /* Use the swap page for the callee's stack */ + movq kexec_pa_swap_page(%rip), %r10 leaq PAGE_SIZE(%r10), %rsp + + /* push the existing entry point onto the callee's stack */ + pushq %rdx + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ - movq 0(%rsp), %rbp - leaq relocate_kernel(%rip), %r8 - movq CP_PA_SWAP_PAGE(%r8), %r10 - movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi - movq CP_PA_TABLE_PAGE(%r8), %rax + popq %rbp + movq kexec_pa_swap_page(%rip), %r10 + movq pa_backup_pages_map(%rip), %rdi + movq kexec_pa_table_page(%rip), %rax movq %rax, %cr3 + + /* Find start (and end) of this physical mapping of control page */ + leaq (%rip), %r8 + ANNOTATE_NOENDBR + andq $PAGE_MASK, %r8 lea PAGE_SIZE(%r8), %rsp + movl $1, %r11d /* Ensure preserve_context flag is set */ call swap_pages - movq $virtual_mapped, %rax + movq kexec_va_control_page(%rip), %rax +0: addq $virtual_mapped - 0b, %rax + subq $__relocate_kernel_start - 0b, %rax pushq %rax ANNOTATE_UNRET_SAFE ret @@ -236,11 +243,11 @@ SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // RET target, above - movq RSP(%r8), %rsp - movq CR4(%r8), %rax + movq saved_rsp(%rip), %rsp + movq saved_cr4(%rip), %rax movq %rax, %cr4 - movq CR3(%r8), %rax - movq CR0(%r8), %r8 + movq saved_cr3(%rip), %rax + movq saved_cr0(%rip), %r8 movq %rax, %cr3 movq %r8, %cr0 @@ -250,6 +257,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) lgdt saved_context_gdt_desc(%rax) #endif + /* relocate_kernel() returns the re-entry point for next time */ movq %rbp, %rax popf @@ -267,42 +275,49 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) UNWIND_HINT_END_OF_STACK + /* + * %rdi indirection page + * %r11 preserve_context + */ movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi xorl %esi, %esi - jmp 1f + jmp .Lstart /* Should start with an indirection record */ -0: /* top, read another word for the indirection page */ +.Lloop: /* top, read another word for the indirection page */ movq (%rbx), %rcx addq $8, %rbx -1: +.Lstart: testb $0x1, %cl /* is it a destination page? */ - jz 2f + jz .Lnotdest movq %rcx, %rdi andq $0xfffffffffffff000, %rdi - jmp 0b -2: + jmp .Lloop +.Lnotdest: testb $0x2, %cl /* is it an indirection page? */ - jz 2f + jz .Lnotind movq %rcx, %rbx andq $0xfffffffffffff000, %rbx - jmp 0b -2: + jmp .Lloop +.Lnotind: testb $0x4, %cl /* is it the done indicator? */ - jz 2f - jmp 3f -2: + jz .Lnotdone + jmp .Ldone +.Lnotdone: testb $0x8, %cl /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ + jz .Lloop /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi movq %rdi, %rdx /* Save destination page to %rdx */ movq %rsi, %rax /* Save source page to %rax */ + testq %r11, %r11 /* Only actually swap for ::preserve_context */ + jz .Lnoswap + /* copy source page to swap page */ - movq %r10, %rdi + movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx rep ; movsq @@ -314,17 +329,15 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) /* copy swap page to destination page */ movq %rdx, %rdi - movq %r10, %rsi + movq kexec_pa_swap_page(%rip), %rsi +.Lnoswap: movl $512, %ecx rep ; movsq lea PAGE_SIZE(%rax), %rsi - jmp 0b -3: + jmp .Lloop +.Ldone: ANNOTATE_UNRET_SAFE ret int3 SYM_CODE_END(swap_pages) - - .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc -SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 6a17396c8174..0deb4887d6e9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ #include #include #include +#include #undef i386 /* in case the preprocessor is a 32bit one */ @@ -95,7 +96,19 @@ const_pcpu_hot = pcpu_hot; #define BSS_DECRYPTED #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE) +#define KEXEC_RELOCATE_KERNEL \ + . = ALIGN(0x100); \ + __relocate_kernel_start = .; \ + *(.text..relocate_kernel); \ + *(.data..relocate_kernel); \ + __relocate_kernel_end = .; +ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE, + "relocate_kernel code too large!") +#else +#define KEXEC_RELOCATE_KERNEL +#endif PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ @@ -121,19 +134,6 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; _stext = .; - /* bootstrapping code */ - HEAD_TEXT - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - SOFTIRQENTRY_TEXT -#ifdef CONFIG_MITIGATION_RETPOLINE - *(.text..__x86.indirect_thunk) - *(.text..__x86.return_thunk) -#endif - STATIC_CALL_TEXT - ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) ENTRY_TEXT @@ -147,10 +147,26 @@ SECTIONS *(.text..__x86.rethunk_safe) #endif ALIGN_ENTRY_TEXT_END + + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + SOFTIRQENTRY_TEXT +#ifdef CONFIG_MITIGATION_RETPOLINE + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) +#endif + STATIC_CALL_TEXT *(.gnu.warning) } :text = 0xcccccccc + /* bootstrapping code */ + .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { + HEAD_TEXT + } :text = 0xcccccccc + /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -181,6 +197,7 @@ SECTIONS DATA_DATA CONSTRUCTORS + KEXEC_RELOCATE_KERNEL /* rarely changed data like cpu maps */ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 27441e5863b2..e937be979ec8 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -841,10 +841,10 @@ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, const char *symname) { + int headtext = !strcmp(sec_name(sec->shdr.sh_info), ".head.text"); unsigned r_type = ELF64_R_TYPE(rel->r_info); ElfW(Addr) offset = rel->r_offset; int shn_abs = (sym->st_shndx == SHN_ABS) && !is_reloc(S_REL, symname); - if (sym->st_shndx == SHN_UNDEF) return 0; @@ -900,6 +900,12 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, break; } + if (headtext) { + die("Absolute reference to symbol '%s' not permitted in .head.text\n", + symname); + break; + } + /* * Relocation offsets for 64 bit kernels are output * as 32 bits and sign extended back to 64 bits when diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0caa14880c3..b424a5c6ae87 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1001,6 +1001,12 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + /* + * This flow is analogous to hibernation flows that occur + * before creating an image and before jumping from the + * restore kernel to the image one, so it uses the same + * device callbacks as those two flows. + */ pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1011,12 +1017,10 @@ int kernel_kexec(void) error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. + /* + * dpm_suspend_end() must be called after dpm_suspend_start() + * to complete the transition, like in the hibernation flows + * mentioned above. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) @@ -1052,6 +1056,13 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + /* + * This flow is analogous to hibernation flows that occur after + * creating an image and after the image kernel has got control + * back, and in case the devices have been reset or otherwise + * manipulated in the meantime, it uses the device callbacks + * used by the latter. + */ syscore_resume(); Enable_irqs: local_irq_enable();