diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5e351ac52cca..3435a062a208 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1407,14 +1407,21 @@ earlyprintk=serial[,0x...[,baudrate]] earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] - earlyprintk=pciserial[,force],bus:device.function[,baudrate] + earlyprintk=pciserial[,force],bus:device.function[,{nocfg|baudrate}] earlyprintk=xdbc[xhciController#] earlyprintk=bios + earlyprintk=mmio,membase[,{nocfg|baudrate}] earlyprintk is useful when the kernel crashes before the normal console is initialized. It is not enabled by default because it has some cosmetic problems. + Only 32-bit memory addresses are supported for "mmio" + and "pciserial" devices. + + Use "nocfg" to skip UART configuration, assume + BIOS/firmware has configured UART correctly. + Append ",keep" to not disable it when the real console takes over. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9427b5292ca2..15f346f02af0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -460,20 +460,28 @@ config SMP If you don't know what to do here, say N. config X86_X2APIC - bool "Support x2apic" + bool "x2APIC interrupt controller architecture support" depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST) + default y help - This enables x2apic support on CPUs that have this feature. + x2APIC is an interrupt controller architecture, a component of which + (the local APIC) is present in the CPU. It allows faster access to + the local APIC and supports a larger number of CPUs in the system + than the predecessors. - This allows 32-bit apic IDs (so it can support very large systems), - and accesses the local apic via MSRs not via mmio. + x2APIC was introduced in Intel CPUs around 2008 and in AMD EPYC CPUs + in 2019, but it can be disabled by the BIOS. 
It is also frequently + emulated in virtual machines, even when the host CPU does not support + it. Support in the CPU can be checked by executing + grep x2apic /proc/cpuinfo - Some Intel systems circa 2022 and later are locked into x2APIC mode - and can not fall back to the legacy APIC modes if SGX or TDX are - enabled in the BIOS. They will boot with very reduced functionality - without enabling this option. + If this configuration option is disabled, the kernel will boot with + very reduced functionality and performance on some platforms that + have x2APIC enabled. On the other hand, on hardware that does not + support x2APIC, a kernel with this option enabled will just fall back + to older APIC implementations. - If you don't know what to do here, say N. + If in doubt, say Y. config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" @@ -544,16 +552,17 @@ config X86_EXTENDED_PLATFORM CONFIG_64BIT. 32-bit platforms (CONFIG_64BIT=n): - Goldfish (Android emulator) - AMD Elan + Goldfish (mostly Android emulator) + Intel CE media processor (CE4100) SoC + Intel Quark RDC R-321x SoC - SGI 320/540 (Visual Workstation) 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet Merrifield/Moorefield MID devices + Goldfish (mostly Android emulator) If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -667,6 +676,17 @@ config X86_INTEL_QUARK Say Y here if you have a Quark based system such as the Arduino compatible Intel Galileo. +config X86_RDC321X + bool "RDC R-321x SoC" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + select M486 + select X86_REBOOTFIXUPS + help + This option is needed for RDC R-321x system-on-chip, also known + as R-8610-(G). + If you don't have one of these chips, you should say N here. 
+ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on X86 && ACPI && PCI @@ -720,17 +740,6 @@ config IOSF_MBI_DEBUG If you don't require the option or are in doubt, say N. -config X86_RDC321X - bool "RDC R-321x SoC" - depends on X86_32 - depends on X86_EXTENDED_PLATFORM - select M486 - select X86_REBOOTFIXUPS - help - This option is needed for RDC R-321x system-on-chip, also known - as R-8610-(G). - If you don't have one of these chips, you should say N here. - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): @@ -1565,7 +1574,6 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || X86_32 select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 @@ -2212,7 +2220,7 @@ config HOTPLUG_CPU config COMPAT_VDSO def_bool n - prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" + prompt "Workaround for glibc 2.3.2 / 2.3.3 (released in year 2003/2004)" depends on COMPAT_32 help Certain buggy versions of glibc will crash if they are @@ -2901,6 +2909,19 @@ config PCI_MMCONFIG default y depends on PCI && (ACPI || JAILHOUSE_GUEST) depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG) + help + Add support for accessing the PCI configuration space as a memory + mapped area. It is the recommended method if the system supports + this (it must have PCI Express and ACPI for it to be available). + + In the unlikely case that enabling this configuration option causes + problems, the mechanism can be switched off with the 'pci=nommconf' + command line parameter. + + Say N only if you are sure that your platform does not support this + access method or you have problems caused by it. + + Say Y otherwise. 
config PCI_OLPC def_bool y @@ -2915,13 +2936,21 @@ config MMCONF_FAM10H depends on X86_64 && PCI_MMCONFIG && ACPI config PCI_CNB20LE_QUIRK - bool "Read CNB20LE Host Bridge Windows" if EXPERT - depends on PCI + bool "Read PCI host bridge windows from the CNB20LE chipset" if EXPERT + depends on X86_32 && PCI help Read the PCI windows out of the CNB20LE host bridge. This allows PCI hotplug to work on systems with the CNB20LE chipset which do not have ACPI. + The ServerWorks (later Broadcom) CNB20LE was a chipset designed + most probably only for Pentium III. + + To find out if you have such a chipset, search for a PCI device with + 1166:0009 PCI IDs, for example by executing + lspci -nn | grep '1166:0009' + The code is inactive if there is none. + There's no public spec for this chipset, and this functionality is known to be incomplete. diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index cb0911c5dc5d..d83236b96f22 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -70,6 +70,8 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ movq %rdi, 8(%rsp) /* pt_regs->di (overwriting original return address) */ + /* We just clobbered the return address - use the IRET frame for unwinding: */ + UNWIND_HINT_IRET_REGS offset=3*8 .else pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e4d11e3318f0..8a5cc8e70439 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -435,12 +435,8 @@ static inline void call_depth_return_thunk(void) {} * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ -#ifdef CONFIG_MITIGATION_RETPOLINE #define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ "call __x86_indirect_thunk_%V[thunk_target]\n" -#else -#define CALL_NOSPEC "call *%[thunk_target]\n" -#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 138689b8e1d8..b61028cf5c8a 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -600,7 +600,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, unsigned long p_addr = (unsigned long)&mc->hdr.data_code; if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) - return -1; + return false; native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 91639d1e4ec2..c6fefd4585f8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -195,6 +195,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ?: get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -213,9 +214,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for (stack = stack ?: get_stack_pointer(task, regs); - stack; - stack = stack_info.next_sp) { + for (; stack; stack = stack_info.next_sp) { const char *stack_name; stack = PTR_ALIGN(stack, sizeof(long)); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fc1714bad045..611f27e3890c 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -190,7 +190,6 @@ static __init void early_serial_init(char *s) early_serial_hw_init(divisor); } -#ifdef CONFIG_PCI static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value) { u32 __iomem *vaddr = (u32 
__iomem *)addr; @@ -207,6 +206,45 @@ static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset) } ANNOTATE_NOENDBR_SYM(mem32_serial_in); +/* + * early_mmio_serial_init() - Initialize MMIO-based early serial console. + * @s: MMIO-based serial specification. + */ +static __init void early_mmio_serial_init(char *s) +{ + unsigned long baudrate; + unsigned long membase; + char *e; + + if (*s == ',') + s++; + + if (!strncmp(s, "0x", 2)) { + /* NB: only 32-bit addresses are supported. */ + membase = simple_strtoul(s, &e, 16); + early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE); + + static_call_update(serial_in, mem32_serial_in); + static_call_update(serial_out, mem32_serial_out); + + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + if (!strncmp(s, "nocfg", 5)) { + baudrate = 0; + } else { + baudrate = simple_strtoul(s, &e, 0); + if (baudrate == 0 || s == e) + baudrate = DEFAULT_BAUD; + } + + if (baudrate) + early_serial_hw_init(115200 / baudrate); +} + +#ifdef CONFIG_PCI /* * early_pci_serial_init() * @@ -351,6 +389,11 @@ static int __init setup_early_printk(char *buf) keep = (strstr(buf, "keep") != NULL); while (*buf != '\0') { + if (!strncmp(buf, "mmio", 4)) { + early_mmio_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + buf += 4; + } if (!strncmp(buf, "serial", 6)) { buf += 6; early_serial_init(buf); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 1b734a9ff088..91d6341f281f 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -508,7 +508,7 @@ static inline void fpstate_init_fstate(struct fpstate *fpstate) /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems - * 2) fpu_init_fpstate_user() which is invoked from KVM + * 2) fpu_alloc_guest_fpstate() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 
e40861c9cb90..72d8cbc61158 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -984,29 +984,42 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr, return -EINVAL; } -/* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). - * - * If the vma has a linear pfn mapping for the entire range, we get the prot - * from pte and reserve the entire vma range with single reserve_pfn_range call. - */ -int track_pfn_copy(struct vm_area_struct *vma) +int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long *pfn) { + const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start; resource_size_t paddr; - unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; + int rc; - if (vma->vm_flags & VM_PAT) { - if (get_pat_info(vma, &paddr, &pgprot)) - return -EINVAL; - /* reserve the whole chunk covered by vma. */ - return reserve_pfn_range(paddr, vma_size, &pgprot, 1); - } + if (!(src_vma->vm_flags & VM_PAT)) + return 0; + /* + * Duplicate the PAT information for the dst VMA based on the src + * VMA. + */ + if (get_pat_info(src_vma, &paddr, &pgprot)) + return -EINVAL; + rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1); + if (rc) + return rc; + + /* Reservation for the destination VMA succeeded. */ + vm_flags_set(dst_vma, VM_PAT); + *pfn = PHYS_PFN(paddr); return 0; } +void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn) +{ + untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true); + /* + * Reservation was freed, any copied page tables will get cleaned + * up later, but without getting PAT involved again. + */ +} + /* * prot is passed in as a parameter for the new mapping. 
If the vma has * a linear pfn mapping for the entire range, or no vma is provided, @@ -1095,15 +1108,6 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, } } -/* - * untrack_pfn_clear is called if the following situation fits: - * - * 1) while mremapping a pfnmap for a new region, with the old vma after - * its pfnmap page table has been removed. The new vma has a new pfnmap - * to the same pfn & cache type with VM_PAT set. - * 2) while duplicating vm area, the new vma fails to copy the pgtable from - * old vma. - */ void untrack_pfn_clear(struct vm_area_struct *vma) { vm_flags_clear(vma, VM_PAT); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94d267d02372..4c107e17c547 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1508,14 +1508,25 @@ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, } /* - * track_pfn_copy is called when vma that is covering the pfnmap gets - * copied through copy_page_range(). + * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page + * tables copied during copy_page_range(). On success, stores the pfn to be + * passed to untrack_pfn_copy(). */ -static inline int track_pfn_copy(struct vm_area_struct *vma) +static inline int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long *pfn) { return 0; } +/* + * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during + * copy_page_range(), but after track_pfn_copy() was already called. + */ +static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma, + unsigned long pfn) +{ +} + /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or @@ -1528,8 +1539,10 @@ static inline void untrack_pfn(struct vm_area_struct *vma, } /* - * untrack_pfn_clear is called while mremapping a pfnmap for a new region - * or fails to copy pgtable during duplicate vm area. 
+ * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA: + * + * 1) During mremap() on the src VMA after the page tables were moved. + * 2) During fork() on the dst VMA, immediately after duplicating the src VMA. */ static inline void untrack_pfn_clear(struct vm_area_struct *vma) { @@ -1540,7 +1553,10 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); -extern int track_pfn_copy(struct vm_area_struct *vma); +extern int track_pfn_copy(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long *pfn); +extern void untrack_pfn_copy(struct vm_area_struct *dst_vma, + unsigned long pfn); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size, bool mm_wr_locked); extern void untrack_pfn_clear(struct vm_area_struct *vma); diff --git a/kernel/fork.c b/kernel/fork.c index a61a4407ebdf..1b659b07ecd5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -504,6 +504,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); + /* track_pfn_copy() will later take care of copying internal state. 
*/ + if (unlikely(new->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(new); + return new; } diff --git a/mm/memory.c b/mm/memory.c index bea87c082f27..369905596243 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1362,12 +1362,12 @@ int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; - unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; + unsigned long next, pfn; bool is_cow; int ret; @@ -1378,11 +1378,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { - /* - * We do not free on error cases below as remove_vma - * gets called on error from higher level routine - */ - ret = track_pfn_copy(src_vma); + ret = track_pfn_copy(dst_vma, src_vma, &pfn); if (ret) return ret; } @@ -1419,7 +1415,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { - untrack_pfn_clear(dst_vma); ret = -ENOMEM; break; } @@ -1429,6 +1424,8 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } + if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP)) + untrack_pfn_copy(dst_vma, pfn); return ret; }