linux/arch/arm64/kernel/elfcore.c
Mateusz Guzik d6ff4c8f65
fs: avoid mmap sem relocks when coredumping with many missing pages
Dumping processes with large allocated and mostly not-faulted areas is
very slow.

Borrowing a test case from Tavian Barnes:

int main(void) {
    char *mem = mmap(NULL, 1ULL << 40, PROT_READ | PROT_WRITE,
            MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
    printf("%p %m\n", mem);
    if (mem != MAP_FAILED) {
            mem[0] = 1;
    }
    abort();
}

That's 1TB of almost completely not-populated area.

On my test box it takes 13-14 seconds to dump.

The profile shows:
-   99.89%     0.00%  a.out
     entry_SYSCALL_64_after_hwframe
     do_syscall_64
     syscall_exit_to_user_mode
     arch_do_signal_or_restart
   - get_signal
      - 99.89% do_coredump
         - 99.88% elf_core_dump
            - dump_user_range
               - 98.12% get_dump_page
                  - 64.19% __get_user_pages
                     - 40.92% gup_vma_lookup
                        - find_vma
                           - mt_find
                                4.21% __rcu_read_lock
                                1.33% __rcu_read_unlock
                     - 3.14% check_vma_flags
                          0.68% vma_is_secretmem
                       0.61% __cond_resched
                       0.60% vma_pgtable_walk_end
                       0.59% vma_pgtable_walk_begin
                       0.58% no_page_table
                  - 15.13% down_read_killable
                       0.69% __cond_resched
                    13.84% up_read
                 0.58% __cond_resched

Almost 29% of the time is spent relocking the mmap semaphore between
calls to get_dump_page() which find nothing.

Whacking that results in times of 10 seconds (down from 13-14).

While here make the thing killable.

The real problem is the page-sized iteration and the real fix would
patch it up instead. It is left as an exercise for the mm-familiar
reader.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20250119103205.2172432-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-02-21 10:25:32 +01:00

140 lines
2.9 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
#include <linux/coredump.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <asm/cpufeature.h>
#include <asm/mte.h>
#define for_each_mte_vma(cprm, i, m) \
if (system_supports_mte()) \
for (i = 0, m = cprm->vma_meta; \
i < cprm->vma_count; \
i++, m = cprm->vma_meta + i) \
if (m->flags & VM_MTE)
static unsigned long mte_vma_tag_dump_size(struct core_vma_metadata *m)
{
return (m->dump_size >> PAGE_SHIFT) * MTE_PAGE_TAG_STORAGE;
}
/* Derived from dump_user_range(); start/end must be page-aligned */
static int mte_dump_tag_range(struct coredump_params *cprm,
unsigned long start, unsigned long len)
{
int ret = 1;
unsigned long addr;
void *tags = NULL;
int locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page = get_dump_page(addr, &locked);
/*
* get_dump_page() returns NULL when encountering an empty
* page table entry that would otherwise have been filled with
* the zero page. Skip the equivalent tag dump which would
* have been all zeros.
*/
if (!page) {
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;
}
/*
* Pages mapped in user space as !pte_access_permitted() (e.g.
* PROT_EXEC only) may not have the PG_mte_tagged flag set.
*/
if (!page_mte_tagged(page)) {
put_page(page);
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;
}
if (!tags) {
tags = mte_allocate_tag_storage();
if (!tags) {
put_page(page);
ret = 0;
break;
}
}
mte_save_page_tags(page_address(page), tags);
put_page(page);
if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) {
ret = 0;
break;
}
}
if (tags)
mte_free_tag_storage(tags);
return ret;
}
Elf_Half elf_core_extra_phdrs(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
int vma_count = 0;
for_each_mte_vma(cprm, i, m)
vma_count++;
return vma_count;
}
int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
{
int i;
struct core_vma_metadata *m;
for_each_mte_vma(cprm, i, m) {
struct elf_phdr phdr;
phdr.p_type = PT_AARCH64_MEMTAG_MTE;
phdr.p_offset = offset;
phdr.p_vaddr = m->start;
phdr.p_paddr = 0;
phdr.p_filesz = mte_vma_tag_dump_size(m);
phdr.p_memsz = m->end - m->start;
offset += phdr.p_filesz;
phdr.p_flags = 0;
phdr.p_align = 0;
if (!dump_emit(cprm, &phdr, sizeof(phdr)))
return 0;
}
return 1;
}
size_t elf_core_extra_data_size(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
size_t data_size = 0;
for_each_mte_vma(cprm, i, m)
data_size += mte_vma_tag_dump_size(m);
return data_size;
}
int elf_core_write_extra_data(struct coredump_params *cprm)
{
int i;
struct core_vma_metadata *m;
for_each_mte_vma(cprm, i, m) {
if (!mte_dump_tag_range(cprm, m->start, m->dump_size))
return 0;
}
return 1;
}