mirror of
https://github.com/torvalds/linux.git
synced 2025-04-12 16:47:42 +00:00

rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. 
While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> Suggested-by: Matthew Wilcox <willy@infradead.org> Tested-by: Shivank Garg <shivankg@amd.com> Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Christian Brauner <brauner@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Jann Horn <jannh@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Klara Modin <klarasmodin@gmail.com> Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Mateusz Guzik <mjguzik@gmail.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: "Paul E . McKenney" <paulmck@kernel.org> Cc: Peter Xu <peterx@redhat.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Sourav Panda <souravpanda@google.com> Cc: Wei Yang <richard.weiyang@gmail.com> Cc: Will Deacon <will@kernel.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
59 lines
1.8 KiB
C
59 lines
1.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/mm_types.h>
|
|
#include <linux/maple_tree.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/list.h>
|
|
#include <linux/cpumask.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/pgtable.h>
|
|
|
|
#include <linux/atomic.h>
|
|
#include <linux/user_namespace.h>
|
|
#include <linux/iommu.h>
|
|
#include <asm/mmu.h>
|
|
|
|
/*
 * Fallback definition: if no earlier header has defined INIT_MM_CONTEXT
 * (presumably the architecture's <asm/mmu.h> would -- confirm), make it
 * expand to nothing so that using it inside an initializer list adds no
 * entries.
 */
#ifndef INIT_MM_CONTEXT
#define INIT_MM_CONTEXT(name)
#endif
|
|
|
|
/*
 * File-scope definition with no initializer, so every member starts out
 * zero/NULL.
 * NOTE(review): presumably a placeholder ops table for VMAs that have no
 * real vm_ops -- confirm against the users of vma_dummy_vm_ops.
 */
const struct vm_operations_struct vma_dummy_vm_ops;
|
|
|
|
/*
|
|
* For dynamically allocated mm_structs, there is a dynamically sized cpumask
|
|
* at the end of the structure, the size of which depends on the maximum CPU
|
|
* number the system can see. That way we allocate only as much memory for
|
|
* mm_cpumask() as needed for the hundreds, or thousands of processes that
|
|
* a system typically runs.
|
|
*
|
|
* Since there is only one init_mm in the entire system, keep it simple
|
|
* and size this cpu_bitmask to NR_CPUS.
|
|
*/
|
|
struct mm_struct init_mm = {
|
|
.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
|
|
.pgd = swapper_pg_dir,
|
|
.mm_users = ATOMIC_INIT(2),
|
|
.mm_count = ATOMIC_INIT(1),
|
|
.write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
|
|
MMAP_LOCK_INITIALIZER(init_mm)
|
|
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
|
|
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
|
|
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
|
|
#ifdef CONFIG_PER_VMA_LOCK
|
|
.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
|
|
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
|
|
#endif
|
|
.user_ns = &init_user_ns,
|
|
.cpu_bitmap = CPU_BITS_NONE,
|
|
INIT_MM_CONTEXT(init_mm)
|
|
};
|
|
|
|
void setup_initial_init_mm(void *start_code, void *end_code,
|
|
void *end_data, void *brk)
|
|
{
|
|
init_mm.start_code = (unsigned long)start_code;
|
|
init_mm.end_code = (unsigned long)end_code;
|
|
init_mm.end_data = (unsigned long)end_data;
|
|
init_mm.brk = (unsigned long)brk;
|
|
}
|