Mirror of https://github.com/torvalds/linux.git, synced 2025-04-06 00:16:18 +00:00
Merge tag 'mm-stable-2025-03-30-16-52' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:

- The series "Enable strict percpu address space checks" from Uros Bizjak uses x86 named address space qualifiers to provide compile-time checking of percpu area accesses. This has caused a small amount of fallout - two or three issues were reported. In all cases the calling code was found to be incorrect.

- The series "Some cleanup for memcg" from Chen Ridong implements some relatively minor cleanups for the memcontrol code.

- The series "mm: fixes for device-exclusive entries (hmm)" from David Hildenbrand fixes a boatload of issues which David found when using device-exclusive PTE entries when THP is enabled. More work is needed, but this makes things better - our own HMM selftests now succeed.

- The series "mm: zswap: remove z3fold and zbud" from Yosry Ahmed removes the z3fold and zbud implementations. They have been deprecated for half a year and nobody has complained.

- The series "mm: further simplify VMA merge operation" from Lorenzo Stoakes implements numerous simplifications in this area. No runtime effects are anticipated.

- The series "mm/madvise: remove redundant mmap_lock operations from process_madvise()" from SeongJae Park rationalizes the locking in the madvise() implementation. Performance gains of 20-25% were observed in one MADV_DONTNEED microbenchmark.

- The series "Tiny cleanup and improvements about SWAP code" from Baoquan He contains a number of touchups to issues which Baoquan noticed when working on the swap code.

- The series "mm: kmemleak: Usability improvements" from Catalin Marinas implements a couple of improvements to the kmemleak user-visible output.

- The series "mm/damon/paddr: fix large folios access and schemes handling" from Usama Arif provides a couple of fixes for DAMON's handling of large folios.

- The series "mm/damon/core: fix wrong and/or useless damos_walk() behaviors" from SeongJae Park fixes a few issues with the accuracy of kdamond's walking of DAMON regions.

- The series "expose mapping wrprotect, fix fb_defio use" from Lorenzo Stoakes changes the interaction between framebuffer deferred-io and core MM. No functional changes are anticipated - this is preparatory work for the future removal of page structure fields.

- The series "mm/damon: add support for hugepage_size DAMOS filter" from Usama Arif adds a DAMOS filter which permits the filtering by huge page sizes.

- The series "mm: permit guard regions for file-backed/shmem mappings" from Lorenzo Stoakes extends the guard region feature from its present "anon mappings only" state. The feature now covers shmem and file-backed mappings.

- The series "mm: batched unmap lazyfree large folios during reclamation" from Barry Song cleans up and speeds up the unmapping for pte-mapped large folios.

- The series "reimplement per-vma lock as a refcount" from Suren Baghdasaryan puts the vm_lock back into the vma. Our reasons for pulling it out were largely bogus and that change made the code more messy. This patchset provides small (0-10%) improvements on one microbenchmark.

- The series "Docs/mm/damon: misc DAMOS filters documentation fixes and improves" from SeongJae Park does some maintenance work on the DAMON docs.

- The series "hugetlb/CMA improvements for large systems" from Frank van der Linden addresses a pile of issues which have been observed when using CMA on large machines.

- The series "mm/damon: introduce DAMOS filter type for unmapped pages" from SeongJae Park enables users of DAMON/DAMOS to filter by the page's mapped/unmapped status.

- The series "zsmalloc/zram: there be preemption" from Sergey Senozhatsky teaches zram to run its compression and decompression operations preemptibly.

- The series "selftests/mm: Some cleanups from trying to run them" from Brendan Jackman fixes a pile of unrelated issues which Brendan encountered while running our selftests.

- The series "fs/proc/task_mmu: add guard region bit to pagemap" from Lorenzo Stoakes permits userspace to use /proc/pid/pagemap to determine whether a particular page is a guard page.

- The series "mm, swap: remove swap slot cache" from Kairui Song removes the swap slot cache from the allocation path - it simply wasn't being effective.

- The series "mm: cleanups for device-exclusive entries (hmm)" from David Hildenbrand implements a number of unrelated cleanups in this code.

- The series "mm: Rework generic PTDUMP configs" from Anshuman Khandual implements a number of preparatory cleanups to the GENERIC_PTDUMP Kconfig logic.

- The series "mm/damon: auto-tune aggregation interval" from SeongJae Park implements a feedback-driven automatic tuning feature for DAMON's aggregation interval tuning.

- The series "Fix lazy mmu mode" from Ryan Roberts fixes some issues in powerpc, sparc and x86 lazy MMU implementations. Ryan did this in preparation for implementing lazy mmu mode for arm64 to optimize vmalloc.

- The series "mm/page_alloc: Some clarifications for migratetype fallback" from Brendan Jackman reworks some commentary to make the code easier to follow.

- The series "page_counter cleanup and size reduction" from Shakeel Butt cleans up the page_counter code and fixes a size increase which we accidentally added late last year.

- The series "Add a command line option that enables control of how many threads should be used to allocate huge pages" from Thomas Prescher does that. It allows the careful operator to significantly reduce boot time by tuning the parallelization of huge page initialization.

- The series "Fix calculations in trace_balance_dirty_pages() for cgwb" from Tang Yizhou fixes the tracing output from the dirty page balancing code.

- The series "mm/damon: make allow filters after reject filters useful and intuitive" from SeongJae Park improves the handling of allow and reject filters. Behaviour is made more consistent and the documentation is updated accordingly.

- The series "Switch zswap to object read/write APIs" from Yosry Ahmed updates zswap to the new object read/write APIs and thus permits the removal of some legacy code from zpool and zsmalloc.

- The series "Some trivial cleanups for shmem" from Baolin Wang does as it claims.

- The series "fs/dax: Fix ZONE_DEVICE page reference counts" from Alistair Popple regularizes the weird ZONE_DEVICE page refcount handling in DAX, permitting the removal of a number of special-case checks.

- The series "refactor mremap and fix bug" from Lorenzo Stoakes is a preparatory refactoring and cleanup of the mremap() code.

- The series "mm: MM owner tracking for large folios (!hugetlb) + CONFIG_NO_PAGE_MAPCOUNT" from David Hildenbrand reworks the manner in which we determine whether a large folio is known to be mapped exclusively into a single MM.

- The series "mm/damon: add sysfs dirs for managing DAMOS filters based on handling layers" from SeongJae Park adds a couple of new sysfs directories to ease the management of DAMON/DAMOS filters.

- The series "arch, mm: reduce code duplication in mem_init()" from Mike Rapoport consolidates many per-arch implementations of mem_init() into generic code, where that is practical.

- The series "mm/damon/sysfs: commit parameters online via damon_call()" from SeongJae Park continues the cleaning up of sysfs access to DAMON internal data.

- The series "mm: page_ext: Introduce new iteration API" from Luiz Capitulino reworks the page_ext initialization to fix a boot-time crash which was observed with an unusual combination of compile and cmdline options.

- The series "Buddy allocator like (or non-uniform) folio split" from Zi Yan reworks the code to split a folio into smaller folios. The main benefit is lessened memory consumption: fewer post-split folios are generated.

- The series "Minimize xa_node allocation during xarry split" from Zi Yan reduces the number of xarray xa_nodes which are generated during an xarray split.

- The series "drivers/base/memory: Two cleanups" from Gavin Shan performs some maintenance work on the drivers/base/memory code.

- The series "Add tracepoints for lowmem reserves, watermarks and totalreserve_pages" from Martin Liu adds some more tracepoints to the page allocator code.

- The series "mm/madvise: cleanup requests validations and classifications" from SeongJae Park cleans up some warts which SeongJae observed during his earlier madvise work.

- The series "mm/hwpoison: Fix regressions in memory failure handling" from Shuai Xue addresses two quite serious regressions which Shuai has observed in the memory-failure implementation.

- The series "mm: reliable huge page allocator" from Johannes Weiner makes huge page allocations cheaper and more reliable by reducing fragmentation.

- The series "Minor memcg cleanups & prep for memdescs" from Matthew Wilcox is preparatory work for the future implementation of memdescs.

- The series "track memory used by balloon drivers" from Nico Pache introduces a way to track memory used by our various balloon drivers.

- The series "mm/damon: introduce DAMOS filter type for active pages" from Nhat Pham permits users to filter for active/inactive pages, separately for file and anon pages.

- The series "Adding Proactive Memory Reclaim Statistics" from Hao Jia separates the proactive reclaim statistics from the direct reclaim statistics.

- The series "mm/vmscan: don't try to reclaim hwpoison folio" from Jinjiang Tu fixes our handling of hwpoisoned pages within the reclaim code.

* tag 'mm-stable-2025-03-30-16-52' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (431 commits)
  mm/page_alloc: remove unnecessary __maybe_unused in order_to_pindex()
  x86/mm: restore early initialization of high_memory for 32-bits
  mm/vmscan: don't try to reclaim hwpoison folio
  mm/hwpoison: introduce folio_contain_hwpoisoned_page() helper
  cgroup: docs: add pswpin and pswpout items in cgroup v2 doc
  mm: vmscan: split proactive reclaim statistics from direct reclaim statistics
  selftests/mm: speed up split_huge_page_test
  selftests/mm: uffd-unit-tests support for hugepages > 2M
  docs/mm/damon/design: document active DAMOS filter type
  mm/damon: implement a new DAMOS filter type for active pages
  fs/dax: don't disassociate zero page entries
  MM documentation: add "Unaccepted" meminfo entry
  selftests/mm: add commentary about 9pfs bugs
  fork: use __vmalloc_node() for stack allocation
  docs/mm: Physical Memory: Populate the "Zones" section
  xen: balloon: update the NR_BALLOON_PAGES state
  hv_balloon: update the NR_BALLOON_PAGES state
  balloon_compaction: update the NR_BALLOON_PAGES state
  meminfo: add a per node counter for balloon drivers
  mm: remove references to folio in __memcg_kmem_uncharge_page()
  ...
commit eb0ece1602

CREDITS
@@ -1895,6 +1895,7 @@ S: Czech Republic

N: Seth Jennings
E: sjenning@redhat.com
D: Creation and maintenance of zswap
D: Creation and maintenace of the zbud allocator

N: Jeremy Kerr
D: Maintainer of SPU File System

@@ -3803,6 +3804,7 @@ N: Dan Streetman
E: ddstreet@ieee.org
D: Maintenance and development of zswap
D: Creation and maintenance of the zpool API
D: Maintenace of the zbud allocator

N: Drew Sullivan
E: drew@ss.org

@@ -4330,6 +4332,7 @@ S: England

N: Vitaly Wool
E: vitaly.wool@konsulko.com
D: Maintenance and development of zswap
D: Maintenance and development of z3fold

N: Chris Wright
E: chrisw@sous-sol.org
@@ -22,14 +22,6 @@ Description:
device. The reset operation frees all the memory associated
with this device.

What: /sys/block/zram<id>/max_comp_streams
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Description:
The max_comp_streams file is read-write and specifies the
number of backend's zcomp_strm compression streams (number of
concurrent compress operations).

What: /sys/block/zram<id>/comp_algorithm
Date: February 2014
Contact: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
@@ -29,3 +29,16 @@ Date: Feb 2024
Contact: Anshuman Khandual <anshuman.khandual@arm.com>
Description:
the number of pages CMA API succeeded to release

What: /sys/kernel/mm/cma/<cma-heap-name>/total_pages
Date: Jun 2024
Contact: Frank van der Linden <fvdl@google.com>
Description:
The size of the CMA area in pages.

What: /sys/kernel/mm/cma/<cma-heap-name>/available_pages
Date: Jun 2024
Contact: Frank van der Linden <fvdl@google.com>
Description:
The number of pages in the CMA area that are still
available for CMA allocation.
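Both new counters are plain decimal sysfs files, so they can be read with ordinary file I/O. A minimal, hedged C sketch (not part of this commit; the area name "reserved" is a placeholder, and real area names are platform dependent)::

  /* Hedged sketch: print the documented per-area CMA counters. */
  #include <stdio.h>

  static long read_cma_counter(const char *area, const char *file)
  {
          char path[256];
          long val = -1;
          FILE *f;

          snprintf(path, sizeof(path), "/sys/kernel/mm/cma/%s/%s", area, file);
          f = fopen(path, "r");
          if (!f)
                  return -1;
          if (fscanf(f, "%ld", &val) != 1)
                  val = -1;
          fclose(f);
          return val;
  }

  int main(void)
  {
          const char *area = "reserved";  /* placeholder CMA area name */

          printf("%s: total_pages=%ld available_pages=%ld\n", area,
                 read_cma_counter(area, "total_pages"),
                 read_cma_counter(area, "available_pages"));
          return 0;
  }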
@@ -91,6 +91,36 @@ Description: Writing a value to this file sets the update interval of the
DAMON context in microseconds as the value. Reading this file
returns the value.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/intrvals_goal/access_bp
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a value to this file sets the monitoring intervals
auto-tuning target DAMON-observed access events ratio within
the given time interval (aggrs in same directory), in bp
(1/10,000). Reading this file returns the value.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/intrvals_goal/aggrs
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a value to this file sets the time interval to achieve
the monitoring intervals auto-tuning target DAMON-observed
access events ratio (access_bp in same directory) within.
Reading this file returns the value.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/intrvals_goal/min_sample_us
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a value to this file sets the minimum value of
auto-tuned sampling interval in microseconds. Reading this
file returns the value.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/intrvals_goal/max_sample_us
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a value to this file sets the maximum value of
auto-tuned sampling interval in microseconds. Reading this
file returns the value.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/nr_regions/min
Date: Mar 2022

@@ -345,6 +375,20 @@ Description: If 'addr' is written to the 'type' file, writing to or reading
from this file sets or gets the end address of the address
range for the filter.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/min
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'hugepage_size' is written to the 'type' file, writing to
or reading from this file sets or gets the minimum size of the
hugepage for the filter.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/max
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: If 'hugepage_size' is written to the 'type' file, writing to
or reading from this file sets or gets the maximum size of the
hugepage for the filter.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters/<F>/target_idx
Date: Dec 2022
Contact: SeongJae Park <sj@kernel.org>

@@ -365,6 +409,22 @@ Description: Writing 'Y' or 'N' to this file sets whether to allow or reject
applying the scheme's action to the memory that satisfies the
'type' and the 'matching' of the directory.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/core_filters
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Directory for DAMON core layer-handled DAMOS filters. Files
under this directory works same to those of
/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
directory.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/ops_filters
Date: Feb 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Directory for DAMON operations set layer-handled DAMOS filters.
Files under this directory works same to those of
/sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/filters
directory.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/stats/nr_tried
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@@ -971,6 +971,16 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
initialized after each and every call to kmem_cache_alloc(), which renders
reference-free spinlock acquisition completely unsafe. Therefore, when
using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
If using refcount_t, the specialized refcount_{add|inc}_not_zero_acquire()
and refcount_set_release() APIs should be used to ensure correct operation
ordering when verifying object identity and when initializing newly
allocated objects. Acquire fence in refcount_{add|inc}_not_zero_acquire()
ensures that identity checks happen *after* reference count is taken.
refcount_set_release() should be called after a newly allocated object is
fully initialized and release fence ensures that new values are visible
*before* refcount can be successfully taken by other users. Once
refcount_set_release() is called, the object should be considered visible
by other tasks.
(Those willing to initialize their locks in a kmem_cache constructor
may also use locking, including cache-friendly sequence locking.)
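To make the ordering rules above concrete, here is a rough C sketch of the lookup/initialization pattern the checklist describes. It is not code from this commit; the cache, table and structure names are hypothetical, and the cache is assumed to be created with ``SLAB_TYPESAFE_BY_RCU``::

  /* Hedged sketch of the SLAB_TYPESAFE_BY_RCU pattern described above.
   * All names are hypothetical. */
  #include <linux/hashtable.h>
  #include <linux/rcupdate.h>
  #include <linux/refcount.h>
  #include <linux/slab.h>

  struct foo {
          refcount_t ref;
          unsigned long key;
          struct hlist_node node;
  };

  static struct kmem_cache *foo_cache;    /* created with SLAB_TYPESAFE_BY_RCU */
  static DEFINE_HASHTABLE(foo_table, 8);

  static struct foo *foo_create(unsigned long key)
  {
          struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

          if (!f)
                  return NULL;
          f->key = key;
          /* Release fence: the fields initialized above are visible before
           * any lookup can successfully take a reference. */
          refcount_set_release(&f->ref, 1);
          hash_add_rcu(foo_table, &f->node, key);
          return f;
  }

  static struct foo *foo_lookup(unsigned long key)
  {
          struct foo *f;

          rcu_read_lock();
          hash_for_each_possible_rcu(foo_table, f, node, key) {
                  /* Acquire fence: take the reference first ... */
                  if (!refcount_inc_not_zero_acquire(&f->ref))
                          continue;
                  /* ... so this identity re-check happens after it; the
                   * object may have been freed and reused meanwhile. */
                  if (f->key == key) {
                          rcu_read_unlock();
                          return f;
                  }
                  refcount_dec(&f->ref); /* recycled object, drop our reference */
          }
          rcu_read_unlock();
          return NULL;
  }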
@@ -54,7 +54,7 @@ The list of possible return codes:
If you use 'echo', the returned value is set by the 'echo' utility,
and, in general case, something like::

echo 3 > /sys/block/zram0/max_comp_streams
echo foo > /sys/block/zram0/comp_algorithm
if [ $? -ne 0 ]; then
handle_error
fi

@@ -73,21 +73,7 @@ This creates 4 devices: /dev/zram{0,1,2,3}
num_devices parameter is optional and tells zram how many devices should be
pre-created. Default: 1.

2) Set max number of compression streams
========================================

Regardless of the value passed to this attribute, ZRAM will always
allocate multiple compression streams - one per online CPU - thus
allowing several concurrent compression operations. The number of
allocated compression streams goes down when some of the CPUs
become offline. There is no single-compression-stream mode anymore,
unless you are running a UP system or have only 1 CPU online.

To find out how many streams are currently available::

cat /sys/block/zram0/max_comp_streams

3) Select compression algorithm
2) Select compression algorithm
===============================

Using comp_algorithm device attribute one can see available and

@@ -107,7 +93,7 @@ Examples::
For the time being, the `comp_algorithm` content shows only compression
algorithms that are supported by zram.

4) Set compression algorithm parameters: Optional
3) Set compression algorithm parameters: Optional
=================================================

Compression algorithms may support specific parameters which can be

@@ -138,7 +124,7 @@ better the compression ratio, it even can take negatives values for some
algorithms), for other algorithms `level` is acceleration level (the higher
the value the lower the compression ratio).

5) Set Disksize
4) Set Disksize
===============

Set disk size by writing the value to sysfs node 'disksize'.

@@ -158,7 +144,7 @@ There is little point creating a zram of greater than twice the size of memory
since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
size of the disk when not in use so a huge zram is wasteful.

6) Set memory limit: Optional
5) Set memory limit: Optional
=============================

Set memory limit by writing the value to sysfs node 'mem_limit'.

@@ -177,7 +163,7 @@ Examples::
# To disable memory limit
echo 0 > /sys/block/zram0/mem_limit

7) Activate
6) Activate
===========

::

@@ -188,7 +174,7 @@ Examples::
mkfs.ext4 /dev/zram1
mount /dev/zram1 /tmp

8) Add/remove zram devices
7) Add/remove zram devices
==========================

zram provides a control interface, which enables dynamic (on-demand) device

@@ -208,7 +194,7 @@ execute::

echo X > /sys/class/zram-control/hot_remove

9) Stats
8) Stats
========

Per-device statistics are exported as various nodes under /sys/block/zram<id>/

@@ -228,8 +214,6 @@ mem_limit WO specifies the maximum amount of memory ZRAM can
writeback_limit WO specifies the maximum amount of write IO zram
can write out to backing device as 4KB unit
writeback_limit_enable RW show and set writeback_limit feature
max_comp_streams RW the number of possible concurrent compress
operations
comp_algorithm RW show and change the compression algorithm
algorithm_params WO setup compression algorithm parameters
compact WO trigger memory compaction

@@ -310,7 +294,7 @@ a single line of text and contains the following stats separated by whitespace:
Unit: 4K bytes
============== =============================================================

10) Deactivate
9) Deactivate
==============

::

swapoff /dev/zram0
umount /dev/zram1

11) Reset
10) Reset
=========

Write any positive value to 'reset' sysfs node::
@@ -610,6 +610,10 @@ memory.stat file includes following statistics:

'rss + mapped_file" will give you resident set size of cgroup.

Note that some kernel configurations might account complete larger
allocations (e.g., THP) towards 'rss' and 'mapped_file', even if
only some, but not all that memory is mapped.

(Note: file and shmem may be shared among other cgroups. In that case,
mapped_file is accounted only when the memory cgroup is owner of page
cache.)
@@ -1445,7 +1445,10 @@ The following nested keys are defined.

anon
Amount of memory used in anonymous mappings such as
brk(), sbrk(), and mmap(MAP_ANONYMOUS)
brk(), sbrk(), and mmap(MAP_ANONYMOUS). Note that
some kernel configurations might account complete larger
allocations (e.g., THP) if only some, but not all the
memory of such an allocation is mapped anymore.

file
Amount of memory used to cache filesystem data,

@@ -1488,7 +1491,10 @@ The following nested keys are defined.
Amount of application memory swapped out to zswap.

file_mapped
Amount of cached filesystem data mapped with mmap()
Amount of cached filesystem data mapped with mmap(). Note
that some kernel configurations might account complete
larger allocations (e.g., THP) if only some, but not
not all the memory of such an allocation is mapped.

file_dirty
Amount of cached filesystem data that was modified but

@@ -1560,6 +1566,12 @@ The following nested keys are defined.
workingset_nodereclaim
Number of times a shadow node has been reclaimed

pswpin (npn)
Number of pages swapped into memory

pswpout (npn)
Number of pages swapped out of memory

pgscan (npn)
Amount of scanned pages (in an inactive LRU list)

@@ -1575,6 +1587,9 @@ The following nested keys are defined.
pgscan_khugepaged (npn)
Amount of scanned pages by khugepaged (in an inactive LRU list)

pgscan_proactive (npn)
Amount of scanned pages proactively (in an inactive LRU list)

pgsteal_kswapd (npn)
Amount of reclaimed pages by kswapd

@@ -1584,6 +1599,9 @@ The following nested keys are defined.
pgsteal_khugepaged (npn)
Amount of reclaimed pages by khugepaged

pgsteal_proactive (npn)
Amount of reclaimed pages proactively

pgfault (npn)
Total number of page faults incurred

@@ -1661,6 +1679,9 @@ The following nested keys are defined.
pgdemote_khugepaged
Number of pages demoted by khugepaged.

pgdemote_proactive
Number of pages demoted by proactively.

hugetlb
Amount of memory used by hugetlb pages. This metric only shows
up if hugetlb usage is accounted for in memory.current (i.e.
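The new ``pswpin``/``pswpout`` and ``*_proactive`` keys above appear as ordinary "key value" lines in ``memory.stat``. A small, hedged C example of picking them out (not part of this commit; the cgroup path is a placeholder)::

  /* Hedged example: print selected memory.stat keys for one cgroup v2 group. */
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          static const char * const keys[] = {
                  "pswpin", "pswpout", "pgscan_proactive",
                  "pgsteal_proactive", "pgdemote_proactive",
          };
          char line[256];
          size_t i, n;
          /* Placeholder path; point it at the cgroup of interest. */
          FILE *f = fopen("/sys/fs/cgroup/memory.stat", "r");

          if (!f)
                  return 1;
          while (fgets(line, sizeof(line), f)) {
                  for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
                          n = strlen(keys[i]);
                          if (!strncmp(line, keys[i], n) && line[n] == ' ')
                                  fputs(line, stdout);
                  }
          }
          fclose(f);
          return 0;
  }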
@@ -1866,7 +1866,7 @@
hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET
registers. Default set by CONFIG_HPET_MMAP_DEFAULT.

hugepages= [HW] Number of HugeTLB pages to allocate at boot.
hugepages= [HW,EARLY] Number of HugeTLB pages to allocate at boot.
If this follows hugepagesz (below), it specifies
the number of pages of hugepagesz to be allocated.
If this is the first HugeTLB parameter on the command

@@ -1878,15 +1878,24 @@
<node>:<integer>[,<node>:<integer>]

hugepagesz=
[HW] The size of the HugeTLB pages. This is used in
conjunction with hugepages (above) to allocate huge
pages of a specific size at boot. The pair
hugepagesz=X hugepages=Y can be specified once for
each supported huge page size. Huge page sizes are
architecture dependent. See also
[HW,EARLY] The size of the HugeTLB pages. This is
used in conjunction with hugepages (above) to
allocate huge pages of a specific size at boot. The
pair hugepagesz=X hugepages=Y can be specified once
for each supported huge page size. Huge page sizes
are architecture dependent. See also
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]

hugepage_alloc_threads=
[HW] The number of threads that should be used to
allocate hugepages during boot. This option can be
used to improve system bootup time when allocating
a large amount of huge pages.
The default value is 25% of the available hardware threads.

Note that this parameter only applies to non-gigantic huge pages.

hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation
of gigantic hugepages. Or using node format, the size
of a CMA area per node can be specified.

@@ -1897,6 +1906,13 @@
hugepages using the CMA allocator. If enabled, the
boot-time allocation of gigantic hugepages is skipped.

hugetlb_cma_only=
[HW,CMA,EARLY] When allocating new HugeTLB pages, only
try to allocate from the CMA areas.

This option does nothing if hugetlb_cma= is not also
specified.

hugetlb_free_vmemmap=
[KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
enabled.
@@ -12,10 +12,16 @@ its CMA name like below:

The structure of the files created under that directory is as follows:

- [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
- [RO] base_pfn: The base PFN (Page Frame Number) of the CMA area.
This is the same as ranges/0/base_pfn.
- [RO] count: Amount of memory in the CMA area.
- [RO] order_per_bit: Order of pages represented by one bit.
- [RO] bitmap: The bitmap of page states in the zone.
- [RO] bitmap: The bitmap of allocated pages in the area.
This is the same as ranges/0/base_pfn.
- [RO] ranges/N/base_pfn: The base PFN of contiguous range N
in the CMA area.
- [RO] ranges/N/bitmap: The bit map of allocated pages in
range N in the CMA area.
- [WO] alloc: Allocate N pages from that CMA area. For example::

echo 5 > <debugfs>/cma/<cma_name>/alloc
@@ -64,6 +64,7 @@ comma (",").
│ │ │ │ :ref:`0 <sysfs_context>`/avail_operations,operations
│ │ │ │ │ :ref:`monitoring_attrs <sysfs_monitoring_attrs>`/
│ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
│ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us
│ │ │ │ │ │ nr_regions/min,max
│ │ │ │ │ :ref:`targets <sysfs_targets>`/nr_targets
│ │ │ │ │ │ :ref:`0 <sysfs_target>`/pid_target

@@ -82,8 +83,8 @@ comma (",").
│ │ │ │ │ │ │ │ :ref:`goals <sysfs_schemes_quota_goals>`/nr_goals
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
│ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
│ │ │ │ │ │ │ :ref:`filters <sysfs_filters>`/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
│ │ │ │ │ │ │ :ref:`{core_,ops_,}filters <sysfs_filters>`/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max
│ │ │ │ │ │ │ :ref:`stats <sysfs_schemes_stats>`/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
│ │ │ │ │ │ │ :ref:`tried_regions <sysfs_schemes_tried_regions>`/total_bytes
│ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed

@@ -132,6 +133,11 @@ Users can write below commands for the kdamond to the ``state`` file.
- ``off``: Stop running.
- ``commit``: Read the user inputs in the sysfs files except ``state`` file
again.
- ``update_tuned_intervals``: Update the contents of ``sample_us`` and
``aggr_us`` files of the kdamond with the auto-tuning applied ``sampling
interval`` and ``aggregation interval`` for the files. Please refer to
:ref:`intervals_goal section <damon_usage_sysfs_monitoring_intervals_goal>`
for more details.
- ``commit_schemes_quota_goals``: Read the DAMON-based operation schemes'
:ref:`quota goals <sysfs_schemes_quota_goals>`.
- ``update_schemes_stats``: Update the contents of stats files for each

@@ -213,6 +219,25 @@ writing to and rading from the files.
For more details about the intervals and monitoring regions range, please refer
to the Design document (:doc:`/mm/damon/design`).

.. _damon_usage_sysfs_monitoring_intervals_goal:

contexts/<N>/monitoring_attrs/intervals/intervals_goal/
-------------------------------------------------------

Under the ``intervals`` directory, one directory for automated tuning of
``sample_us`` and ``aggr_us``, namely ``intervals_goal`` directory also exists.
Under the directory, four files for the auto-tuning control, namely
``access_bp``, ``aggrs``, ``min_sample_us`` and ``max_sample_us`` exist.
Please refer to the :ref:`design document of the feature
<damon_design_monitoring_intervals_autotuning>` for the internal of the tuning
mechanism. Reading and writing the four files under ``intervals_goal``
directory shows and updates the tuning parameters that described in the
:ref:design doc <damon_design_monitoring_intervals_autotuning>` with the same
names. The tuning starts with the user-set ``sample_us`` and ``aggr_us``. The
tuning-applied current values of the two intervals can be read from the
``sample_us`` and ``aggr_us`` files after writing ``update_tuned_intervals`` to
the ``state`` file.

.. _sysfs_targets:

contexts/<N>/targets/

@@ -282,9 +307,10 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme.
schemes/<N>/
------------

In each scheme directory, five directories (``access_pattern``, ``quotas``,
``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and three files
(``action``, ``target_nid`` and ``apply_interval``) exist.
In each scheme directory, seven directories (``access_pattern``, ``quotas``,
``watermarks``, ``core_filters``, ``ops_filters``, ``filters``, ``stats``, and
``tried_regions``) and three files (``action``, ``target_nid`` and
``apply_interval``) exist.

The ``action`` file is for setting and getting the scheme's :ref:`action
<damon_design_damos_action>`. The keywords that can be written to and read

@@ -395,33 +421,43 @@ The ``interval`` should written in microseconds unit.

.. _sysfs_filters:

schemes/<N>/filters/
--------------------
schemes/<N>/{core\_,ops\_,}filters/
-----------------------------------

The directory for the :ref:`filters <damon_design_damos_filters>` of the given
Directories for :ref:`filters <damon_design_damos_filters>` of the given
DAMON-based operation scheme.

In the beginning, this directory has only one file, ``nr_filters``. Writing a
``core_filters`` and ``ops_filters`` directories are for the filters handled by
the DAMON core layer and operations set layer, respectively. ``filters``
directory can be used for installing filters regardless of their handled
layers. Filters that requested by ``core_filters`` and ``ops_filters`` will be
installed before those of ``filters``. All three directories have same files.

Use of ``filters`` directory can make expecting evaluation orders of given
filters with the files under directory bit confusing. Users are hence
recommended to use ``core_filters`` and ``ops_filters`` directories. The
``filters`` directory could be deprecated in future.

In the beginning, the directory has only one file, ``nr_filters``. Writing a
number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each filter. The filters are evaluated
in the numeric order.

Each filter directory contains seven files, namely ``type``, ``matching``,
``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``.
To ``type`` file, you can write one of five special keywords: ``anon`` for
anonymous pages, ``memcg`` for specific memory cgroup, ``young`` for young
pages, ``addr`` for specific address range (an open-ended interval), or
``target`` for specific DAMON monitoring target filtering. Meaning of the
types are same to the description on the :ref:`design doc
<damon_design_damos_filters>`.
Each filter directory contains nine files, namely ``type``, ``matching``,
``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, ``min``, ``max``
and ``target_idx``. To ``type`` file, you can write the type of the filter.
Refer to :ref:`the design doc <damon_design_damos_filters>` for available type
names, their meaning and on what layer those are handled.

In case of the memory cgroup filtering, you can specify the memory cgroup of
the interest by writing the path of the memory cgroup from the cgroups mount
point to ``memcg_path`` file. In case of the address range filtering, you can
specify the start and end address of the range to ``addr_start`` and
``addr_end`` files, respectively. For the DAMON monitoring target filtering,
you can specify the index of the target between the list of the DAMON context's
monitoring targets list to ``target_idx`` file.
For ``memcg`` type, you can specify the memory cgroup of the interest by
writing the path of the memory cgroup from the cgroups mount point to
``memcg_path`` file. For ``addr`` type, you can specify the start and end
address of the range (open-ended interval) to ``addr_start`` and ``addr_end``
files, respectively. For ``hugepage_size`` type, you can specify the minimum
and maximum size of the range (closed interval) to ``min`` and ``max`` files,
respectively. For ``target`` type, you can specify the index of the target
between the list of the DAMON context's monitoring targets list to
``target_idx`` file.

You can write ``Y`` or ``N`` to ``matching`` file to specify whether the filter
is for memory that matches the ``type``. You can write ``Y`` or ``N`` to

@@ -431,6 +467,7 @@ the ``type`` and ``matching`` should be allowed or not.
For example, below restricts a DAMOS action to be applied to only non-anonymous
pages of all memory cgroups except ``/having_care_already``.::

# cd ops_filters/0/
# echo 2 > nr_filters
# # disallow anonymous pages
echo anon > 0/type
@@ -145,7 +145,17 @@ hugepages

It will allocate 1 2M hugepage on node0 and 2 2M hugepages on node1.
If the node number is invalid, the parameter will be ignored.

hugepage_alloc_threads
Specify the number of threads that should be used to allocate hugepages
during boot. This parameter can be used to improve system bootup time
when allocating a large amount of huge pages.

The default value is 25% of the available hardware threads.
Example to use 8 allocation threads::

hugepage_alloc_threads=8

Note that this parameter only applies to non-gigantic huge pages.

default_hugepagesz
Specify the default huge page size. This parameter can
only be specified once on the command line. default_hugepagesz can
@@ -21,7 +21,8 @@ There are four components to pagemap:
* Bit 56 page exclusively mapped (since 4.2)
* Bit 57 pte is uffd-wp write-protected (since 5.13) (see
Documentation/admin-guide/mm/userfaultfd.rst)
* Bits 58-60 zero
* Bit 58 pte is a guard region (since 6.15) (see madvise (2) man page)
* Bits 59-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present

@@ -37,12 +38,28 @@ There are four components to pagemap:
precisely which pages are mapped (or in swap) and comparing mapped
pages between processes.

Traditionally, bit 56 indicates that a page is mapped exactly once and bit
56 is clear when a page is mapped multiple times, even when mapped in the
same process multiple times. In some kernel configurations, the semantics
for pages part of a larger allocation (e.g., THP) can differ: bit 56 is set
if all pages part of the corresponding large allocation are *certainly*
mapped in the same process, even if the page is mapped multiple times in that
process. Bit 56 is clear when any page page of the larger allocation
is *maybe* mapped in a different process. In some cases, a large allocation
might be treated as "maybe mapped by multiple processes" even though this
is no longer the case.

Efficient users of this interface will use ``/proc/pid/maps`` to
determine which areas of memory are actually mapped and llseek to
skip over unmapped regions.

* ``/proc/kpagecount``. This file contains a 64-bit count of the number of
times each page is mapped, indexed by PFN.
times each page is mapped, indexed by PFN. Some kernel configurations do
not track the precise number of times a page part of a larger allocation
(e.g., THP) is mapped. In these configurations, the average number of
mappings per page in this larger allocation is returned instead. However,
if any page of the large allocation is mapped, the returned value will
be at least 1.

The page-types tool in the tools/mm directory can be used to query the
number of times a page is mapped.
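For illustration only (not part of this commit), one pagemap entry can be read and a few of its flag bits decoded as follows; the bit positions are the ones listed in the table above::

  /* Hedged example: read one /proc/self/pagemap entry and decode a few bits
   * (63 = present, 62 = swapped, 58 = guard region, per the table above). */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  int main(void)
  {
          long psize = sysconf(_SC_PAGESIZE);
          char *buf = malloc(psize);
          int fd = open("/proc/self/pagemap", O_RDONLY);
          uint64_t entry;
          off_t off;

          if (fd < 0 || !buf)
                  return 1;
          buf[0] = 1;     /* touch the page so it is present */
          /* One 64-bit entry per virtual page, indexed by virtual page number. */
          off = (off_t)((uintptr_t)buf / psize) * sizeof(entry);
          if (pread(fd, &entry, sizeof(entry), off) != (ssize_t)sizeof(entry))
                  return 1;
          printf("present=%llu swapped=%llu guard=%llu\n",
                 (unsigned long long)((entry >> 63) & 1),
                 (unsigned long long)((entry >> 62) & 1),
                 (unsigned long long)((entry >> 58) & 1));
          close(fd);
          free(buf);
          return 0;
  }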
@@ -60,15 +60,13 @@ accessed. The compressed memory pool grows on demand and shrinks as compressed
pages are freed. The pool is not preallocated. By default, a zpool
of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created,
but it can be overridden at boot time by setting the ``zpool`` attribute,
e.g. ``zswap.zpool=zbud``. It can also be changed at runtime using the sysfs
e.g. ``zswap.zpool=zsmalloc``. It can also be changed at runtime using the sysfs
``zpool`` attribute, e.g.::

echo zbud > /sys/module/zswap/parameters/zpool
echo zsmalloc > /sys/module/zswap/parameters/zpool

The zbud type zpool allocates exactly 1 page to store 2 compressed pages, which
means the compression ratio will always be 2:1 or worse (because of half-full
zbud pages). The zsmalloc type zpool has a more complex compressed page
storage method, and it can achieve greater storage densities.
The zsmalloc type zpool has a complex compressed page storage method, and it
can achieve great storage densities.

When a swap page is passed from swapout to zswap, zswap maintains a mapping
of the swap entry, a combination of the swap type and swap offset, to the zpool
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
- compact_memory
- compaction_proactiveness
- compact_unevictable_allowed
- defrag_mode
- dirty_background_bytes
- dirty_background_ratio
- dirty_bytes

@@ -145,6 +146,14 @@ On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
to compaction, which would block the task from becoming active until the fault
is resolved.

defrag_mode
===========

When set to 1, the page allocator tries harder to avoid fragmentation
and maintain the ability to produce huge pages / higher-order pages.

It is recommended to enable this right after boot, as fragmentation,
once it occurred, can be long-lasting or even permanent.

dirty_background_bytes
======================
@@ -22,8 +22,6 @@ offlining of memory being accessed by the ptdump code.
In order to dump the kernel page tables, enable the following
configurations and mount debugfs::

CONFIG_GENERIC_PTDUMP=y
CONFIG_PTDUMP_CORE=y
CONFIG_PTDUMP_DEBUGFS=y

mount -t debugfs nodev /sys/kernel/debug
@@ -86,7 +86,19 @@ Memory ordering guarantee changes:
* none (both fully unordered)


case 2) - increment-based ops that return no value
case 2) - non-"Read/Modify/Write" (RMW) ops with release ordering
-----------------------------------------------------------------

Function changes:

* atomic_set_release() --> refcount_set_release()

Memory ordering guarantee changes:

* none (both provide RELEASE ordering)


case 3) - increment-based ops that return no value
--------------------------------------------------

Function changes:

@@ -98,7 +110,7 @@ Memory ordering guarantee changes:

* none (both fully unordered)

case 3) - decrement-based RMW ops that return no value
case 4) - decrement-based RMW ops that return no value
------------------------------------------------------

Function changes:

@@ -110,7 +122,7 @@ Memory ordering guarantee changes:
* fully unordered --> RELEASE ordering


case 4) - increment-based RMW ops that return a value
case 5) - increment-based RMW ops that return a value
-----------------------------------------------------

Function changes:

@@ -126,7 +138,20 @@ Memory ordering guarantees changes:
result of obtaining pointer to the object!


case 5) - generic dec/sub decrement-based RMW ops that return a value
case 6) - increment-based RMW ops with acquire ordering that return a value
---------------------------------------------------------------------------

Function changes:

* atomic_inc_not_zero() --> refcount_inc_not_zero_acquire()
* no atomic counterpart --> refcount_add_not_zero_acquire()

Memory ordering guarantees changes:

* fully ordered --> ACQUIRE ordering on success


case 7) - generic dec/sub decrement-based RMW ops that return a value
---------------------------------------------------------------------

Function changes:

@@ -139,7 +164,7 @@ Memory ordering guarantees changes:

* fully ordered --> RELEASE ordering + ACQUIRE ordering on success


case 6) other decrement-based RMW ops that return a value
case 8) other decrement-based RMW ops that return a value
---------------------------------------------------------

Function changes:

@@ -154,7 +179,7 @@ Memory ordering guarantees changes:

.. note:: atomic_add_unless() only provides full order on success.


case 7) - lock-based RMW
case 9) - lock-based RMW
------------------------

Function changes:
@ -489,7 +489,19 @@ Storing ``NULL`` into any index of a multi-index entry will set the
|
||||
entry at every index to ``NULL`` and dissolve the tie. A multi-index
|
||||
entry can be split into entries occupying smaller ranges by calling
|
||||
xas_split_alloc() without the xa_lock held, followed by taking the lock
|
||||
and calling xas_split().
|
||||
and calling xas_split() or calling xas_try_split() with xa_lock. The
|
||||
difference between xas_split_alloc()+xas_split() and xas_try_split() is
|
||||
that xas_split_alloc() + xas_split() split the entry from the original
|
||||
order to the new order in one shot uniformly, whereas xas_try_split()
|
||||
iteratively splits the entry containing the index non-uniformly.
|
||||
For example, to split an order-9 entry, which takes 2^(9-6)=8 slots,
|
||||
assuming ``XA_CHUNK_SHIFT`` is 6, xas_split_alloc() + xas_split() need
|
||||
8 xa_nodes. xas_try_split() splits the order-9 entry into
|
||||
2 order-8 entries, then split one order-8 entry, based on the given index,
|
||||
to 2 order-7 entries, ..., and split one order-1 entry to 2 order-0 entries.
|
||||
When splitting the order-6 entry and a new xa_node is needed, xas_try_split()
|
||||
will try to allocate one if possible. As a result, xas_try_split() would only
|
||||
need 1 xa_node instead of 8.
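A rough sketch of the two approaches (the locking follows the text above;
treat the exact calling convention of xas_try_split() as an assumption, not
a reference)::

     /* one-shot split: preallocate every needed xa_node up front */
     xas_split_alloc(&xas, entry, old_order, GFP_KERNEL);
     xas_lock(&xas);
     xas_split(&xas, entry, old_order);
     xas_unlock(&xas);

     /* iterative split: done with the xa_lock held, allocating at most
      * one extra xa_node when the order-6 entry has to be split */
     xas_lock(&xas);
     xas_try_split(&xas, entry, old_order);
     xas_unlock(&xas);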
|
||||
|
||||
Functions and structures
|
||||
========================
|
||||
|
@ -207,7 +207,6 @@ implement direct_access.
|
||||
|
||||
These block devices may be used for inspiration:
|
||||
- brd: RAM backed block device driver
|
||||
- dcssblk: s390 dcss block device driver
|
||||
- pmem: NVDIMM persistent memory driver
|
||||
|
||||
|
||||
|
@ -502,9 +502,25 @@ process, its PSS will be 1500. "Pss_Dirty" is the portion of PSS which
|
||||
consists of dirty pages. ("Pss_Clean" is not included, but it can be
|
||||
calculated by subtracting "Pss_Dirty" from "Pss".)
|
||||
|
||||
Note that even a page which is part of a MAP_SHARED mapping, but has only
|
||||
a single pte mapped, i.e. is currently used by only one process, is accounted
|
||||
as private and not as shared.
|
||||
Traditionally, a page is accounted as "private" if it is mapped exactly once,
|
||||
and a page is accounted as "shared" when mapped multiple times, even when
|
||||
mapped in the same process multiple times. Note that this accounting is
|
||||
independent of MAP_SHARED.
|
||||
|
||||
In some kernel configurations, the semantics of pages part of a larger
|
||||
allocation (e.g., THP) can differ: a page is accounted as "private" if all
|
||||
pages part of the corresponding large allocation are *certainly* mapped in the
|
||||
same process, even if the page is mapped multiple times in that process. A
|
||||
page is accounted as "shared" if any page page of the larger allocation
|
||||
is *maybe* mapped in a different process. In some cases, a large allocation
|
||||
might be treated as "maybe mapped by multiple processes" even though this
|
||||
is no longer the case.
|
||||
|
||||
Some kernel configurations do not track the precise number of times a page part
|
||||
of a larger allocation is mapped. In this case, when calculating the PSS, the
|
||||
average number of mappings per page in this larger allocation might be used
|
||||
as an approximation for the number of mappings of a page. The PSS calculation
|
||||
will be imprecise in this case.
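As a purely illustrative calculation (numbers invented): for a 2 MiB THP of
512 pages whose pages are mapped 768 times in total, the approximated number
of mappings per page is 768 / 512 = 1.5, so each page would contribute
4 KiB / 1.5, i.e. about 2.7 KiB, to PSS, even if some of its pages are
actually mapped once and others three times.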
|
||||
|
||||
"Referenced" indicates the amount of memory currently marked as referenced or
|
||||
accessed.
|
||||
@ -686,6 +702,11 @@ Where:
|
||||
node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
|
||||
size, in KB, that is backing the mapping up.
|
||||
|
||||
Note that some kernel configurations do not track the precise number of times
|
||||
a page part of a larger allocation (e.g., THP) is mapped. In these
|
||||
configurations, "mapmax" might corresponds to the average number of mappings
|
||||
per page in such a larger allocation instead.
|
||||
|
||||
1.2 Kernel data
|
||||
---------------
|
||||
|
||||
@ -1060,6 +1081,8 @@ Example output. You may not have all of these fields.
|
||||
FilePmdMapped: 0 kB
|
||||
CmaTotal: 0 kB
|
||||
CmaFree: 0 kB
|
||||
Unaccepted: 0 kB
|
||||
Balloon: 0 kB
|
||||
HugePages_Total: 0
|
||||
HugePages_Free: 0
|
||||
HugePages_Rsvd: 0
|
||||
@ -1132,9 +1155,15 @@ Dirty
|
||||
Writeback
|
||||
Memory which is actively being written back to the disk
|
||||
AnonPages
|
||||
Non-file backed pages mapped into userspace page tables
|
||||
Non-file backed pages mapped into userspace page tables. Note that
|
||||
some kernel configurations might consider all pages part of a
|
||||
larger allocation (e.g., THP) as "mapped", as soon as a single
|
||||
page is mapped.
|
||||
Mapped
|
||||
files which have been mmapped, such as libraries
|
||||
files which have been mmapped, such as libraries. Note that some
|
||||
kernel configurations might consider all pages part of a larger
|
||||
allocation (e.g., THP) as "mapped", as soon as a single page is
|
||||
mapped.
|
||||
Shmem
|
||||
Total memory used by shared memory (shmem) and tmpfs
|
||||
KReclaimable
|
||||
@ -1228,6 +1257,10 @@ CmaTotal
|
||||
Memory reserved for the Contiguous Memory Allocator (CMA)
|
||||
CmaFree
|
||||
Free remaining memory in the CMA reserves
|
||||
Unaccepted
|
||||
Memory that has not been accepted by the guest
|
||||
Balloon
|
||||
Memory returned to Host by VM Balloon Drivers
|
||||
HugePages_Total, HugePages_Free, HugePages_Rsvd, HugePages_Surp, Hugepagesize, Hugetlb
|
||||
See Documentation/admin-guide/mm/hugetlbpage.rst.
|
||||
DirectMap4k, DirectMap2M, DirectMap1G
|
||||
|
@ -81,7 +81,7 @@ Page stealing from process memory and shm is done if stealing the page would
|
||||
alleviate memory pressure on any zone in the page's node that has fallen below
|
||||
its watermark.
|
||||
|
||||
watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
|
||||
watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These
|
||||
are per-zone fields, used to determine when a zone needs to be balanced. When
|
||||
the number of pages falls below watermark[WMARK_MIN], the hysteric field
|
||||
low_on_memory gets set. This stays set till the number of free pages becomes
|
||||
|
@ -313,6 +313,10 @@ sufficient for the given purpose, it shouldn't be unnecessarily further
|
||||
lowered. It is recommended to be set proportional to ``aggregation interval``.
|
||||
By default, the ratio is set as ``1/20``, and it is still recommended.
|
||||
|
||||
Based on the manual tuning guide, DAMON provides a more intuitive knob-based
|
||||
intervals auto-tuning mechanism. Please refer to :ref:`the design document of
|
||||
the feature <damon_design_monitoring_intervals_autotuning>` for details.
|
||||
|
||||
Refer to below documents for an example tuning based on the above guide.
|
||||
|
||||
.. toctree::
|
||||
@ -321,6 +325,52 @@ Refer to below documents for an example tuning based on the above guide.
|
||||
monitoring_intervals_tuning_example
|
||||
|
||||
|
||||
.. _damon_design_monitoring_intervals_autotuning:
|
||||
|
||||
Monitoring Intervals Auto-tuning
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
DAMON provides automatic tuning of the ``sampling interval`` and ``aggregation
|
||||
interval`` based on the :ref:`the tuning guide idea
|
||||
<damon_design_monitoring_params_tuning_guide>`. The tuning mechanism allows
|
||||
users to set the aimed amount of access events to observe via DAMON within
|
||||
a given time interval. The target can be specified by the user as a ratio of
|
||||
DAMON-observed access events to the theoretical maximum amount of the events
|
||||
(``access_bp``) that is measured within a given number of aggregations
|
||||
(``aggrs``).
|
||||
|
||||
The DAMON-observed access events are calculated in byte granularity based on
|
||||
DAMON :ref:`region assumption <damon_design_region_based_sampling>`. For
|
||||
example, if a region of size ``X`` bytes of ``Y`` ``nr_accesses`` is found, it
|
||||
means ``X * Y`` access events are observed by DAMON. Theoretical maximum
|
||||
access events for the region are calculated in the same way, but replacing ``Y``
|
||||
with theoretical maximum ``nr_accesses``, which can be calculated as
|
||||
``aggregation interval / sampling interval``.
|
||||
|
||||
The mechanism calculates the ratio of access events for ``aggrs`` aggregations,
|
||||
and increases or decreases the ``sampling interval`` and ``aggregation
|
||||
interval`` in the same ratio, if the observed access ratio is lower or higher than
|
||||
the target, respectively. The ratio of the intervals change is decided in
|
||||
proportion to the distance between current samples ratio and the target ratio.
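As a purely illustrative calculation (numbers invented): with a 100 ms
``sampling interval`` and a 2 s ``aggregation interval``, the theoretical
maximum ``nr_accesses`` is 2,000 ms / 100 ms = 20. If an aggregation found a
64 MiB region with ``nr_accesses`` 2 and a 192 MiB region with ``nr_accesses``
0, the observed events are 64 MiB * 2 = 128 MiB-accesses out of a theoretical
maximum of 256 MiB * 20 = 5,120 MiB-accesses, or 2.5%. With a 4% target, both
intervals would therefore be increased, in proportion to the distance from the
target.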
|
||||
|
||||
The user can further set the minimum and maximum ``sampling interval`` that can
|
||||
be set by the tuning mechanism using two parameters (``min_sample_us`` and
|
||||
``max_sample_us``). Because the tuning mechanism changes ``sampling interval``
|
||||
and ``aggregation interval`` always in the same ratio, the minimum and maximum
|
||||
``aggregation interval`` after each tuning change are automatically set
|
||||
together.
|
||||
|
||||
The tuning is turned off by default, and needs to be set explicitly by the user.
|
||||
As a rule of thumb based on the Pareto principle, a 4% access samples ratio target
|
||||
is recommended. Note that the Pareto principle (80/20 rule) is applied twice.
|
||||
That is, it assumes a 4% (20% of 20%) DAMON-observed access events ratio (source)
|
||||
to capture 64% (80% multiplied by 80%) of real access events (outcomes).
|
||||
|
||||
To know how user-space can use this feature via :ref:`DAMON sysfs interface
|
||||
<sysfs_interface>`, refer to :ref:`intervals_goal <sysfs_scheme>` part of
|
||||
the documentation.
|
||||
|
||||
|
||||
.. _damon_design_damos:
|
||||
|
||||
Operation Schemes
|
||||
@ -569,11 +619,22 @@ number of filters for each scheme. Each filter specifies
|
||||
- whether it is to allow (include) or reject (exclude) applying
|
||||
the scheme's action to the memory (``allow``).
|
||||
|
||||
When multiple filters are installed, each filter is evaluated in the installed
|
||||
order. If a part of memory is matched to one of the filter, next filters are
|
||||
ignored. If the memory passes through the filters evaluation stage because it
|
||||
is not matched to any of the filters, applying the scheme's action to it is
|
||||
allowed, same to the behavior when no filter exists.
|
||||
For efficient handling of filters, some types of filters are handled by the
|
||||
core layer, while others are handled by the operations set. Hence, in the
|
||||
latter case, support of the filter types depends on the DAMON operations set.
|
||||
In case of the core layer-handled filters, the memory regions that are excluded
|
||||
by the filter are not counted as regions the scheme has tried to apply its
|
||||
action to. In contrast, if a memory region is filtered out by an operations set
|
||||
layer-handled filter, it is counted as tried. This difference affects the statistics.
|
||||
|
||||
When multiple filters are installed, the group of filters that are handled by the
|
||||
core layer are evaluated first. After that, the group of filters that are handled
|
||||
by the operations layer are evaluated. Filters in each of the groups are
|
||||
evaluated in the installed order. If a part of memory is matched to one of the
|
||||
filters, next filters are ignored. If the part passes through the filters
|
||||
evaluation stage because it is not matched to any of the filters, applying the
|
||||
scheme's action to it depends on the last filter's allowance type. If the last
|
||||
filter was for allowing, the part of memory will be rejected, and vice versa.
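The rule above can be sketched in C as follows (hypothetical types and
helpers for illustration only, not the in-kernel implementation)::

     #include <stdbool.h>

     struct filter { bool allow; bool (*match)(const void *mem); };

     /* filters must be ordered: core layer-handled first, then the
      * operations set-handled ones, each group in installed order */
     static bool action_permitted(const struct filter *f, int nr, const void *mem)
     {
             bool pass_through = true;       /* no filters: always allow */

             for (int i = 0; i < nr; i++) {
                     if (f[i].match(mem))
                             return f[i].allow;      /* first match decides */
                     pass_through = !f[i].allow;
             }
             return pass_through;    /* opposite of the last filter's type */
     }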
|
||||
|
||||
For example, let's assume 1) a filter for allowing anonymous pages and 2)
|
||||
another filter for rejecting young pages are installed in the order. If a page
|
||||
@ -585,39 +646,29 @@ second reject-filter blocks it. If the page is neither anonymous nor young,
|
||||
the page will pass through the filters evaluation stage since there is no
|
||||
matching filter, and the action will be applied to the page.
|
||||
|
||||
Note that the action can equally be applied to memory that either explicitly
|
||||
filter-allowed or filters evaluation stage passed. It means that installing
|
||||
allow-filters at the end of the list makes no practical change but only
|
||||
filters-checking overhead.
|
||||
|
||||
For efficient handling of filters, some types of filters are handled by the
|
||||
core layer, while others are handled by operations set. In the latter case,
|
||||
hence, support of the filter types depends on the DAMON operations set. In
|
||||
case of the core layer-handled filters, the memory regions that excluded by the
|
||||
filter are not counted as the scheme has tried to the region. In contrast, if
|
||||
a memory regions is filtered by an operations set layer-handled filter, it is
|
||||
counted as the scheme has tried. This difference affects the statistics.
|
||||
|
||||
Below ``type`` of filters are currently supported.
|
||||
|
||||
- anonymous page
|
||||
- Applied to pages that containing data that not stored in files.
|
||||
- Handled by operations set layer. Supported by only ``paddr`` set.
|
||||
- memory cgroup
|
||||
- Applied to pages that belonging to a given cgroup.
|
||||
- Handled by operations set layer. Supported by only ``paddr`` set.
|
||||
- young page
|
||||
- Applied to pages that are accessed after the last access check from the
|
||||
scheme.
|
||||
- Handled by operations set layer. Supported by only ``paddr`` set.
|
||||
- address range
|
||||
- Applied to pages that belonging to a given address range.
|
||||
- Handled by the core logic.
|
||||
- DAMON monitoring target
|
||||
- Applied to pages that belonging to a given DAMON monitoring target.
|
||||
- Handled by the core logic.
|
||||
- Core layer handled
|
||||
- addr
|
||||
- Applied to pages belonging to a given address range.
|
||||
- target
|
||||
- Applied to pages belonging to a given DAMON monitoring target.
|
||||
- Operations layer handled, supported by only ``paddr`` operations set.
|
||||
- anon
|
||||
- Applied to pages containing data that is not stored in files.
|
||||
- active
|
||||
- Applied to active pages.
|
||||
- memcg
|
||||
- Applied to pages belonging to a given cgroup.
|
||||
- young
|
||||
- Applied to pages that are accessed after the last access check from the
|
||||
scheme.
|
||||
- hugepage_size
|
||||
- Applied to pages that are managed in a given size range.
|
||||
- unmapped
|
||||
- Applied to pages that are unmapped.
|
||||
|
||||
To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
|
||||
To know how user-space can set the filters via :ref:`DAMON sysfs interface
|
||||
<sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
|
||||
documentation.
|
||||
|
||||
|
@ -36,7 +36,7 @@ Then, list the DAMON-found regions of different access patterns, sorted by the
|
||||
"access temperature". "Access temperature" is a metric representing the
|
||||
access-hotness of a region. It is calculated as a weighted sum of the access
|
||||
frequency and the age of the region. If the access frequency is 0 %, the
|
||||
temperature is multipled by minus one. That is, if a region is not accessed,
|
||||
temperature is multiplied by minus one. That is, if a region is not accessed,
|
||||
it gets minus temperature and it gets lower as not accessed for longer time.
|
||||
The sorting is in temperature-ascending order, so the region at the top of the
|
||||
list is the coldest, and the one at the bottom is the hottest one. ::
|
||||
@ -58,11 +58,11 @@ list is the coldest, and the one at the bottom is the hottest one. ::
|
||||
The list shows not seemingly hot regions, and only minimum access pattern
|
||||
diversity. Every region has zero access frequency. The number of regions is
|
||||
10, which is the default ``min_nr_regions value``. Size of each region is also
|
||||
nearly idential. We can suspect this is because “adaptive regions adjustment”
|
||||
nearly identical. We can suspect this is because “adaptive regions adjustment”
|
||||
mechanism was not well working. As the guide suggested, we can get relative
|
||||
hotness of regions using ``age`` as the recency information. That would be
|
||||
better than nothing, but given the fact that the longest age is only about 6
|
||||
seconds while we waited about ten minuts, it is unclear how useful this will
|
||||
seconds while we waited about ten minutes, it is unclear how useful this will
|
||||
be.
|
||||
|
||||
The temperature ranges to total size of regions of each range histogram
|
||||
@ -190,7 +190,7 @@ for sampling and aggregation intervals, respectively). ::
|
||||
The number of regions having different access patterns has significantly
|
||||
increased. Size of each region is also more varied. Total size of non-zero
|
||||
access frequency regions is also significantly increased. Maybe this is already
|
||||
good enough to make some meaningful memory management efficieny changes.
|
||||
good enough to make some meaningful memory management efficiency changes.
|
||||
|
||||
800ms/16s intervals: Another bias
|
||||
=================================
|
||||
|
@ -400,7 +400,7 @@ Exclusive access memory
|
||||
Some devices have features such as atomic PTE bits that can be used to implement
|
||||
atomic access to system memory. To support atomic operations to a shared virtual
|
||||
memory page such a device needs access to that page which is exclusive of any
|
||||
userspace access from the CPU. The ``make_device_exclusive_range()`` function
|
||||
userspace access from the CPU. The ``make_device_exclusive()`` function
|
||||
can be used to make a memory range inaccessible from userspace.
|
||||
|
||||
This replaces all mappings for pages in the given range with special swap
|
||||
|
@ -62,5 +62,4 @@ documentation, or deleted if it has served its purpose.
|
||||
unevictable-lru
|
||||
vmalloced-kernel-stacks
|
||||
vmemmap_dedup
|
||||
z3fold
|
||||
zsmalloc
|
||||
|
@ -338,10 +338,272 @@ Statistics
|
||||
|
||||
Zones
|
||||
=====
|
||||
As we have mentioned, each zone in memory is described by a ``struct zone``
|
||||
which is an element of the ``node_zones`` array of the node it belongs to.
|
||||
``struct zone`` is the core data structure of the page allocator. A zone
|
||||
represents a range of physical memory and may have holes.
|
||||
|
||||
.. admonition:: Stub
|
||||
The page allocator uses the GFP flags, see :ref:`mm-api-gfp-flags`, specified by
|
||||
a memory allocation to determine the highest zone in a node from which the
|
||||
memory allocation can allocate memory. The page allocator first allocates memory
|
||||
from that zone. If the page allocator can't allocate the requested amount of
|
||||
memory from that zone, it allocates memory from the next lower zone in the
|
||||
node; the process continues down to and including the lowest zone. For example, if
|
||||
a node contains ``ZONE_DMA32``, ``ZONE_NORMAL`` and ``ZONE_MOVABLE`` and the
|
||||
highest zone of a memory allocation is ``ZONE_MOVABLE``, the order of the zones
|
||||
from which the page allocator allocates memory is ``ZONE_MOVABLE`` >
|
||||
``ZONE_NORMAL`` > ``ZONE_DMA32``.
|
||||
|
||||
This section is incomplete. Please list and describe the appropriate fields.
|
||||
At runtime, free pages in a zone are in the Per-CPU Pagesets (PCP) or free areas
|
||||
of the zone. The Per-CPU Pagesets are a vital mechanism in the kernel's memory
|
||||
management system. By handling most frequent allocations and frees locally on
|
||||
each CPU, the Per-CPU Pagesets improve performance and scalability, especially
|
||||
on systems with many cores. The page allocator in the kernel employs a two-step
|
||||
strategy for memory allocation, starting with the Per-CPU Pagesets before
|
||||
falling back to the buddy allocator. Pages are transferred between the Per-CPU
|
||||
Pagesets and the global free areas (managed by the buddy allocator) in batches.
|
||||
This minimizes the overhead of frequent interactions with the global buddy
|
||||
allocator.
|
||||
|
||||
Architecture-specific code calls free_area_init() to initialize zones.
|
||||
|
||||
Zone structure
|
||||
--------------
|
||||
The zone structure ``struct zone`` is defined in ``include/linux/mmzone.h``.
|
||||
Here we briefly describe fields of this structure:
|
||||
|
||||
General
|
||||
~~~~~~~
|
||||
|
||||
``_watermark``
|
||||
The watermarks for this zone. When the amount of free pages in a zone is below
|
||||
the min watermark, boosting is ignored, and an allocation may trigger direct
|
||||
reclaim and direct compaction; it is also used to throttle direct reclaim.
|
||||
When the amount of free pages in a zone is below the low watermark, kswapd is
|
||||
woken up. When the amount of free pages in a zone is above the high watermark,
|
||||
kswapd stops reclaiming (a zone is balanced) when the
|
||||
``NUMA_BALANCING_MEMORY_TIERING`` bit of ``sysctl_numa_balancing_mode`` is not
|
||||
set. The promo watermark is used for memory tiering and NUMA balancing. When
|
||||
the amount of free pages in a zone is above the promo watermark, kswapd stops
|
||||
reclaiming when the ``NUMA_BALANCING_MEMORY_TIERING`` bit of
|
||||
``sysctl_numa_balancing_mode`` is set. The watermarks are set by
|
||||
``__setup_per_zone_wmarks()``. The min watermark is calculated according to
|
||||
``vm.min_free_kbytes`` sysctl. The other three watermarks are set according
|
||||
to the distance between two watermarks. The distance itself is calculated
|
||||
taking ``vm.watermark_scale_factor`` sysctl into account.
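A simplified sketch of how these thresholds are typically consulted (using
the kernel's ``*_wmark_pages()`` accessors; the real checks in
``mm/page_alloc.c`` and ``mm/vmscan.c`` carry considerably more state)::

     unsigned long free = zone_page_state(zone, NR_FREE_PAGES);

     if (free < min_wmark_pages(zone)) {
             /* allocations may enter direct reclaim and direct compaction */
     } else if (free < low_wmark_pages(zone)) {
             wakeup_kswapd(zone, gfp_mask, order, zone_idx(zone));
     } else if (free > high_wmark_pages(zone)) {
             /* kswapd treats the zone as balanced and stops reclaiming */
     }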
|
||||
|
||||
``watermark_boost``
|
||||
The number of pages which are used to boost watermarks to increase reclaim
|
||||
pressure to reduce the likelihood of future fallbacks and wake kswapd now
|
||||
as the node may be balanced overall and kswapd will not wake naturally.
|
||||
|
||||
``nr_reserved_highatomic``
|
||||
The number of pages which are reserved for high-order atomic allocations.
|
||||
|
||||
``nr_free_highatomic``
|
||||
The number of free pages in reserved highatomic pageblocks
|
||||
|
||||
``lowmem_reserve``
|
||||
The array of the amounts of the memory reserved in this zone for memory
|
||||
allocations. For example, if the highest zone a memory allocation can
|
||||
allocate memory from is ``ZONE_MOVABLE``, the amount of memory reserved in
|
||||
this zone for this allocation is ``lowmem_reserve[ZONE_MOVABLE]`` when
|
||||
attempting to allocate memory from this zone. This is a mechanism the page
|
||||
allocator uses to prevent allocations which could use ``highmem`` from using
|
||||
too much ``lowmem``. For some specialised workloads on ``highmem`` machines,
|
||||
it is dangerous for the kernel to allow process memory to be allocated from
|
||||
the ``lowmem`` zone. This is because that memory could then be pinned via the
|
||||
``mlock()`` system call, or by unavailability of swapspace.
|
||||
``vm.lowmem_reserve_ratio`` sysctl determines how aggressive the kernel is in
|
||||
defending these lower zones. This array is recalculated by
|
||||
``setup_per_zone_lowmem_reserve()`` at runtime if ``vm.lowmem_reserve_ratio``
|
||||
sysctl changes.
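In effect, when checking whether this zone can serve an allocation whose
highest usable zone is ``ZONE_MOVABLE``, the watermark test adds the reserve
as extra headroom, roughly (simplified from ``__zone_watermark_ok()``)::

     /* highest_zoneidx is ZONE_MOVABLE for this allocation */
     if (free_pages <= watermark + zone->lowmem_reserve[highest_zoneidx])
             return false;   /* zone is protected; fall back or reclaim */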
|
||||
|
||||
``node``
|
||||
The index of the node this zone belongs to. Available only when
|
||||
``CONFIG_NUMA`` is enabled because there is only one zone in a UMA system.
|
||||
|
||||
``zone_pgdat``
|
||||
Pointer to the ``struct pglist_data`` of the node this zone belongs to.
|
||||
|
||||
``per_cpu_pageset``
|
||||
Pointer to the Per-CPU Pagesets (PCP) allocated and initialized by
|
||||
``setup_zone_pageset()``. By handling most frequent allocations and frees
|
||||
locally on each CPU, PCP improves performance and scalability on systems with
|
||||
many cores.
|
||||
|
||||
``pageset_high_min``
|
||||
Copied to the ``high_min`` of the Per-CPU Pagesets for faster access.
|
||||
|
||||
``pageset_high_max``
|
||||
Copied to the ``high_max`` of the Per-CPU Pagesets for faster access.
|
||||
|
||||
``pageset_batch``
|
||||
Copied to the ``batch`` of the Per-CPU Pagesets for faster access. The
|
||||
``batch``, ``high_min`` and ``high_max`` of the Per-CPU Pagesets are used to
|
||||
calculate the number of elements the Per-CPU Pagesets obtain from the buddy
|
||||
allocator under a single hold of the lock for efficiency. They are also used
|
||||
to decide if the Per-CPU Pagesets return pages to the buddy allocator in page
|
||||
free process.
|
||||
|
||||
``pageblock_flags``
|
||||
The pointer to the flags for the pageblocks in the zone (see
|
||||
``include/linux/pageblock-flags.h`` for flags list). The memory is allocated
|
||||
in ``setup_usemap()``. Each pageblock occupies ``NR_PAGEBLOCK_BITS`` bits.
|
||||
Defined only when ``CONFIG_FLATMEM`` is enabled. The flags are stored in
|
||||
``mem_section`` when ``CONFIG_SPARSEMEM`` is enabled.
|
||||
|
||||
``zone_start_pfn``
|
||||
The start pfn of the zone. It is initialized by
|
||||
``calculate_node_totalpages()``.
|
||||
|
||||
``managed_pages``
|
||||
The present pages managed by the buddy system, which is calculated as:
|
||||
``managed_pages`` = ``present_pages`` - ``reserved_pages``, where ``reserved_pages``
|
||||
includes pages allocated by the memblock allocator. It should be used by the page
|
||||
allocator and VM scanner to calculate all kinds of watermarks and thresholds.
|
||||
It is accessed using ``atomic_long_xxx()`` functions. It is initialized in
|
||||
``free_area_init_core()`` and then is reinitialized when memblock allocator
|
||||
frees pages into buddy system.
|
||||
|
||||
``spanned_pages``
|
||||
The total pages spanned by the zone, including holes, which is calculated as:
|
||||
``spanned_pages`` = ``zone_end_pfn`` - ``zone_start_pfn``. It is initialized
|
||||
by ``calculate_node_totalpages()``.
|
||||
|
||||
``present_pages``
|
||||
The physical pages existing within the zone, which is calculated as:
|
||||
``present_pages`` = ``spanned_pages`` - ``absent_pages`` (pages in holes). It
|
||||
may be used by memory hotplug or memory power management logic to figure out
|
||||
unmanaged pages by checking (``present_pages`` - ``managed_pages``). Write
|
||||
access to ``present_pages`` at runtime should be protected by
|
||||
``mem_hotplug_begin/done()``. Any reader who can't tolerate drift of
|
||||
``present_pages`` should use ``get_online_mems()`` to get a stable value. It
|
||||
is initialized by ``calculate_node_totalpages()``.
|
||||
|
||||
``present_early_pages``
|
||||
The present pages existing within the zone located on memory available since
|
||||
early boot, excluding hotplugged memory. Defined only when
|
||||
``CONFIG_MEMORY_HOTPLUG`` is enabled and initialized by
|
||||
``calculate_node_totalpages()``.
|
||||
|
||||
``cma_pages``
|
||||
The pages reserved for CMA use. These pages behave like ``ZONE_MOVABLE`` when
|
||||
they are not used for CMA. Defined only when ``CONFIG_CMA`` is enabled.
|
||||
|
||||
``name``
|
||||
The name of the zone. It is a pointer to the corresponding element of
|
||||
the ``zone_names`` array.
|
||||
|
||||
``nr_isolate_pageblock``
|
||||
Number of isolated pageblocks. It is used to solve an incorrect freepage counting
|
||||
problem caused by racy retrieval of the pageblock's migratetype. Protected by
|
||||
``zone->lock``. Defined only when ``CONFIG_MEMORY_ISOLATION`` is enabled.
|
||||
|
||||
``span_seqlock``
|
||||
The seqlock to protect ``zone_start_pfn`` and ``spanned_pages``. It is a
|
||||
seqlock because it has to be read outside of ``zone->lock``, and it is done in
|
||||
the main allocator path. However, the seqlock is written quite infrequently.
|
||||
Defined only when ``CONFIG_MEMORY_HOTPLUG`` is enabled.
|
||||
|
||||
``initialized``
|
||||
The flag indicating if the zone is initialized. Set by
|
||||
``init_currently_empty_zone()`` during boot.
|
||||
|
||||
``free_area``
|
||||
The array of free areas, where each element corresponds to a specific order
|
||||
which is a power of two. The buddy allocator uses this structure to manage
|
||||
free memory efficiently. When allocating, it tries to find the smallest
|
||||
sufficient block, if the smallest sufficient block is larger than the
|
||||
requested size, it will be recursively split into the next smaller blocks
|
||||
until the required size is reached. When a page is freed, it may be merged
|
||||
with its buddy to form a larger block. It is initialized by
|
||||
``zone_init_free_lists()``.
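For example, assuming a 4 KiB base page: if the smallest available block for
an order-2 (16 KiB) request is an order-5 (128 KiB) block, the allocator
splits it step by step::

     order-5 block -> two order-4 buddies
     one order-4   -> two order-3 buddies
     one order-3   -> two order-2 buddies

One order-2 block satisfies the request; the remaining order-4, order-3 and
order-2 buddies go back onto the corresponding ``free_area`` lists.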
|
||||
|
||||
``unaccepted_pages``
|
||||
The list of pages to be accepted. All pages on the list are of order ``MAX_PAGE_ORDER``.
|
||||
Defined only when ``CONFIG_UNACCEPTED_MEMORY`` is enabled.
|
||||
|
||||
``flags``
|
||||
The zone flags. The least significant three bits are used and defined by
|
||||
``enum zone_flags``. ``ZONE_BOOSTED_WATERMARK`` (bit 0): zone recently boosted
|
||||
watermarks. Cleared when kswapd is woken. ``ZONE_RECLAIM_ACTIVE`` (bit 1):
|
||||
kswapd may be scanning the zone. ``ZONE_BELOW_HIGH`` (bit 2): zone is below
|
||||
high watermark.
|
||||
|
||||
``lock``
|
||||
The main lock that protects the internal data structures of the page allocator
|
||||
specific to the zone, especially protects ``free_area``.
|
||||
|
||||
``percpu_drift_mark``
|
||||
When free pages are below this point, additional steps are taken when reading
|
||||
the number of free pages to avoid per-cpu counter drift allowing watermarks
|
||||
to be breached. It is updated in ``refresh_zone_stat_thresholds()``.
|
||||
|
||||
Compaction control
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
``compact_cached_free_pfn``
|
||||
The PFN where compaction free scanner should start in the next scan.
|
||||
|
||||
``compact_cached_migrate_pfn``
|
||||
The PFNs where compaction migration scanner should start in the next scan.
|
||||
This array has two elements: the first one is used in ``MIGRATE_ASYNC`` mode,
|
||||
and the other one is used in ``MIGRATE_SYNC`` mode.
|
||||
|
||||
``compact_init_migrate_pfn``
|
||||
The initial migration PFN which is initialized to 0 at boot time, and to the
|
||||
first pageblock with migratable pages in the zone after a full compaction
|
||||
finishes. It is used to check if a scan is a whole zone scan or not.
|
||||
|
||||
``compact_init_free_pfn``
|
||||
The initial free PFN which is initialized to 0 at boot time and to the last
|
||||
pageblock with free ``MIGRATE_MOVABLE`` pages in the zone. It is used to check
|
||||
if it is the start of a scan.
|
||||
|
||||
``compact_considered``
|
||||
The number of compactions attempted since last failure. It is reset in
|
||||
``defer_compaction()`` when a compaction fails to result in a page allocation
|
||||
success. It is increased by 1 in ``compaction_deferred()`` when a compaction
|
||||
should be skipped. ``compaction_deferred()`` is called before
|
||||
``compact_zone()`` is called, ``compaction_defer_reset()`` is called when
|
||||
``compact_zone()`` returns ``COMPACT_SUCCESS``, ``defer_compaction()`` is
|
||||
called when ``compact_zone()`` returns ``COMPACT_PARTIAL_SKIPPED`` or
|
||||
``COMPACT_COMPLETE``.
|
||||
|
||||
``compact_defer_shift``
|
||||
The number of compactions skipped before trying again is
|
||||
``1<<compact_defer_shift``. It is increased by 1 in ``defer_compaction()``.
|
||||
It is reset in ``compaction_defer_reset()`` when a direct compaction results
|
||||
in a page allocation success. Its maximum value is ``COMPACT_MAX_DEFER_SHIFT``.
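Put together, the deferral test looks roughly like this (simplified from
``compaction_deferred()`` in ``mm/compaction.c``)::

     if (order < zone->compact_order_failed)
             return false;           /* low orders are never deferred */

     if (++zone->compact_considered < (1UL << zone->compact_defer_shift))
             return true;            /* skip this compaction attempt */

     return false;                   /* enough attempts considered; try again */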
|
||||
|
||||
``compact_order_failed``
|
||||
The minimum compaction failed order. It is set in ``compaction_defer_reset()``
|
||||
when a compaction succeeds and in ``defer_compaction()`` when a compaction
|
||||
fails to result in a page allocation success.
|
||||
|
||||
``compact_blockskip_flush``
|
||||
Set to true when compaction migration scanner and free scanner meet, which
|
||||
means the ``PB_migrate_skip`` bits should be cleared.
|
||||
|
||||
``contiguous``
|
||||
Set to true when the zone is contiguous (in other words, no hole).
|
||||
|
||||
Statistics
|
||||
~~~~~~~~~~
|
||||
|
||||
``vm_stat``
|
||||
VM statistics for the zone. The items tracked are defined by
|
||||
``enum zone_stat_item``.
|
||||
|
||||
``vm_numa_event``
|
||||
VM NUMA event statistics for the zone. The items tracked are defined by
|
||||
``enum numa_stat_item``.
|
||||
|
||||
``per_cpu_zonestats``
|
||||
Per-CPU VM statistics for the zone. It records VM statistics and VM NUMA event
|
||||
statistics on a per-CPU basis. It reduces updates to the global ``vm_stat``
|
||||
and ``vm_numa_event`` fields of the zone to improve performance.
|
||||
|
||||
.. _pages:
|
||||
|
||||
|
@ -716,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU
|
||||
critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`,
|
||||
before releasing the RCU lock via :c:func:`!rcu_read_unlock`.
|
||||
|
||||
VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for
|
||||
their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it
|
||||
via :c:func:`!vma_end_read`.
|
||||
In cases when the user already holds mmap read lock, :c:func:`!vma_start_read_locked`
|
||||
and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not
|
||||
fail due to lock contention but the caller should still check their return values
|
||||
in case they fail for other reasons.
|
||||
|
||||
VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their
|
||||
duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via
|
||||
:c:func:`!vma_end_read`.
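A typical caller of this API looks roughly as follows (error handling
trimmed; the fallback path is only sketched)::

     struct vm_area_struct *vma;

     vma = lock_vma_under_rcu(mm, address);  /* holds the VMA read lock on success */
     if (!vma) {
             /* fall back to mmap_read_lock(mm) and the usual VMA lookup */
     } else {
             /* ... handle the fault under the VMA read lock ... */
             vma_end_read(vma);
     }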
|
||||
|
||||
VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a
|
||||
VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always
|
||||
@ -726,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write
|
||||
lock, releasing or downgrading the mmap write lock also releases the VMA write
|
||||
lock so there is no :c:func:`!vma_end_write` function.
|
||||
|
||||
Note that a semaphore write lock is not held across a VMA lock. Rather, a
|
||||
sequence number is used for serialisation, and the write semaphore is only
|
||||
acquired at the point of write lock to update this.
|
||||
Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily
|
||||
modified so that readers can detect the presence of a writer. The reference counter is
|
||||
restored once the vma sequence number used for serialisation is updated.
|
||||
|
||||
This ensures the semantics we require - VMA write locks provide exclusive write
|
||||
access to the VMA.
|
||||
@ -738,7 +743,7 @@ Implementation details
|
||||
|
||||
The VMA lock mechanism is designed to be a lightweight means of avoiding the use
|
||||
of the heavily contended mmap lock. It is implemented using a combination of a
|
||||
read/write semaphore and sequence numbers belonging to the containing
|
||||
reference counter and sequence numbers belonging to the containing
|
||||
:c:struct:`!struct mm_struct` and the VMA.
|
||||
|
||||
Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic
|
||||
@ -779,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to
|
||||
keep VMAs locked across entirely separate write operations. It also maintains
|
||||
correct lock ordering.
|
||||
|
||||
Each time a VMA read lock is acquired, we acquire a read lock on the
|
||||
:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that
|
||||
the sequence count of the VMA does not match that of the mm.
|
||||
Each time a VMA read lock is acquired, we increment :c:member:`!vma.vm_refcnt`
|
||||
reference counter and check that the sequence count of the VMA does not match
|
||||
that of the mm.
|
||||
|
||||
If it does, the read lock fails. If it does not, we hold the lock, excluding
|
||||
writers, but permitting other readers, who will also obtain this lock under RCU.
|
||||
If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped.
|
||||
If it does not, we keep the reference counter raised, excluding writers, but
|
||||
permitting other readers, who can also obtain this lock under RCU.
|
||||
|
||||
Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu`
|
||||
are also RCU safe, so the whole read lock operation is guaranteed to function
|
||||
correctly.
|
||||
|
||||
On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock`
|
||||
read/write semaphore, before setting the VMA's sequence number under this lock,
|
||||
also simultaneously holding the mmap write lock.
|
||||
On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be
|
||||
modified by readers and wait for all readers to drop their reference count.
|
||||
Once there are no readers, the VMA's sequence number is set to match that of
|
||||
the mm. During this entire operation, the mmap write lock is held.
|
||||
|
||||
This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep
|
||||
until these are finished and mutual exclusion is achieved.
|
||||
|
||||
After setting the VMA's sequence number, the lock is released, avoiding
|
||||
complexity with a long-term held write lock.
|
||||
After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt`
|
||||
indicating a writer is cleared. From this point on, VMA's sequence number will
|
||||
indicate VMA's write-locked state until mmap write lock is dropped or downgraded.
|
||||
|
||||
This clever combination of a read/write semaphore and sequence count allows for
|
||||
This clever combination of a reference counter and sequence count allows for
|
||||
fast RCU-based per-VMA lock acquisition (especially on page fault, though
|
||||
utilised elsewhere) with minimal complexity around lock ordering.
|
||||
|
||||
|
@ -116,14 +116,27 @@ pages:
|
||||
succeeds on tail pages.
|
||||
|
||||
- map/unmap of a PMD entry for the whole THP increment/decrement
|
||||
folio->_entire_mapcount, increment/decrement folio->_large_mapcount
|
||||
and also increment/decrement folio->_nr_pages_mapped by ENTIRELY_MAPPED
|
||||
when _entire_mapcount goes from -1 to 0 or 0 to -1.
|
||||
folio->_entire_mapcount and folio->_large_mapcount.
|
||||
|
||||
We also maintain the two slots for tracking MM owners (MM ID and
|
||||
corresponding mapcount), and the current status ("maybe mapped shared" vs.
|
||||
"mapped exclusively").
|
||||
|
||||
With CONFIG_PAGE_MAPCOUNT, we also increment/decrement
|
||||
folio->_nr_pages_mapped by ENTIRELY_MAPPED when _entire_mapcount goes
|
||||
from -1 to 0 or 0 to -1.
|
||||
|
||||
- map/unmap of individual pages with PTE entry increment/decrement
|
||||
page->_mapcount, increment/decrement folio->_large_mapcount and also
|
||||
increment/decrement folio->_nr_pages_mapped when page->_mapcount goes
|
||||
from -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE.
|
||||
folio->_large_mapcount.
|
||||
|
||||
We also maintain the two slots for tracking MM owners (MM ID and
|
||||
corresponding mapcount), and the current status ("maybe mapped shared" vs.
|
||||
"mapped exclusively").
|
||||
|
||||
With CONFIG_PAGE_MAPCOUNT, we also increment/decrement
|
||||
page->_mapcount and increment/decrement folio->_nr_pages_mapped when
|
||||
page->_mapcount goes from -1 to 0 or 0 to -1 as this counts the number
|
||||
of pages mapped by PTE.
|
||||
|
||||
split_huge_page internally has to distribute the refcounts in the head
|
||||
page to the tail pages before clearing all PG_head/tail bits from the page
|
||||
@ -151,8 +164,8 @@ clear where references should go after split: it will stay on the head page.
|
||||
Note that split_huge_pmd() doesn't have any limitations on refcounting:
|
||||
pmd can be split at any point and never fails.
|
||||
|
||||
Partial unmap and deferred_split_folio()
|
||||
========================================
|
||||
Partial unmap and deferred_split_folio() (anon THP only)
|
||||
========================================================
|
||||
|
||||
Unmapping part of THP (with munmap() or other way) is not going to free
|
||||
memory immediately. Instead, we detect that a subpage of THP is not in use
|
||||
@ -167,3 +180,13 @@ a THP crosses a VMA boundary.
|
||||
The function deferred_split_folio() is used to queue a folio for splitting.
|
||||
The splitting itself will happen when we get memory pressure via shrinker
|
||||
interface.
|
||||
|
||||
With CONFIG_PAGE_MAPCOUNT, we reliably detect partial mappings based on
|
||||
folio->_nr_pages_mapped.
|
||||
|
||||
With CONFIG_NO_PAGE_MAPCOUNT, we detect partial mappings based on the
|
||||
average per-page mapcount in a THP: if the average is < 1, an anon THP is
|
||||
certainly partially mapped. As long as only a single process maps a THP,
|
||||
this detection is reliable. With long-running child processes, there can
|
||||
be scenarios where partial mappings can currently not be detected, and
|
||||
might need asynchronous detection during memory reclaim in the future.
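For example (illustrative numbers): a 512-page anon THP fully PTE-mapped by a
single process has ``_large_mapcount`` 512 and an average per-page mapcount of
exactly 1; once 256 of its pages are unmapped, ``_large_mapcount`` drops to 256
and the average becomes 0.5 < 1, so the folio is certainly partially mapped.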
|
||||
|
@ -1,28 +0,0 @@
|
||||
======
|
||||
z3fold
|
||||
======
|
||||
|
||||
z3fold is a special purpose allocator for storing compressed pages.
|
||||
It is designed to store up to three compressed pages per physical page.
|
||||
It is a zbud derivative which allows for higher compression
|
||||
ratio keeping the simplicity and determinism of its predecessor.
|
||||
|
||||
The main differences between z3fold and zbud are:
|
||||
|
||||
* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
|
||||
* z3fold can hold up to 3 compressed pages in its page
|
||||
* z3fold doesn't export any API itself and is thus intended to be used
|
||||
via the zpool API.
|
||||
|
||||
To keep the determinism and simplicity, z3fold, just like zbud, always
|
||||
stores an integral number of compressed pages per page, but it can store
|
||||
up to 3 pages unlike zbud which can store at most 2. Therefore the
|
||||
compression ratio goes to around 2.7x while zbud's one is around 1.7x.
|
||||
|
||||
Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
|
||||
return a dereferenceable pointer. Instead, it returns an unsigned long
|
||||
handle which encodes actual location of the allocated object.
|
||||
|
||||
Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
|
||||
depend on MMU enabled and provides more predictable reclaim behavior
|
||||
which makes it a better fit for small and response-critical systems.
|
@ -27,9 +27,8 @@ Instead, it returns an opaque handle (unsigned long) which encodes actual
|
||||
location of the allocated object. The reason for this indirection is that
|
||||
zsmalloc does not keep zspages permanently mapped since that would cause
|
||||
issues on 32-bit systems where the VA region for kernel space mappings
|
||||
is very small. So, before using the allocating memory, the object has to
|
||||
be mapped using zs_map_object() to get a usable pointer and subsequently
|
||||
unmapped using zs_unmap_object().
|
||||
is very small. So, using the allocated memory should be done through the
|
||||
proper handle-based APIs.
|
||||
|
||||
stat
|
||||
====
|
||||
|
@ -326,7 +326,7 @@ devm_memunmap_pages() 和 devm_release_mem_region() 当资源可以绑定到 ``s
|
||||
|
||||
一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一
|
||||
个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU
|
||||
的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一
|
||||
的任何用户空间访问。 ``make_device_exclusive()`` 函数可以用来使一
|
||||
个内存范围不能从用户空间访问。
|
||||
|
||||
这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
|
||||
|
@ -58,7 +58,6 @@ Linux内存管理文档
|
||||
remap_file_pages
|
||||
split_page_table_lock
|
||||
vmalloced-kernel-stacks
|
||||
z3fold
|
||||
zsmalloc
|
||||
|
||||
TODOLIST:
|
||||
|
@ -1,31 +0,0 @@
|
||||
:Original: Documentation/mm/z3fold.rst
|
||||
|
||||
:翻译:
|
||||
|
||||
司延腾 Yanteng Si <siyanteng@loongson.cn>
|
||||
|
||||
:校译:
|
||||
|
||||
|
||||
======
|
||||
z3fold
|
||||
======
|
||||
|
||||
z3fold是一个专门用于存储压缩页的分配器。它被设计为每个物理页最多可以存储三个压缩页。
|
||||
它是zbud的衍生物,允许更高的压缩率,保持其前辈的简单性和确定性。
|
||||
|
||||
z3fold和zbud的主要区别是:
|
||||
|
||||
* 与zbud不同的是,z3fold允许最大的PAGE_SIZE分配。
|
||||
* z3fold在其页面中最多可以容纳3个压缩页面
|
||||
* z3fold本身没有输出任何API,因此打算通过zpool的API来使用
|
||||
|
||||
为了保持确定性和简单性,z3fold,就像zbud一样,总是在每页存储一个整数的压缩页,但是
|
||||
它最多可以存储3页,不像zbud最多可以存储2页。因此压缩率达到2.7倍左右,而zbud的压缩
|
||||
率是1.7倍左右。
|
||||
|
||||
不像zbud(但也像zsmalloc),z3fold_alloc()那样不返回一个可重复引用的指针。相反,它
|
||||
返回一个无符号长句柄,它编码了被分配对象的实际位置。
|
||||
|
||||
保持有效的压缩率接近于zsmalloc,z3fold不依赖于MMU的启用,并提供更可预测的回收行
|
||||
为,这使得它更适合于小型和反应迅速的系统。
|
MAINTAINERS
@ -10933,6 +10933,8 @@ F: fs/hugetlbfs/
|
||||
F: include/linux/hugetlb.h
|
||||
F: include/trace/events/hugetlbfs.h
|
||||
F: mm/hugetlb.c
|
||||
F: mm/hugetlb_cma.c
|
||||
F: mm/hugetlb_cma.h
|
||||
F: mm/hugetlb_vmemmap.c
|
||||
F: mm/hugetlb_vmemmap.h
|
||||
F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c
|
||||
@ -14177,8 +14179,8 @@ F: include/linux/maple_tree.h
|
||||
F: include/trace/events/maple_tree.h
|
||||
F: lib/maple_tree.c
|
||||
F: lib/test_maple_tree.c
|
||||
F: tools/testing/radix-tree/linux/maple_tree.h
|
||||
F: tools/testing/radix-tree/maple.c
|
||||
F: tools/testing/shared/linux/maple_tree.h
|
||||
|
||||
MARDUK (CREATOR CI40) DEVICE TREE SUPPORT
|
||||
M: Rahul Bedarkar <rahulbedarkar89@gmail.com>
|
||||
@ -24292,6 +24294,7 @@ F: drivers/hwmon/tmp513.c
|
||||
|
||||
TMPFS (SHMEM FILESYSTEM)
|
||||
M: Hugh Dickins <hughd@google.com>
|
||||
R: Baolin Wang <baolin.wang@linux.alibaba.com>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: include/linux/shmem_fs.h
|
||||
@ -25650,7 +25653,6 @@ F: tools/testing/vsock/
|
||||
VMALLOC
|
||||
M: Andrew Morton <akpm@linux-foundation.org>
|
||||
R: Uladzislau Rezki <urezki@gmail.com>
|
||||
R: Christoph Hellwig <hch@infradead.org>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
W: http://www.linux-mm.org
|
||||
@ -26539,13 +26541,6 @@ S: Maintained
|
||||
F: Documentation/input/devices/yealink.rst
|
||||
F: drivers/input/misc/yealink.*
|
||||
|
||||
Z3FOLD COMPRESSED PAGE ALLOCATOR
|
||||
M: Vitaly Wool <vitaly.wool@konsulko.com>
|
||||
R: Miaohe Lin <linmiaohe@huawei.com>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: mm/z3fold.c
|
||||
|
||||
Z8530 DRIVER FOR AX.25
|
||||
M: Joerg Reuter <jreuter@yaina.de>
|
||||
L: linux-hams@vger.kernel.org
|
||||
@ -26556,13 +26551,6 @@ F: Documentation/networking/device_drivers/hamradio/z8530drv.rst
|
||||
F: drivers/net/hamradio/*scc.c
|
||||
F: drivers/net/hamradio/z8530.h
|
||||
|
||||
ZBUD COMPRESSED PAGE ALLOCATOR
|
||||
M: Seth Jennings <sjenning@redhat.com>
|
||||
M: Dan Streetman <ddstreet@ieee.org>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: mm/zbud.c
|
||||
|
||||
ZD1211RW WIRELESS DRIVER
|
||||
L: linux-wireless@vger.kernel.org
|
||||
S: Orphan
|
||||
|
@ -273,14 +273,6 @@ srm_paging_stop (void)
|
||||
}
|
||||
#endif
|
||||
|
||||
void __init
|
||||
mem_init(void)
|
||||
{
|
||||
set_max_mapnr(max_low_pfn);
|
||||
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
|
||||
memblock_free_all();
|
||||
}
|
||||
|
||||
static const pgprot_t protection_map[16] = {
|
||||
[VM_NONE] = _PAGE_P(_PAGE_FOE | _PAGE_FOW |
|
||||
_PAGE_FOR),
|
||||
|
@ -150,41 +150,18 @@ void __init setup_arch_memory(void)
|
||||
*/
|
||||
max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn;
|
||||
|
||||
high_memory = (void *)(min_high_pfn << PAGE_SHIFT);
|
||||
|
||||
arch_pfn_offset = min(min_low_pfn, min_high_pfn);
|
||||
kmap_init();
|
||||
|
||||
#else /* CONFIG_HIGHMEM */
|
||||
/* pfn_valid() uses this when FLATMEM=y and HIGHMEM=n */
|
||||
max_mapnr = max_low_pfn - min_low_pfn;
|
||||
|
||||
#endif /* CONFIG_HIGHMEM */
|
||||
|
||||
free_area_init(max_zone_pfn);
|
||||
}
|
||||
|
||||
static void __init highmem_init(void)
|
||||
void __init arch_mm_preinit(void)
|
||||
{
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
unsigned long tmp;
|
||||
|
||||
memblock_phys_free(high_mem_start, high_mem_sz);
|
||||
for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++)
|
||||
free_highmem_page(pfn_to_page(tmp));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* mem_init - initializes memory
|
||||
*
|
||||
* Frees up bootmem
|
||||
* Calculates and displays memory available/used
|
||||
*/
|
||||
void __init mem_init(void)
|
||||
{
|
||||
memblock_free_all();
|
||||
highmem_init();
|
||||
|
||||
BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE);
|
||||
BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE);
|
||||
|
@ -32,7 +32,7 @@ void __iomem *ioremap(phys_addr_t paddr, unsigned long size)
|
||||
return (void __iomem *)(u32)paddr;
|
||||
|
||||
return ioremap_prot(paddr, size,
|
||||
pgprot_val(pgprot_noncached(PAGE_KERNEL)));
|
||||
pgprot_noncached(PAGE_KERNEL));
|
||||
}
|
||||
EXPORT_SYMBOL(ioremap);
|
||||
|
||||
@ -44,10 +44,8 @@ EXPORT_SYMBOL(ioremap);
|
||||
* might need finer access control (R/W/X)
|
||||
*/
|
||||
void __iomem *ioremap_prot(phys_addr_t paddr, size_t size,
|
||||
unsigned long flags)
|
||||
pgprot_t prot)
|
||||
{
|
||||
pgprot_t prot = __pgprot(flags);
|
||||
|
||||
/* force uncached */
|
||||
return generic_ioremap_prot(paddr, size, pgprot_noncached(prot));
|
||||
}
|
||||
|
@ -19,14 +19,13 @@ extern struct page *empty_zero_page;
|
||||
#define ZERO_PAGE(vaddr) (empty_zero_page)
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
|
||||
#include <asm-generic/pgtable-nopud.h>
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
#include <asm/pgtable-nommu.h>
|
||||
|
||||
#else
|
||||
|
||||
#include <asm-generic/pgtable-nopud.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/pgtable-hwdef.h>
|
||||
|
||||
|
@ -237,56 +237,17 @@ static inline void poison_init_mem(void *s, size_t count)
|
||||
*p++ = 0xe7fddef0;
|
||||
}
|
||||
|
||||
static void __init free_highpages(void)
|
||||
{
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
unsigned long max_low = max_low_pfn;
|
||||
phys_addr_t range_start, range_end;
|
||||
u64 i;
|
||||
|
||||
/* set highmem page free */
|
||||
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE,
|
||||
&range_start, &range_end, NULL) {
|
||||
unsigned long start = PFN_UP(range_start);
|
||||
unsigned long end = PFN_DOWN(range_end);
|
||||
|
||||
/* Ignore complete lowmem entries */
|
||||
if (end <= max_low)
|
||||
continue;
|
||||
|
||||
/* Truncate partial highmem entries */
|
||||
if (start < max_low)
|
||||
start = max_low;
|
||||
|
||||
for (; start < end; start++)
|
||||
free_highmem_page(pfn_to_page(start));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* mem_init() marks the free areas in the mem_map and tells us how much
|
||||
* memory is free. This is done after various parts of the system have
|
||||
* claimed their memory after the kernel image.
|
||||
*/
|
||||
void __init mem_init(void)
|
||||
void __init arch_mm_preinit(void)
|
||||
{
|
||||
#ifdef CONFIG_ARM_LPAE
|
||||
swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE);
|
||||
#endif
|
||||
|
||||
set_max_mapnr(pfn_to_page(max_pfn) - mem_map);
|
||||
|
||||
/* this will put all unused low memory onto the freelists */
|
||||
memblock_free_all();
|
||||
|
||||
#ifdef CONFIG_SA1111
|
||||
/* now that our DMA memory is actually so designated, we can free it */
|
||||
free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
|
||||
memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET);
|
||||
#endif
|
||||
|
||||
free_highpages();
|
||||
|
||||
/*
|
||||
* Check boundaries twice: Some fundamental inconsistencies can
|
||||
* be detected at build time already.
|
||||
|
@ -41,6 +41,7 @@ config ARM64
|
||||
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
|
||||
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
|
||||
select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
|
||||
select ARCH_HAS_PTDUMP
|
||||
select ARCH_HAS_PTE_DEVMAP
|
||||
select ARCH_HAS_PTE_SPECIAL
|
||||
select ARCH_HAS_HW_PTE_YOUNG
|
||||
@ -157,7 +158,6 @@ config ARM64
|
||||
select GENERIC_IRQ_SHOW_LEVEL
|
||||
select GENERIC_LIB_DEVMEM_IS_ALLOWED
|
||||
select GENERIC_PCI_IOMAP
|
||||
select GENERIC_PTDUMP
|
||||
select GENERIC_SCHED_CLOCK
|
||||
select GENERIC_SMP_IDLE_THREAD
|
||||
select GENERIC_TIME_VSYSCALL
|
||||
|
@ -270,9 +270,9 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook);
|
||||
#define _PAGE_IOREMAP PROT_DEVICE_nGnRE
|
||||
|
||||
#define ioremap_wc(addr, size) \
|
||||
ioremap_prot((addr), (size), PROT_NORMAL_NC)
|
||||
ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC))
|
||||
#define ioremap_np(addr, size) \
|
||||
ioremap_prot((addr), (size), PROT_DEVICE_nGnRnE)
|
||||
ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
|
||||
|
||||
/*
|
||||
* io{read,write}{16,32,64}be() macros
|
||||
@ -293,7 +293,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size)
|
||||
if (pfn_is_map_memory(__phys_to_pfn(addr)))
|
||||
return (void __iomem *)__phys_to_virt(addr);
|
||||
|
||||
return ioremap_prot(addr, size, PROT_NORMAL);
|
||||
return ioremap_prot(addr, size, __pgprot(PROT_NORMAL));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -7,7 +7,7 @@
|
||||
|
||||
#include <linux/ptdump.h>
|
||||
|
||||
#ifdef CONFIG_PTDUMP_CORE
|
||||
#ifdef CONFIG_PTDUMP
|
||||
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/seq_file.h>
|
||||
@ -70,6 +70,6 @@ static inline void ptdump_debugfs_register(struct ptdump_info *info,
|
||||
#else
|
||||
static inline void note_page(struct ptdump_state *pt_st, unsigned long addr,
|
||||
int level, u64 val) { }
|
||||
#endif /* CONFIG_PTDUMP_CORE */
|
||||
#endif /* CONFIG_PTDUMP */
|
||||
|
||||
#endif /* __ASM_PTDUMP_H */
|
||||
|
@ -322,13 +322,6 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
|
||||
struct mm_struct *mm,
|
||||
unsigned long uaddr)
|
||||
{
|
||||
__flush_tlb_page_nosync(mm, uaddr);
|
||||
}
|
||||
|
||||
/*
|
||||
* If mprotect/munmap/etc occurs during TLB batched flushing, we need to
|
||||
* synchronise all the TLBI issued with a DSB to avoid the race mentioned in
|
||||
@ -450,7 +443,7 @@ static inline bool __flush_tlb_range_limit_excess(unsigned long start,
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
|
||||
static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long stride, bool last_level,
|
||||
int tlb_level)
|
||||
@ -462,12 +455,12 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
|
||||
pages = (end - start) >> PAGE_SHIFT;
|
||||
|
||||
if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
|
||||
flush_tlb_mm(vma->vm_mm);
|
||||
flush_tlb_mm(mm);
|
||||
return;
|
||||
}
|
||||
|
||||
dsb(ishst);
|
||||
asid = ASID(vma->vm_mm);
|
||||
asid = ASID(mm);
|
||||
|
||||
if (last_level)
|
||||
__flush_tlb_range_op(vale1is, start, pages, stride, asid,
|
||||
@ -476,7 +469,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
|
||||
__flush_tlb_range_op(vae1is, start, pages, stride, asid,
|
||||
tlb_level, true, lpa2_is_enabled());
|
||||
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
|
||||
mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
|
||||
}
|
||||
|
||||
static inline void __flush_tlb_range(struct vm_area_struct *vma,
|
||||
@ -484,7 +477,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
|
||||
unsigned long stride, bool last_level,
|
||||
int tlb_level)
|
||||
{
|
||||
__flush_tlb_range_nosync(vma, start, end, stride,
|
||||
__flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
|
||||
last_level, tlb_level);
|
||||
dsb(ish);
|
||||
}
|
||||
@ -535,6 +528,12 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
|
||||
dsb(ish);
|
||||
isb();
|
||||
}
|
||||
|
||||
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
|
||||
struct mm_struct *mm, unsigned long start, unsigned long end)
|
||||
{
|
||||
__flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -379,7 +379,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
|
||||
prot = __acpi_get_writethrough_mem_attribute();
|
||||
}
|
||||
}
|
||||
return ioremap_prot(phys, size, pgprot_val(prot));
|
||||
return ioremap_prot(phys, size, prot);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -71,8 +71,8 @@ config PTDUMP_STAGE2_DEBUGFS
|
||||
depends on KVM
|
||||
depends on DEBUG_KERNEL
|
||||
depends on DEBUG_FS
|
||||
depends on GENERIC_PTDUMP
|
||||
select PTDUMP_CORE
|
||||
depends on ARCH_HAS_PTDUMP
|
||||
select PTDUMP
|
||||
default n
|
||||
help
|
||||
Say Y here if you want to show the stage-2 kernel pagetables
|
||||
|
@@ -5,7 +5,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
context.o proc.o pageattr.o fixmap.o
obj-$(CONFIG_ARM64_CONTPTE) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o
obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o

@@ -335,7 +335,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
 * eliding the trailing DSB applies here.
 */
addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-__flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+__flush_tlb_range_nosync(vma->vm_mm, addr, addr + CONT_PTE_SIZE,
PAGE_SIZE, true, 3);
}
@@ -309,8 +309,6 @@ void __init arm64_memblock_init(void)
}

early_init_fdt_scan_reserved_mem();
-
-high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}

void __init bootmem_init(void)

@@ -359,12 +357,7 @@ void __init bootmem_init(void)
memblock_dump_all();
}

-/*
- * mem_init() marks the free areas in the mem_map and tells us how much memory
- * is free. This is done after various parts of the system have claimed their
- * memory after the kernel image.
- */
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
unsigned int flags = SWIOTLB_VERBOSE;
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);

@@ -388,9 +381,6 @@ void __init mem_init(void)
swiotlb_init(swiotlb, flags);
swiotlb_update_mem_attributes();

-/* this will put all unused low memory onto the freelists */
-memblock_free_all();
-
/*
 * Check boundaries twice: Some fundamental inconsistencies can be
 * detected at build time already.
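A pattern repeated across this diff: per-arch mem_init() either shrinks or becomes arch_mm_preinit(), because generic MM init now calls memblock_free_all() and sets high_memory/max_mapnr itself. What remains in the arch hook is only the genuinely architecture-specific part, roughly of this shape (hypothetical architecture; arch_dma_phys_limit is a stand-in name, not taken from any file shown here):

void __init arch_mm_preinit(void)
{
	/* Size/enable the SWIOTLB bounce buffer if RAM extends past the DMA limit. */
	bool need_swiotlb = max_pfn > PFN_DOWN(arch_dma_phys_limit);

	swiotlb_init(need_swiotlb, SWIOTLB_VERBOSE);

	/* Note: no memblock_free_all() here any more - generic MM init does it. */
}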
@@ -15,10 +15,9 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook)
}

void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-unsigned long prot)
+pgprot_t pgprot)
{
unsigned long last_addr = phys_addr + size - 1;
-pgprot_t pgprot = __pgprot(prot);

/* Don't allow outside PHYS_MASK */
if (last_addr & ~PHYS_MASK)
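With ioremap_prot() now taking a pgprot_t, callers stop flattening the protection value to an unsigned long and converting it back. A sketch of the call-site change (illustrative driver-style caller; the base address and size are made up):

static void __iomem *map_device_window(phys_addr_t base)
{
	/* Before: ioremap_prot(base, SZ_4K, pgprot_val(PAGE_KERNEL)); */
	/* After: the pgprot_t value is passed through untouched. */
	return ioremap_prot(base, SZ_4K, PAGE_KERNEL);
}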
@@ -36,7 +36,7 @@
 */
#define ioremap_wc(addr, size) \
ioremap_prot((addr), (size), \
-(_PAGE_IOREMAP & ~_CACHE_MASK) | _CACHE_UNCACHED)
+__pgprot((_PAGE_IOREMAP & ~_CACHE_MASK) | _CACHE_UNCACHED))

#include <asm-generic/io.h>
@@ -12,6 +12,45 @@
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>

+#ifdef CONFIG_BLK_DEV_INITRD
+static void __init setup_initrd(void)
+{
+unsigned long size;
+
+if (initrd_start >= initrd_end) {
+pr_err("initrd not found or empty");
+goto disable;
+}
+
+if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
+pr_err("initrd extends beyond end of memory");
+goto disable;
+}
+
+size = initrd_end - initrd_start;
+
+if (memblock_is_region_reserved(__pa(initrd_start), size)) {
+pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region",
+__pa(initrd_start), size);
+goto disable;
+}
+
+memblock_reserve(__pa(initrd_start), size);
+
+pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n",
+(void *)(initrd_start), size);
+
+initrd_below_start_ok = 1;
+
+return;
+
+disable:
+initrd_start = initrd_end = 0;
+pr_err(" - disabling initrd\n");
+}
+#endif
+
static void __init csky_memblock_init(void)
{
unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET);

@@ -40,6 +79,10 @@ static void __init csky_memblock_init(void)
max_low_pfn = min_low_pfn + sseg_size;
}

+#ifdef CONFIG_BLK_DEV_INITRD
+setup_initrd();
+#endif
+
max_zone_pfn[ZONE_NORMAL] = max_low_pfn;

mmu_init(min_low_pfn, max_low_pfn);
@@ -42,73 +42,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);

-#ifdef CONFIG_BLK_DEV_INITRD
-static void __init setup_initrd(void)
-{
-unsigned long size;
-
-if (initrd_start >= initrd_end) {
-pr_err("initrd not found or empty");
-goto disable;
-}
-
-if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
-pr_err("initrd extends beyond end of memory");
-goto disable;
-}
-
-size = initrd_end - initrd_start;
-
-if (memblock_is_region_reserved(__pa(initrd_start), size)) {
-pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region",
-__pa(initrd_start), size);
-goto disable;
-}
-
-memblock_reserve(__pa(initrd_start), size);
-
-pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n",
-(void *)(initrd_start), size);
-
-initrd_below_start_ok = 1;
-
-return;
-
-disable:
-initrd_start = initrd_end = 0;
-pr_err(" - disabling initrd\n");
-}
-#endif
-
-void __init mem_init(void)
-{
-#ifdef CONFIG_HIGHMEM
-unsigned long tmp;
-
-set_max_mapnr(highend_pfn - ARCH_PFN_OFFSET);
-#else
-set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
-#endif
-high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-setup_initrd();
-#endif
-
-memblock_free_all();
-
-#ifdef CONFIG_HIGHMEM
-for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
-struct page *page = pfn_to_page(tmp);
-
-/* FIXME not sure about */
-if (!memblock_is_reserved(tmp << PAGE_SHIFT))
-free_highmem_page(page);
-}
-#endif
-}
-
void free_initmem(void)
{
free_initmem_default(-1);
@@ -43,32 +43,6 @@ DEFINE_SPINLOCK(kmap_gen_lock);
/* checkpatch says don't init this to 0. */
unsigned long long kmap_generation;

-/*
- * mem_init - initializes memory
- *
- * Frees up bootmem
- * Fixes up more stuff for HIGHMEM
- * Calculates and displays memory available/used
- */
-void __init mem_init(void)
-{
-/* No idea where this is actually declared. Seems to evade LXR. */
-memblock_free_all();
-
-/*
- * To-Do: someone somewhere should wipe out the bootmem map
- * after we're done?
- */
-
-/*
- * This can be moved to some more virtual-memory-specific
- * initialization hook at some point. Set the init_mm
- * descriptors "context" value to point to the initial
- * kernel segment table's physical address.
- */
-init_mm.context.ptbase = __pa(init_mm.pgd);
-}
-
void sync_icache_dcache(pte_t pte)
{
unsigned long addr;

@@ -104,10 +78,10 @@ static void __init paging_init(void)
free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */

/*
- * Start of high memory area. Will probably need something more
- * fancy if we... get more fancy.
+ * Set the init_mm descriptors "context" value to point to the
+ * initial kernel segment table's physical address.
 */
-high_memory = (void *)((bootmem_lastpg + 1) << PAGE_SHIFT);
+init_mm.context.ptbase = __pa(init_mm.pgd);
}

#ifndef DMA_RESERVE
@@ -109,8 +109,7 @@ CONFIG_BINFMT_MISC=m
CONFIG_ZPOOL=y
CONFIG_ZSWAP=y
CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y
-CONFIG_ZBUD=y
-CONFIG_ZSMALLOC=m
+CONFIG_ZSMALLOC=y
# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
# CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE is not set

@@ -23,9 +23,9 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size);
#ifdef CONFIG_ARCH_IOREMAP

static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
-unsigned long prot_val)
+pgprot_t prot)
{
-switch (prot_val & _CACHE_MASK) {
+switch (pgprot_val(prot) & _CACHE_MASK) {
case _CACHE_CC:
return (void __iomem *)(unsigned long)(CACHE_BASE + offset);
case _CACHE_SUC:
@@ -38,7 +38,7 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
}

#define ioremap(offset, size) \
-ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_SUC))
+ioremap_prot((offset), (size), PAGE_KERNEL_SUC)

#define iounmap(addr) ((void)(addr))
@@ -55,10 +55,10 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
 */
#define ioremap_wc(offset, size) \
ioremap_prot((offset), (size), \
-pgprot_val(wc_enabled ? PAGE_KERNEL_WUC : PAGE_KERNEL_SUC))
+wc_enabled ? PAGE_KERNEL_WUC : PAGE_KERNEL_SUC)

#define ioremap_cache(offset, size) \
-ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL))
+ioremap_prot((offset), (size), PAGE_KERNEL)

#define mmiowb() wmb()
@@ -387,12 +387,6 @@ void __init paging_init(void)
free_area_init(zones_size);
}

-void __init mem_init(void)
-{
-high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
-memblock_free_all();
-}
-
int pcibus_to_node(struct pci_bus *bus)
{
return dev_to_node(&bus->dev);

@@ -75,14 +75,6 @@ void __init paging_init(void)

free_area_init(max_zone_pfns);
}
-
-void __init mem_init(void)
-{
-max_mapnr = max_low_pfn;
-high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
-
-memblock_free_all();
-}
#endif /* !CONFIG_NUMA */

void __ref free_initmem(void)
@@ -121,7 +121,5 @@ static inline void init_pointer_tables(void)

void __init mem_init(void)
{
-/* this will put all memory onto the freelists */
-memblock_free_all();
init_pointer_tables();
}
@@ -52,19 +52,6 @@ static void __init highmem_init(void)
map_page(PKMAP_BASE, 0, 0); /* XXX gross */
pkmap_page_table = virt_to_kpte(PKMAP_BASE);
}
-
-static void __meminit highmem_setup(void)
-{
-unsigned long pfn;
-
-for (pfn = max_low_pfn; pfn < max_pfn; ++pfn) {
-struct page *page = pfn_to_page(pfn);
-
-/* FIXME not sure about */
-if (!memblock_is_reserved(pfn << PAGE_SHIFT))
-free_highmem_page(page);
-}
-}
#endif /* CONFIG_HIGHMEM */

/*

@@ -104,17 +91,13 @@ void __init setup_memory(void)
 *
 * min_low_pfn - the first page (mm/bootmem.c - node_boot_start)
 * max_low_pfn
- * max_mapnr - the first unused page (mm/bootmem.c - node_low_pfn)
 */

/* memory start is from the kernel end (aligned) to higher addr */
min_low_pfn = memory_start >> PAGE_SHIFT; /* minimum for allocation */
-/* RAM is assumed contiguous */
-max_mapnr = memory_size >> PAGE_SHIFT;
max_low_pfn = ((u64)memory_start + (u64)lowmem_size) >> PAGE_SHIFT;
max_pfn = ((u64)memory_start + (u64)memory_size) >> PAGE_SHIFT;

-pr_info("%s: max_mapnr: %#lx\n", __func__, max_mapnr);
pr_info("%s: min_low_pfn: %#lx\n", __func__, min_low_pfn);
pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn);
pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn);

@@ -124,14 +107,6 @@ void __init setup_memory(void)

void __init mem_init(void)
{
high_memory = (void *)__va(memory_start + lowmem_size - 1);

-/* this will put all memory onto the freelists */
-memblock_free_all();
-#ifdef CONFIG_HIGHMEM
-highmem_setup();
-#endif
-
mem_init_done = 1;
}
@@ -115,7 +115,7 @@ static inline unsigned long isa_virt_to_bus(volatile void *address)
}

void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
-unsigned long prot_val);
+pgprot_t prot);
void iounmap(const volatile void __iomem *addr);

/*
@@ -130,7 +130,7 @@ void iounmap(const volatile void __iomem *addr);
 * address.
 */
#define ioremap(offset, size) \
-ioremap_prot((offset), (size), _CACHE_UNCACHED)
+ioremap_prot((offset), (size), __pgprot(_CACHE_UNCACHED))

/*
 * ioremap_cache - map bus memory into CPU space
@@ -148,7 +148,7 @@ void iounmap(const volatile void __iomem *addr);
 * memory-like regions on I/O busses.
 */
#define ioremap_cache(offset, size) \
-ioremap_prot((offset), (size), _page_cachable_default)
+ioremap_prot((offset), (size), __pgprot(_page_cachable_default))

/*
 * ioremap_wc - map bus memory into CPU space
@@ -169,7 +169,7 @@ void iounmap(const volatile void __iomem *addr);
 * _CACHE_UNCACHED option (see cpu_probe() method).
 */
#define ioremap_wc(offset, size) \
-ioremap_prot((offset), (size), boot_cpu_data.writecombine)
+ioremap_prot((offset), (size), __pgprot(boot_cpu_data.writecombine))

#if defined(CONFIG_CPU_CAVIUM_OCTEON)
#define war_io_reorder_wmb() wmb()

@@ -20,6 +20,4 @@
#define nid_to_addrbase(nid) 0
#endif

-extern void setup_zero_pages(void);
-
#endif /* _ASM_MMZONE_H_ */
@@ -164,13 +164,6 @@ void __init paging_init(void)
free_area_init(zones_size);
}

-void __init mem_init(void)
-{
-high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
-memblock_free_all();
-setup_zero_pages(); /* This comes from node 0 */
-}
-
/* All PCI device belongs to logical Node-0 */
int pcibus_to_node(struct pci_bus *bus)
{

@@ -59,24 +59,16 @@ EXPORT_SYMBOL(zero_page_mask);
/*
 * Not static inline because used by IP27 special magic initialization code
 */
-void setup_zero_pages(void)
+static void __init setup_zero_pages(void)
{
-unsigned int order, i;
-struct page *page;
+unsigned int order;

if (cpu_has_vce)
order = 3;
else
order = 0;

-empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-if (!empty_zero_page)
-panic("Oh boy, that early out of memory?");
-
-page = virt_to_page((void *)empty_zero_page);
-split_page(page, order);
-for (i = 0; i < (1 << order); i++, page++)
-mark_page_reserved(page);
+empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE);

zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
}
@@ -423,17 +415,8 @@ void __init paging_init(void)
" %ldk highmem ignored\n",
(highend_pfn - max_low_pfn) << (PAGE_SHIFT - 10));
max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn;
-
-max_mapnr = max_low_pfn;
-} else if (highend_pfn) {
-max_mapnr = highend_pfn;
-} else {
-max_mapnr = max_low_pfn;
}
-#else
-max_mapnr = max_low_pfn;
#endif
-high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);

free_area_init(max_zone_pfns);
}
@@ -442,26 +425,7 @@
static struct kcore_list kcore_kseg0;
#endif

-static inline void __init mem_init_free_highmem(void)
-{
-#ifdef CONFIG_HIGHMEM
-unsigned long tmp;
-
-if (cpu_has_dc_aliases)
-return;
-
-for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) {
-struct page *page = pfn_to_page(tmp);
-
-if (!memblock_is_memory(PFN_PHYS(tmp)))
-SetPageReserved(page);
-else
-free_highmem_page(page);
-}
-#endif
-}
-
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
/*
 * When PFN_PTE_SHIFT is greater than PAGE_SHIFT we won't have enough PTE
@@ -470,9 +434,7 @@ void __init mem_init(void)
BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT));

maar_init();
-memblock_free_all();
setup_zero_pages(); /* Setup zeroed pages. */
-mem_init_free_highmem();

#ifdef CONFIG_64BIT
if ((unsigned long) &_text > (unsigned long) CKSEG0)
@@ -482,6 +444,11 @@ void __init mem_init(void)
0x80000000 - 4, KCORE_TEXT);
#endif
}
+#else /* CONFIG_NUMA */
+void __init arch_mm_preinit(void)
+{
+setup_zero_pages(); /* This comes from node 0 */
+}
#endif /* !CONFIG_NUMA */

void free_init_pages(const char *what, unsigned long begin, unsigned long end)
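Both here and in the s390 hunk further down, the empty_zero_page setup drops the __get_free_pages() + split_page() + mark_page_reserved() dance in favour of a single memblock allocation, which is reserved by construction and never enters the page allocator. Condensed from the new lines above (order is assumed to have been chosen already):

	/* One memblock allocation replaces alloc + split + per-page reserve. */
	empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order,
								 PAGE_SIZE);
	zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;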
@@ -44,9 +44,9 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
 * ioremap_prot gives the caller control over cache coherency attributes (CCA)
 */
void __iomem *ioremap_prot(phys_addr_t phys_addr, unsigned long size,
-unsigned long prot_val)
+pgprot_t prot)
{
-unsigned long flags = prot_val & _CACHE_MASK;
+unsigned long flags = pgprot_val(prot) & _CACHE_MASK;
unsigned long offset, pfn, last_pfn;
struct vm_struct *area;
phys_addr_t last_addr;

@@ -3,9 +3,9 @@
#include <ioremap.h>

void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size,
-unsigned long prot_val)
+pgprot_t prot)
{
-unsigned long flags = prot_val & _CACHE_MASK;
+unsigned long flags = pgprot_val(prot) & _CACHE_MASK;
u64 base = (flags == _CACHE_UNCACHED ? IO_BASE : UNCAC_BASE);
void __iomem *addr;

@@ -406,8 +406,6 @@ void __init prom_meminit(void)
}
}

-extern void setup_zero_pages(void);
-
void __init paging_init(void)
{
unsigned long zones_size[MAX_NR_ZONES] = {0, };
@@ -416,10 +414,3 @@ void __init paging_init(void)
zones_size[ZONE_NORMAL] = max_low_pfn;
free_area_init(zones_size);
}
-
-void __init mem_init(void)
-{
-high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
-memblock_free_all();
-setup_zero_pages(); /* This comes from node 0 */
-}
@@ -149,6 +149,8 @@ void __init setup_arch(char **cmdline_p)
memory_start = memblock_start_of_DRAM();
memory_end = memblock_end_of_DRAM();

+pr_debug("%s: start=%lx, end=%lx\n", __func__, memory_start, memory_end);
+
setup_initial_init_mm(_stext, _etext, _edata, _end);
init_task.thread.kregs = &fake_regs;
@@ -156,7 +158,6 @@ void __init setup_arch(char **cmdline_p)
*cmdline_p = boot_command_line;

find_limits(&min_low_pfn, &max_low_pfn, &max_pfn);
-max_mapnr = max_low_pfn;

memblock_reserve(__pa_symbol(_stext), _end - _stext);
#ifdef CONFIG_BLK_DEV_INITRD

@@ -51,7 +51,7 @@ void __init paging_init(void)
pagetable_init();
pgd_current = swapper_pg_dir;

-max_zone_pfn[ZONE_NORMAL] = max_mapnr;
+max_zone_pfn[ZONE_NORMAL] = max_low_pfn;

/* pass the memory from the bootmem allocator to the main allocator */
free_area_init(max_zone_pfn);
@@ -60,20 +60,6 @@ void __init paging_init(void)
(unsigned long)empty_zero_page + PAGE_SIZE);
}

-void __init mem_init(void)
-{
-unsigned long end_mem = memory_end; /* this must not include
-kernel stack at top */
-
-pr_debug("mem_init: start=%lx, end=%lx\n", memory_start, memory_end);
-
-end_mem &= PAGE_MASK;
-high_memory = __va(end_mem);
-
-/* this will put all memory onto the freelists */
-memblock_free_all();
-}
-
void __init mmu_init(void)
{
flush_tlb_all();
@@ -193,15 +193,9 @@ void __init mem_init(void)
{
BUG_ON(!mem_map);

-max_mapnr = max_low_pfn;
-high_memory = (void *)__va(max_low_pfn * PAGE_SIZE);
-
/* clear the zero-page */
memset((void *)empty_zero_page, 0, PAGE_SIZE);

-/* this will put all low memory onto the freelists */
-memblock_free_all();
-
printk("mem_init_done ...........................................\n");
mem_init_done = 1;
return;
@@ -131,7 +131,7 @@ static inline void gsc_writeq(unsigned long long val, unsigned long addr)
_PAGE_ACCESSED | _PAGE_NO_CACHE)

#define ioremap_wc(addr, size) \
-ioremap_prot((addr), (size), _PAGE_IOREMAP)
+ioremap_prot((addr), (size), __pgprot(_PAGE_IOREMAP))

#define pci_iounmap pci_iounmap

@@ -562,10 +562,6 @@ void __init mem_init(void)
BUILD_BUG_ON(TMPALIAS_MAP_START >= 0x80000000);
#endif

-high_memory = __va((max_pfn << PAGE_SHIFT));
-set_max_mapnr(max_low_pfn);
-memblock_free_all();
-
#ifdef CONFIG_PA11
if (boot_cpu_data.cpu_type == pcxl2 || boot_cpu_data.cpu_type == pcxl) {
pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START);

@@ -14,7 +14,7 @@
#include <linux/mm.h>

void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-unsigned long prot)
+pgprot_t prot)
{
#ifdef CONFIG_EISA
unsigned long end = phys_addr + size - 1;
@@ -41,6 +41,6 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
}
}

-return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+return generic_ioremap_prot(phys_addr, size, prot);
}
EXPORT_SYMBOL(ioremap_prot);
@@ -148,6 +148,7 @@ config PPC
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREEMPT_LAZY
+select ARCH_HAS_PTDUMP
select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
@@ -207,7 +208,6 @@ config PPC
select GENERIC_IRQ_SHOW
select GENERIC_IRQ_SHOW_LEVEL
select GENERIC_PCI_IOMAP if PCI
-select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_VDSO_DATA_STORE

@@ -77,4 +77,4 @@ CONFIG_DEBUG_VM_PGTABLE=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_BDI_SWITCH=y
CONFIG_PPC_EARLY_DEBUG=y
-CONFIG_GENERIC_PTDUMP=y
+CONFIG_PTDUMP_DEBUGFS=y

@@ -94,4 +94,10 @@ static inline int check_and_get_huge_psize(int shift)
return mmu_psize;
}

+#define arch_has_huge_bootmem_alloc arch_has_huge_bootmem_alloc
+
+static inline bool arch_has_huge_bootmem_alloc(void)
+{
+return (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled());
+}
#endif

@@ -826,7 +826,7 @@ void __iomem *ioremap_wt(phys_addr_t address, unsigned long size);

void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size);
#define ioremap_cache(addr, size) \
-ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
+ioremap_prot((addr), (size), PAGE_KERNEL)

#define iounmap iounmap
@@ -957,8 +957,6 @@ void __init setup_arch(char **cmdline_p)

/* Parse memory topology */
mem_topology_setup();
-/* Set max_mapnr before paging_init() */
-set_max_mapnr(max_pfn);
high_memory = (void *)__va(max_low_pfn * PAGE_SIZE);

/*

@@ -15,5 +15,5 @@ obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump/
+obj-$(CONFIG_PTDUMP) += ptdump/
obj-$(CONFIG_KASAN) += kasan/

@@ -113,6 +113,7 @@ static int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate)
gpage_freearray[nr_gpages] = 0;
list_add(&m->list, &huge_boot_pages[0]);
m->hstate = hstate;
+m->flags = 0;
return 1;
}

@@ -41,6 +41,7 @@
#include <linux/libfdt.h>
#include <linux/memremap.h>
#include <linux/memory.h>
+#include <linux/bootmem_info.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -386,10 +387,13 @@ void __ref vmemmap_free(unsigned long start, unsigned long end,
}

#endif

+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long size)
{
}
+#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

#endif /* CONFIG_SPARSEMEM_VMEMMAP */
@@ -34,9 +34,9 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size)
return __ioremap_caller(addr, size, prot, caller);
}

-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags)
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot)
{
-pte_t pte = __pte(flags);
+pte_t pte = __pte(pgprot_val(prot));
void *caller = __builtin_return_address(0);

/* writeable implies dirty for kernel addresses */

@@ -273,7 +273,7 @@ void __init paging_init(void)
mark_nonram_nosave();
}

-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
/*
 * book3s is limited to 16 page sizes due to encoding this in
@@ -295,22 +295,6 @@ void __init mem_init(void)

kasan_late_init();

-memblock_free_all();
-
-#ifdef CONFIG_HIGHMEM
-{
-unsigned long pfn, highmem_mapnr;
-
-highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT;
-for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) {
-phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT;
-struct page *page = pfn_to_page(pfn);
-if (memblock_is_memory(paddr) && !memblock_is_reserved(paddr))
-free_highmem_page(page);
-}
-}
-#endif /* CONFIG_HIGHMEM */
-
#if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP)
/*
 * If smp is enabled, next_tlbcam_idx is initialized in the cpu up
@@ -190,10 +190,10 @@ static void spu_unmap(struct spu *spu)
static int __init setup_areas(struct spu *spu)
{
struct table {char* name; unsigned long addr; unsigned long size;};
-unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));

spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr,
-sizeof(struct spe_shadow), shadow_flags);
+sizeof(struct spe_shadow),
+pgprot_noncached_wc(PAGE_KERNEL_RO));
if (!spu_pdata(spu)->shadow) {
pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__);
goto fail_ioremap;
@@ -45,6 +45,7 @@ config RISCV
select ARCH_HAS_PMEM_API
select ARCH_HAS_PREEMPT_LAZY
select ARCH_HAS_PREPARE_SYNC_CORE_CMD
+select ARCH_HAS_PTDUMP if MMU
select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
@@ -115,7 +116,6 @@ config RISCV
select GENERIC_LIB_DEVMEM_IS_ALLOWED
select GENERIC_PENDING_IRQ if SMP
select GENERIC_PCI_IOMAP
-select GENERIC_PTDUMP if MMU
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL if MMU && 64BIT

@@ -137,7 +137,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw())

#ifdef CONFIG_MMU
#define arch_memremap_wb(addr, size, flags) \
-((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL))
+((__force void *)ioremap_prot((addr), (size), __pgprot(_PAGE_KERNEL)))
#endif

#endif /* _ASM_RISCV_IO_H */

@@ -60,8 +60,7 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,

bool arch_tlbbatch_should_defer(struct mm_struct *mm);
void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-struct mm_struct *mm,
-unsigned long uaddr);
+struct mm_struct *mm, unsigned long start, unsigned long end);
void arch_flush_tlb_batched_pending(struct mm_struct *mm);
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
@@ -305,7 +305,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
}
}

-return ioremap_prot(phys, size, pgprot_val(prot));
+return ioremap_prot(phys, size, prot);
}

#ifdef CONFIG_PCI

@@ -19,7 +19,7 @@ obj-y += context.o
obj-y += pmem.o

obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_KASAN) += kasan_init.o

ifdef CONFIG_KASAN

@@ -171,7 +171,7 @@ static void __init print_vm_layout(void)
static void print_vm_layout(void) { }
#endif /* CONFIG_DEBUG_VM */

-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
bool swiotlb = max_pfn > PFN_DOWN(dma32_phys_limit);
#ifdef CONFIG_FLATMEM
@@ -192,7 +192,6 @@ void __init mem_init(void)
}

swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
-memblock_free_all();

print_vm_layout();
}
@@ -295,10 +294,8 @@ static void __init setup_bootmem(void)
phys_ram_end = memblock_end_of_DRAM();
min_low_pfn = PFN_UP(phys_ram_base);
max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
-high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));

dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn));
-set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);

reserve_initrd_mem();
@@ -186,8 +186,7 @@ bool arch_tlbbatch_should_defer(struct mm_struct *mm)
}

void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-struct mm_struct *mm,
-unsigned long uaddr)
+struct mm_struct *mm, unsigned long start, unsigned long end)
{
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
}
@@ -92,6 +92,7 @@ config S390
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_PREEMPT_LAZY
+select ARCH_HAS_PTDUMP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SCALED_CPUTIME
select ARCH_HAS_SET_DIRECT_MAP
@@ -159,7 +160,6 @@ config S390
select GENERIC_CPU_VULNERABILITIES
select GENERIC_ENTRY
select GENERIC_GETTIMEOFDAY
-select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GENERIC_VDSO_DATA_STORE

@@ -92,7 +92,7 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
-CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
+CONFIG_ZSMALLOC=y
CONFIG_ZSMALLOC_STAT=y
CONFIG_SLAB_BUCKETS=y
CONFIG_SLUB_STATS=y

@@ -86,7 +86,7 @@ CONFIG_UNIXWARE_DISKLABEL=y
CONFIG_IOSCHED_BFQ=y
CONFIG_BINFMT_MISC=m
CONFIG_ZSWAP=y
-CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD=y
+CONFIG_ZSMALLOC=y
CONFIG_ZSMALLOC_STAT=y
CONFIG_SLAB_BUCKETS=y
# CONFIG_COMPAT_BRK is not set
@@ -33,7 +33,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL)

#define ioremap_wc(addr, size) \
-ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL)))
+ioremap_prot((addr), (size), pgprot_writecombine(PAGE_KERNEL))

static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
{

@@ -9,6 +9,6 @@ obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
obj-$(CONFIG_PFAULT) += pfault.o
@@ -74,8 +74,6 @@ static void __init setup_zero_pages(void)
{
unsigned long total_pages = memblock_estimated_nr_free_pages();
unsigned int order;
-struct page *page;
-int i;

/* Latest machines require a mapping granularity of 512KB */
order = 7;
@@ -84,16 +82,7 @@ static void __init setup_zero_pages(void)
while (order > 2 && (total_pages >> 10) < (1UL << order))
order--;

-empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-if (!empty_zero_page)
-panic("Out of memory in setup_zero_pages");
-
-page = virt_to_page((void *) empty_zero_page);
-split_page(page, order);
-for (i = 1 << order; i > 0; i--) {
-mark_page_reserved(page);
-page++;
-}
+empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE);

zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
}
@@ -166,18 +155,13 @@ static void pv_init(void)
swiotlb_update_mem_attributes();
}

-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(0, mm_cpumask(&init_mm));

-set_max_mapnr(max_low_pfn);
-high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
pv_init();

-/* this will put all low memory onto the freelists */
-memblock_free_all();
setup_zero_pages();	/* Setup zeroed pages. */
}
@@ -239,16 +223,13 @@ struct s390_cma_mem_data {
static int s390_cma_check_range(struct cma *cma, void *data)
{
struct s390_cma_mem_data *mem_data;
-unsigned long start, end;

mem_data = data;
-start = cma_get_base(cma);
-end = start + cma_get_size(cma);
-if (end < mem_data->start)
-return 0;
-if (start >= mem_data->end)
-return 0;
-return -EBUSY;
+
+if (cma_intersects(cma, mem_data->start, mem_data->end))
+return -EBUSY;
+
+return 0;
}

static int s390_cma_mem_notifier(struct notifier_block *nb,
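The CMA hunk above replaces an open-coded overlap test with the cma_intersects() helper. The removed logic treated the two ranges as disjoint when the CMA area ended before the offlined memory block or began at/after its end, and returned -EBUSY otherwise; the helper expresses the same predicate. A stand-alone restatement of that check (plain illustration, not kernel code):

static bool ranges_overlap(unsigned long cma_base, unsigned long cma_size,
			   unsigned long start, unsigned long end)
{
	unsigned long cma_end = cma_base + cma_size;

	/* Mirrors the removed checks: disjoint if cma_end < start or cma_base >= end. */
	return !(cma_end < start || cma_base >= end);
}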
@@ -255,7 +255,7 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
}

void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-unsigned long prot)
+pgprot_t prot)
{
/*
 * When PCI MIO instructions are unavailable the "physical" address
@@ -265,7 +265,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
if (!static_branch_unlikely(&have_mio))
return (void __iomem *)phys_addr;

-return generic_ioremap_prot(phys_addr, size, __pgprot(prot));
+return generic_ioremap_prot(phys_addr, size, prot);
}
EXPORT_SYMBOL(ioremap_prot);
@@ -58,7 +58,7 @@ static int __init landisk_devices_setup(void)
/* open I/O area window */
paddrbase = virt_to_phys((void *)PA_AREA5_IO);
prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16);
-cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, pgprot_val(prot));
+cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, prot);
if (!cf_ide_base) {
printk("allocate_cf_area : can't open CF I/O window!\n");
return -ENOMEM;

@@ -53,7 +53,7 @@ static int __init lboxre2_devices_setup(void)
paddrbase = virt_to_phys((void*)PA_AREA5_IO);
psize = PAGE_SIZE;
prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16);
-cf0_io_base = (u32)ioremap_prot(paddrbase, psize, pgprot_val(prot));
+cf0_io_base = (u32)ioremap_prot(paddrbase, psize, prot);
if (!cf0_io_base) {
printk(KERN_ERR "%s : can't open CF I/O window!\n" , __func__ );
return -ENOMEM;

@@ -75,7 +75,7 @@ static int __init sh03_devices_setup(void)
/* open I/O area window */
paddrbase = virt_to_phys((void *)PA_AREA5_IO);
prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16);
-cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, pgprot_val(prot));
+cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, prot);
if (!cf_ide_base) {
printk("allocate_cf_area : can't open CF I/O window!\n");
return -ENOMEM;
Some files were not shown because too many files have changed in this diff.