Linus Torvalds
7203ca412f Significant patch series in this merge are as follows:
- The 10 patch series "__vmalloc()/kvmalloc() and no-block support" from
   Uladzislau Rezki reworks the vmalloc() code to support non-blocking
   allocations (GFP_ATOMIC, GFP_NOWAIT).
 
 - The 2 patch series "ksm: fix exec/fork inheritance" from xu xin fixes
   a rare case where the KSM MMF_VM_MERGE_ANY prctl state is not inherited
   across fork/exec.
 
 - The 4 patch series "mm/zswap: misc cleanup of code and documentations"
   from SeongJae Park does some light maintenance work on the zswap code.
 
 - The 5 patch series "mm/page_owner: add debugfs files 'show_handles'
   and 'show_stacks_handles'" from Mauricio Faria de Oliveira enhances the
   /sys/kernel/debug/page_owner debug feature.  It adds unique identifiers
   to differentiate the various stack traces so that userspace monitoring
   tools can better match stack traces over time.
 
 - The 2 patch series "mm/page_alloc: pcp->batch cleanups" from Joshua
   Hahn makes some minor alterations to the page allocator's per-cpu-pages
   feature.
 
 - The 2 patch series "Improve UFFDIO_MOVE scalability by removing
   anon_vma lock" from Lokesh Gidra addresses a scalability issue in
   userfaultfd's UFFDIO_MOVE operation.
 
 - The 2 patch series "kasan: cleanups for kasan_enabled() checks" from
   Sabyrzhan Tasbolatov performs some cleanup in the KASAN code.
 
 - The 2 patch series "drivers/base/node: fold node register and
   unregister functions" from Donet Tom cleans up the NUMA node handling
   code a little.
 
 - The 4 patch series "mm: some optimizations for prot numa" from Kefeng
   Wang provides some cleanups and small optimizations to the NUMA
   allocation hinting code.
 
 - The 5 patch series "mm/page_alloc: Batch callers of
   free_pcppages_bulk" from Joshua Hahn addresses long lock hold times at
   boot on large machines.  These were causing (harmless) softlockup
   warnings.
 
 - The 2 patch series "optimize the logic for handling dirty file folios
   during reclaim" from Baolin Wang removes some now-unnecessary work from
   page reclaim.
 
 - The 10 patch series "mm/damon: allow DAMOS auto-tuned for per-memcg
   per-node memory usage" from SeongJae Park enhances the DAMOS auto-tuning
   feature.
 
 - The 2 patch series "mm/damon: fixes for address alignment issues in
   DAMON_LRU_SORT and DAMON_RECLAIM" from Quanmin Yan fixes DAMON_LRU_SORT
   and DAMON_RECLAIM with certain userspace configuration.
 
 - The 15 patch series "expand mmap_prepare functionality, port more
   users" from Lorenzo Stoakes enhances the new(ish)
   file_operations.mmap_prepare() method and ports additional callsites
   from the old ->mmap() over to ->mmap_prepare().
 
 - The 8 patch series "Fix stale IOTLB entries for kernel address space"
   from Lu Baolu fixes a bug (and possible security issue on non-x86) in
   the IOMMU code.  In some situations the IOMMU could be left hanging onto
   a stale kernel pagetable entry.
 
 - The 4 patch series "mm/huge_memory: cleanup __split_unmapped_folio()"
   from Wei Yang cleans up and optimizes the folio splitting code.
 
 - The 5 patch series "mm, swap: misc cleanup and bugfix" from Kairui
   Song implements some cleanups and a minor fix in the swap discard code.
 
 - The 8 patch series "mm/damon: misc documentation fixups" from SeongJae
   Park does as advertised.
 
 - The 9 patch series "mm/damon: support pin-point targets removal" from
   SeongJae Park permits userspace to remove a specific monitoring target
   in the middle of the current targets list.
 
 - The 2 patch series "mm: MISC follow-up patches for linux/pgalloc.h"
   from Harry Yoo implements a couple of cleanups related to mm header file
   inclusion.
 
 - The 2 patch series "mm/swapfile.c: select swap devices of default
   priority round robin" from Baoquan He improves the selection of swap
   devices for NUMA machines.
 
 - The 3 patch series "mm: Convert memory block states (MEM_*) macros to
   enums" from Israel Batista changes the memory block labels from macros
   to enums so they will appear in kernel debug info.
 
 - The 3 patch series "ksm: perform a range-walk to jump over holes in
   break_ksm" from Pedro Demarchi Gomes addresses an inefficiency when KSM
   unmerges an address range.
 
 - The 22 patch series "mm/damon/tests: fix memory bugs in kunit tests"
   from SeongJae Park fixes leaks and unhandled malloc() failures in DAMON
   userspace unit tests.
 
 - The 2 patch series "some cleanups for pageout()" from Baolin Wang
   cleans up a couple of minor things in the page scanner's
   writeback-for-eviction code.
 
 - The 2 patch series "mm/hugetlb: refactor sysfs/sysctl interfaces" from
   Hui Zhu moves hugetlb's sysfs/sysctl handling code into a new file.
 
 - The 9 patch series "introduce VM_MAYBE_GUARD and make it sticky" from
   Lorenzo Stoakes makes the VMA guard regions available in /proc/pid/smaps
   and improves the mergeability of guarded VMAs.
 
 - The 2 patch series "mm: perform guard region install/remove under VMA
   lock" from Lorenzo Stoakes reduces mmap lock contention for callers
   performing VMA guard region operations.
 
 - The 2 patch series "vma_start_write_killable" from Matthew Wilcox
   starts work in permitting applications to be killed when they are
   waiting on a read_lock on the VMA lock.
 
 - The 11 patch series "mm/damon/tests: add more tests for online
   parameters commit" from SeongJae Park adds additional userspace testing
   of DAMON's "commit" feature.
 
 - The 9 patch series "mm/damon: misc cleanups" from SeongJae Park does
   that.
 
 - The 2 patch series "make VM_SOFTDIRTY a sticky VMA flag" from Lorenzo
   Stoakes addresses the possible loss of a VMA's VM_SOFTDIRTY flag when
   that VMA is merged with another.
 
 - The 16 patch series "mm: support device-private THP" from Balbir Singh
   introduces support for Transparent Huge Page (THP) migration in zone
   device-private memory.
 
 - The 3 patch series "Optimize folio split in memory failure" from Zi
   Yan optimizes folio split operations in the memory failure code.
 
 - The 2 patch series "mm/huge_memory: Define split_type and consolidate
   split support checks" from Wei Yang provides some more cleanups in the
   folio splitting code.
 
 - The 16 patch series "mm: remove is_swap_[pte, pmd]() + non-swap
   entries, introduce leaf entries" from Lorenzo Stoakes cleans up our
   handling of pagetable leaf entries by introducing the concept of
   'software leaf entries', of type softleaf_t.
 
 - The 4 patch series "reparent the THP split queue" from Muchun Song
   reparents the THP split queue to its parent memcg.  This is in
   preparation for addressing the long-standing "dying memcg" problem,
   wherein dead memcgs linger for too long, consuming memory resources.
 
 - The 3 patch series "unify PMD scan results and remove redundant
   cleanup" from Wei Yang does a little cleanup in the hugepage collapse
   code.
 
 - The 6 patch series "zram: introduce writeback bio batching" from
   Sergey Senozhatsky improves zram writeback efficiency by introducing
   batched bio writeback support.
 
 - The 4 patch series "memcg: cleanup the memcg stats interfaces" from
   Shakeel Butt cleans up our handling of the interrupt safety of some
   memcg stats.
 
 - The 4 patch series "make vmalloc gfp flags usage more apparent" from
   Vishal Moola cleans up vmalloc's handling of incoming GFP flags.
 
 - The 6 patch series "mm: Add soft-dirty and uffd-wp support for RISC-V"
   from Chunyan Zhang teaches soft dirty and userfaultfd write protect
   tracking to use RISC-V's Svrsw60t59b extension.
 
 - The 5 patch series "mm: swap: small fixes and comment cleanups" from
   Youngjun Park fixes a small bug and cleans up some of the swap code.
 
 - The 4 patch series "initial work on making VMA flags a bitmap" from
   Lorenzo Stoakes starts work on converting the vma struct's flags to a
   bitmap, so we stop running out of them, especially on 32-bit.
 
 - The 2 patch series "mm/swapfile: fix and cleanup swap list iterations"
   from Youngjun Park addresses a possible bug in the swap discard code and
   cleans things up a little.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaTEb0wAKCRDdBJ7gKXxA
 jjfIAP94W4EkCCwNOupnChoG+YWw/JW21anXt5NN+i5svn1yugEAwzvv6A+cAFng
 o+ug/fyrfPZG7PLp2R8WFyGIP0YoBA4=
 =IUzS
 -----END PGP SIGNATURE-----

Merge tag 'mm-stable-2025-12-03-21-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM updates from Andrew Morton:

  "__vmalloc()/kvmalloc() and no-block support" (Uladzislau Rezki)
     Rework the vmalloc() code to support non-blocking allocations
      (GFP_ATOMIC, GFP_NOWAIT)

  "ksm: fix exec/fork inheritance" (xu xin)
     Fix a rare case where the KSM MMF_VM_MERGE_ANY prctl state is not
     inherited across fork/exec

  "mm/zswap: misc cleanup of code and documentations" (SeongJae Park)
     Some light maintenance work on the zswap code

  "mm/page_owner: add debugfs files 'show_handles' and 'show_stacks_handles'" (Mauricio Faria de Oliveira)
     Enhance the /sys/kernel/debug/page_owner debug feature by adding
     unique identifiers to differentiate the various stack traces so
     that userspace monitoring tools can better match stack traces over
     time

  "mm/page_alloc: pcp->batch cleanups" (Joshua Hahn)
     Minor alterations to the page allocator's per-cpu-pages feature

  "Improve UFFDIO_MOVE scalability by removing anon_vma lock" (Lokesh Gidra)
     Address a scalability issue in userfaultfd's UFFDIO_MOVE operation

  "kasan: cleanups for kasan_enabled() checks" (Sabyrzhan Tasbolatov)

  "drivers/base/node: fold node register and unregister functions" (Donet Tom)
     Clean up the NUMA node handling code a little

  "mm: some optimizations for prot numa" (Kefeng Wang)
     Cleanups and small optimizations to the NUMA allocation hinting
     code

  "mm/page_alloc: Batch callers of free_pcppages_bulk" (Joshua Hahn)
     Address long lock hold times at boot on large machines. These were
     causing (harmless) softlockup warnings

  "optimize the logic for handling dirty file folios during reclaim" (Baolin Wang)
     Remove some now-unnecessary work from page reclaim

  "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage" (SeongJae Park)
     Enhance the DAMOS auto-tuning feature

  "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM" (Quanmin Yan)
     Fix DAMON_LRU_SORT and DAMON_RECLAIM with certain userspace
     configuration

  "expand mmap_prepare functionality, port more users" (Lorenzo Stoakes)
     Enhance the new(ish) file_operations.mmap_prepare() method and port
     additional callsites from the old ->mmap() over to ->mmap_prepare()

  "Fix stale IOTLB entries for kernel address space" (Lu Baolu)
     Fix a bug (and possible security issue on non-x86) in the IOMMU
     code. In some situations the IOMMU could be left hanging onto a
     stale kernel pagetable entry

  "mm/huge_memory: cleanup __split_unmapped_folio()" (Wei Yang)
     Clean up and optimize the folio splitting code

  "mm, swap: misc cleanup and bugfix" (Kairui Song)
     Some cleanups and a minor fix in the swap discard code

  "mm/damon: misc documentation fixups" (SeongJae Park)

  "mm/damon: support pin-point targets removal" (SeongJae Park)
     Permit userspace to remove a specific monitoring target in the
     middle of the current targets list

  "mm: MISC follow-up patches for linux/pgalloc.h" (Harry Yoo)
     A couple of cleanups related to mm header file inclusion

  "mm/swapfile.c: select swap devices of default priority round robin" (Baoquan He)
     Improve the selection of swap devices for NUMA machines

  "mm: Convert memory block states (MEM_*) macros to enums" (Israel Batista)
     Change the memory block labels from macros to enums so they will
     appear in kernel debug info (a small generic illustration of this
     idea appears just before the commit list below)

  "ksm: perform a range-walk to jump over holes in break_ksm" (Pedro Demarchi Gomes)
     Address an inefficiency when KSM unmerges an address range

  "mm/damon/tests: fix memory bugs in kunit tests" (SeongJae Park)
     Fix leaks and unhandled malloc() failures in DAMON userspace unit
     tests

  "some cleanups for pageout()" (Baolin Wang)
     Clean up a couple of minor things in the page scanner's
     writeback-for-eviction code

  "mm/hugetlb: refactor sysfs/sysctl interfaces" (Hui Zhu)
     Move hugetlb's sysfs/sysctl handling code into a new file

  "introduce VM_MAYBE_GUARD and make it sticky" (Lorenzo Stoakes)
     Make the VMA guard regions available in /proc/pid/smaps and
     improve the mergeability of guarded VMAs

  "mm: perform guard region install/remove under VMA lock" (Lorenzo Stoakes)
     Reduce mmap lock contention for callers performing VMA guard region
     operations

  "vma_start_write_killable" (Matthew Wilcox)
     Start work on permitting applications to be killed when they are
     waiting on a read_lock on the VMA lock

  "mm/damon/tests: add more tests for online parameters commit" (SeongJae Park)
     Add additional userspace testing of DAMON's "commit" feature

  "mm/damon: misc cleanups" (SeongJae Park)

  "make VM_SOFTDIRTY a sticky VMA flag" (Lorenzo Stoakes)
     Address the possible loss of a VMA's VM_SOFTDIRTY flag when that
     VMA is merged with another

  "mm: support device-private THP" (Balbir Singh)
     Introduce support for Transparent Huge Page (THP) migration in zone
     device-private memory

  "Optimize folio split in memory failure" (Zi Yan)

  "mm/huge_memory: Define split_type and consolidate split support checks" (Wei Yang)
     Some more cleanups in the folio splitting code

  "mm: remove is_swap_[pte, pmd]() + non-swap entries, introduce leaf entries" (Lorenzo Stoakes)
     Clean up our handling of pagetable leaf entries by introducing the
     concept of 'software leaf entries', of type softleaf_t

  "reparent the THP split queue" (Muchun Song)
     Reparent the THP split queue to its parent memcg. This is in
     preparation for addressing the long-standing "dying memcg" problem,
     wherein dead memcgs linger for too long, consuming memory
     resources

  "unify PMD scan results and remove redundant cleanup" (Wei Yang)
     A little cleanup in the hugepage collapse code

  "zram: introduce writeback bio batching" (Sergey Senozhatsky)
     Improve zram writeback efficiency by introducing batched bio
     writeback support

  "memcg: cleanup the memcg stats interfaces" (Shakeel Butt)
     Clean up our handling of the interrupt safety of some memcg stats

  "make vmalloc gfp flags usage more apparent" (Vishal Moola)
     Clean up vmalloc's handling of incoming GFP flags

  "mm: Add soft-dirty and uffd-wp support for RISC-V" (Chunyan Zhang)
     Teach soft dirty and userfaultfd write protect tracking to use
     RISC-V's Svrsw60t59b extension

  "mm: swap: small fixes and comment cleanups" (Youngjun Park)
     Fix a small bug and clean up some of the swap code

  "initial work on making VMA flags a bitmap" (Lorenzo Stoakes)
     Start work on converting the vma struct's flags to a bitmap, so we
     stop running out of them, especially on 32-bit

  "mm/swapfile: fix and cleanup swap list iterations" (Youngjun Park)
     Address a possible bug in the swap discard code and clean things
     up a little

[ This merge also reverts commit ebb9aeb980e5 ("vfio/nvgrace-gpu:
  register device memory for poison handling") because it looks
  broken to me, I've asked for clarification   - Linus ]

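A small generic illustration of the macro-to-enum idea behind the
"mm: Convert memory block states (MEM_*) macros to enums" series above:
enumerator names are emitted into DWARF debug info, while #define names
vanish after preprocessing, so debuggers and tools such as drgn only ever
see raw numbers. The names below are hypothetical, not the kernel's actual
MEM_* values.

    /* Before: after preprocessing, only the literal values 1 and 2 remain,
     * so debug info has no record of the state names. */
    #define OLD_STATE_ONLINE   1
    #define OLD_STATE_OFFLINE  2

    /* After: the enum type and its named constants are carried in debug
     * info, so a debugger can print the symbolic name instead of a number. */
    enum block_state {
            BLOCK_STATE_ONLINE  = 1,
            BLOCK_STATE_OFFLINE = 2,
    };
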
* tag 'mm-stable-2025-12-03-21-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (321 commits)
  mm: fix vma_start_write_killable() signal handling
  mm/swapfile: use plist_for_each_entry in __folio_throttle_swaprate
  mm/swapfile: fix list iteration when next node is removed during discard
  fs/proc/task_mmu.c: fix make_uffd_wp_huge_pte() huge pte handling
  mm/kfence: add reboot notifier to disable KFENCE on shutdown
  memcg: remove inc/dec_lruvec_kmem_state helpers
  selftests/mm/uffd: initialize char variable to Null
  mm: fix DEBUG_RODATA_TEST indentation in Kconfig
  mm: introduce VMA flags bitmap type
  tools/testing/vma: eliminate dependency on vma->__vm_flags
  mm: simplify and rename mm flags function for clarity
  mm: declare VMA flags by bit
  zram: fix a spelling mistake
  mm/page_alloc: optimize lowmem_reserve max lookup using its semantic monotonicity
  mm/vmscan: skip increasing kswapd_failures when reclaim was boosted
  pagemap: update BUDDY flag documentation
  mm: swap: remove scan_swap_map_slots() references from comments
  mm: swap: change swap_alloc_slow() to void
  mm, swap: remove redundant comment for read_swap_cache_async
  mm, swap: use SWP_SOLIDSTATE to determine if swap is rotational
  ...
2025-12-05 13:52:43 -08:00
Linus Torvalds
ac20755937 Summary
* Move jiffies converters out of kernel/sysctl.c
 
   Moved the jiffies converters into kernel/time/jiffies.c and replaced
   the pipe-max-size proc_handler converter with a macro based version.
   This is all part of the effort to relocate non-sysctl logic out of
   kernel/sysctl.c into more relevant subsystems. No functional changes.
 
 * Generalize proc handler converter creation
 
   Removed duplicated sysctl converter logic by consolidating it in
   macros. These are used inside sysctl core as well as in pipe.c and
   jiffies.c. Converter kernel and user space pointer args are now
   automatically const qualified for the convenience of the caller. No
   functional changes.
 
 * Miscellaneous
 
   Fixed kernel-doc format warnings, removed unnecessary __user
   qualifiers, and moved the nmi_watchdog sysctl into .rodata.
 
 * Testing
 
   This series was run through sysctl selftests/kunit test suite in
   x86_64. It went into linux-next after rc2, giving it a good 4/5 weeks
   of testing.
 -----BEGIN PGP SIGNATURE-----
 
 iQGzBAABCgAdFiEErkcJVyXmMSXOyyeQupfNUreWQU8FAmktuJMACgkQupfNUreW
 QU9l8Qv+Noh/wLTqBEmHCrQ8k19YCNlBHO6a10Q5bFiAiGdTAMCZ7oFzoAwAjv5y
 pLtzS75G89zP0O6wgkxTsmoDNi4MRJenOCyjyEDFYvrK+qSTm0CWs0sZCsHqX4Dg
 7M+7PVK/EbMO5509J2ae6cYS9pjfwg3EBQZ978b/FATkuhRjxOIJhIv3ZoaFjme4
 0q/xqHw+oms5CUL035BfqtkoskIiRT19DAvM/DEjc2ByaHCTGURv00XLvSDHaRer
 O0Z8nXaxOOCscLunZbC3UL+hC7tB0nPE+XSzm9ylBEM7bTxeZmtvx2G6ru0+873U
 Sp+BwpFhe0RmzBFlclkd7UPtvGlFAY2QgAfpSaiLsodkoX0mctquTgpy99LhxKej
 EEyjl9tPVrYoH4MG562bZPGrQHtV4vnR9DXYx56vYtY2Fyr1GZmWawQoMZsHe9AU
 cVw5HrfeKeHBhk9hi3ZvT9z96ns3YBmIHnYNDeMy+mF/i+cVu///GwgGuFqUqKag
 3eWcTaPh
 =DXVD
 -----END PGP SIGNATURE-----

Merge tag 'sysctl-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl

Pull sysctl updates from Joel Granados:

 - Move jiffies converters out of kernel/sysctl.c

   Move the jiffies converters into kernel/time/jiffies.c and replace
   the pipe-max-size proc_handler converter with a macro based version.
   This is all part of the effort to relocate non-sysctl logic out of
   kernel/sysctl.c into more relevant subsystems. No functional changes.

 - Generalize proc handler converter creation

   Remove duplicated sysctl converter logic by consolidating it in
   macros. These are used inside sysctl core as well as in pipe.c and
   jiffies.c. Converter kernel and user space pointer args are now
    automatically const qualified for the convenience of the caller. No
    functional changes (a generic sketch of this macro approach appears
    just before the commit list below).

 - Miscellaneous

   Fix kernel-doc format warnings, remove unnecessary __user
   qualifiers, and move the nmi_watchdog sysctl into .rodata.

 - Testing

   This series was run through sysctl selftests/kunit test suite in
   x86_64. It went into linux-next after rc2, giving it a good 4/5 weeks
   of testing.

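A generic sketch of the "consolidate converters in macros" idea described
above. The macro and function names here are invented for illustration; the
actual helpers added by this series are the SYSCTL_*_CONV* macros named in
the commits below, whose exact signatures are not reproduced here.

    /*
     * One macro stamps out a family of value converters, so each caller no
     * longer hand-writes the same boilerplate. The pointer that is only
     * read is const qualified, which is the "automatically const qualified"
     * point above.
     */
    #define DEFINE_UINT_CONV(name, expr)                                   \
            static int name(const unsigned int *user_val,                  \
                            unsigned int *kernel_val)                      \
            {                                                              \
                    *kernel_val = (expr);                                  \
                    return 0;                                              \
            }

    /* An identity converter and a scaling converter share the boilerplate. */
    DEFINE_UINT_CONV(conv_identity, *user_val)
    DEFINE_UINT_CONV(conv_kib_to_bytes, *user_val * 1024u)
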
* tag 'sysctl-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/sysctl/sysctl: (21 commits)
  sysctl: Wrap do_proc_douintvec with the public function proc_douintvec_conv
  sysctl: Create pipe-max-size converter using sysctl UINT macros
  sysctl: Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c
  sysctl: Move jiffies converters to kernel/time/jiffies.c
  sysctl: Move UINT converter macros to sysctl header
  sysctl: Move INT converter macros to sysctl header
  sysctl: Allow custom converters from outside sysctl
  sysctl: remove __user qualifier from stack_erasing_sysctl buffer argument
  sysctl: Create macro for user-to-kernel uint converter
  sysctl: Add optional range checking to SYSCTL_UINT_CONV_CUSTOM
  sysctl: Create unsigned int converter using new macro
  sysctl: Add optional range checking to SYSCTL_INT_CONV_CUSTOM
  sysctl: Create integer converters with one macro
  sysctl: Create converter functions with two new macros
  sysctl: Discriminate between kernel and user converter params
  sysctl: Indicate the direction of operation with macro names
  sysctl: Remove superfluous __do_proc_* indirection
  sysctl: Remove superfluous tbl_data param from "dovec" functions
  sysctl: Replace void pointer with const pointer to ctl_table
  sysctl: fix kernel-doc format warning
  ...
2025-12-05 11:15:37 -08:00
Linus Torvalds
d1d36025a6 Probes for v6.19
- fprobe: Performance enhancement of the fprobe using rhltable
   . fprobe: use rhltable for fprobe_ip_table. The fprobe IP table has
     been converted to use an rhltable for improved performance when
     dealing with a large number of probed functions.
   . Fix a suspicious RCU usage warning of the above change in the
     fprobe entry handler.
   . Remove an unused local variable of the above change.
   . Fix to initialize fprobe_ip_table in core_initcall().
 
 - fprobe: Performance optimization of fprobe by ftrace
   . fprobe: Use ftrace instead of fgraph for entry only probes. This
     avoids the unneeded overhead of fgraph stack setup.
   . Also update fprobe selftest for entry-only probe.
   . fprobe: Use ftrace only if CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
     WITH_REGS is defined.
 
 - probes: Cleanup probe event subsystems.
   . uprobe/eprobe: Allocate traceprobe_parse_context once per probe
     instead of once per probe argument parsed. This reduces the
     allocation and freeing of temporary working memory.
   . uprobes: Cleanup code using __free().
   . eprobes: Cleanup code using __free().
   . probes: Cleanup code using __free(trace_probe_log_clear) to clear
     error log automatically.
   . probes: Replace strcpy() with memcpy() in __trace_probe_log_err().
 -----BEGIN PGP SIGNATURE-----
 
 iQFPBAABCgA5FiEEh7BulGwFlgAOi5DV2/sHvwUrPxsFAmkvhSsbHG1hc2FtaS5o
 aXJhbWF0c3VAZ21haWwuY29tAAoJENv7B78FKz8bWhEH/23XM5Msjy5vopB+ECZb
 iCj8SkWrQzfiCBILUqxCkZdfJHFomGPHewxvxIOWdb7evtHuy0Ypne/Uw/TMAtAh
 xvDQmu03IV2jO7h7GExsnEh0nX0upYg4IVmN0sCSSWSfgLLTWO9ICClavV9adcva
 ZR+5TdZbK+W59n+ejxA9OMDt1G+nz1Ls9Qhx9ktf7odkJzBkQGPq/heZuPbF3+6k
 Vj2IHTuqWobDDt+ekKOBRWNh9cS61ybxvsr/vmkT6s904ortP6mZa3zEYPRVOUNG
 WJ/KGJwvExTcaG/Dy2g6q8tam1Bidx9/S6klyOGXQXxvaIT1VtBc66HzAUfso6jg
 yIc=
 =w6Kq
 -----END PGP SIGNATURE-----

Merge tag 'probes-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull probes updates from Masami Hiramatsu:
 "fprobe performance enhancement using rhltable:
   - use rhltable for fprobe_ip_table. The fprobe IP table has been
     converted to use an rhltable for improved performance when dealing
     with a large number of probed functions
   - Fix a suspicious RCU usage warning of the above change in the
     fprobe entry handler
   - Remove an unused local variable of the above change
   - Fix to initialize fprobe_ip_table in core_initcall()

  Performance optimization of fprobe by ftrace:
   - Use ftrace instead of fgraph for entry only probes. This avoids the
     unneeded overhead of fgraph stack setup
   - Also update fprobe selftest for entry-only probe
   - fprobe: Use ftrace only if CONFIG_DYNAMIC_FTRACE_WITH_ARGS or
     WITH_REGS is defined

  Cleanup probe event subsystems:
   - Allocate traceprobe_parse_context once per probe instead of once per
     probe argument parsed. This reduces the allocation and freeing of
     temporary working memory
   - Cleanup code using __free()
   - Replace strcpy() with memcpy() in __trace_probe_log_err()"

* tag 'probes-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  tracing: fprobe: use ftrace if CONFIG_DYNAMIC_FTRACE_WITH_ARGS
  lib/test_fprobe: add testcase for mixed fprobe
  tracing: fprobe: optimization for entry only case
  tracing: fprobe: Fix to init fprobe_ip_table earlier
  tracing: fprobe: Remove unused local variable
  tracing: probes: Replace strcpy() with memcpy() in __trace_probe_log_err()
  tracing: fprobe: fix suspicious rcu usage in fprobe_entry
  tracing: uprobe: eprobes: Allocate traceprobe_parse_context per probe
  tracing: uprobes: Cleanup __trace_uprobe_create() with __free()
  tracing: eprobe: Cleanup eprobe event using __free()
  tracing: probes: Use __free() for trace_probe_log
  tracing: fprobe: use rhltable for fprobe_ip_table
2025-12-05 10:55:47 -08:00
Linus Torvalds
2e8c1c6a50 ktest: Fix for v6.19:
- Fix incorrect variable in error message in config-bisect.pl
 
   If the old config file fails to get copied as the last good or bad
   config file, the program aborts and prints an error message.
   But the variable used to print the old config's name was incorrect:
   it was $config when it should have been $output_config.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaTL5PxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qnWBAQDtZ+UNtbeNR2eHzbcQ1+ENi0aWGwF9
 e93hKyvAWkMgWAEAklDIdstyCaSQQgq3X4ilv1kaG1eu+KSWNnyhmqnyqAM=
 =KPW2
 -----END PGP SIGNATURE-----

Merge tag 'ktest-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest

Pull ktest fix from Steven Rostedt:

 - Fix incorrect variable in error message in config-bisect.pl

   If the old config file fails to get copied as the last good or bad
    config file, the program aborts and prints an error message.

    But the variable used to print the old config's name was incorrect:
    it was $config when it should have been $output_config.

* tag 'ktest-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-ktest:
  ktest.pl: Fix uninitialized var in config-bisect.pl
2025-12-05 10:53:43 -08:00
Linus Torvalds
2ba59045fb - Add helper functions for allocations
The allocation of the per CPU buffer descriptor, the buffer page
   descriptors and the buffer page data itself can be pretty ugly.
   Add some helper macros and a function to have the code that allocates
   buffer pages and such look a little cleaner.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaTL3JxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qvDgAP9HFxPe2EqGspnY0RungWDs3yCxqlUp
 Eqz7SaI9GCXdXgD/TKiz3YjNVxZveeDU6QHWsDl4svoBzjSAsaeTkXD+OQ8=
 =siR0
 -----END PGP SIGNATURE-----

Merge tag 'trace-ringbuffer-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull trace ring-buffer cleanup from Steven Rostedt:

 - Add helper functions for allocations

   The allocation of the per CPU buffer descriptor, the buffer page
   descriptors and the buffer page data itself can be pretty ugly.

   Add some helper macros and a function to have the code that allocates
   buffer pages and such look a little cleaner.

* tag 'trace-ringbuffer-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  ring-buffer: Add helper functions for allocations
2025-12-05 10:50:24 -08:00
Linus Torvalds
0b1b4a3d8e Runtime verifier updates for v6.19:
- Adapt the ftracetest script to be run from a different folder
 
   This uses the already existing OPT_TEST_DIR but extends it further to run
   independent tests, then adds an --rv flag to allow using the script for
   testing RV (mostly) independently of ftrace.
 
 - Add basic RV selftests in selftests/verification for more validations
 
   Add more validations for available/enabled monitors and reactors. This
   could have caught the previously fixed bug that introduced a kernel panic.
   Tests use ftracetest.
 
 - Convert react() function in reactor to use va_list directly
 
   Use a central helper to handle the variadic arguments. Clean up macros
   and mark functions as static.
 
 - Add lockdep annotations to reactors so that lockdep complains if the
   reactors are called from an improper context

   Useful when developing new reactors. This highlights a warning in the
   panic reactor that is related to the printk subsystem and not to RV.
 
 - Convert core RV code to use lock guards and __free helpers
 
   This completely removes goto statements.
 
 - Fix compilation if !CONFIG_RV_REACTORS
 
   Fix the warning by always keeping the LTL monitor variable static.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaTBoVxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qtWpAQDxPQAJQvBZ41l9q9Cis7PqGGezT4Nv
 g6Fh/ydMOlJCsQD/R0Xd5JxPmBI8FLCwCfqHo7wYKUhP8GfL/ORPEWhU2gI=
 =EEot
 -----END PGP SIGNATURE-----

Merge tag 'trace-rv-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull runtime verifier updates from Steven Rostedt:

 - Adapt the ftracetest script to be run from a different folder

    This uses the already existing OPT_TEST_DIR but extends it further to
    run independent tests, then adds an --rv flag to allow using the
    script for testing RV (mostly) independently of ftrace.

 - Add basic RV selftests in selftests/verification for more validations

    Add more validations for available/enabled monitors and reactors.
    This could have caught the previously fixed bug that introduced a
    kernel panic. Tests use ftracetest.

 - Convert react() function in reactor to use va_list directly

   Use a central helper to handle the variadic arguments. Clean up
   macros and mark functions as static.

 - Add lockdep annotations to reactors so that lockdep complains if the
   reactors are called from an improper context

   Useful when developing new reactors. This highlights a warning in the
   panic reactor that is related to the printk subsystem and not to RV.

 - Convert core RV code to use lock guards and __free helpers

   This completely removes goto statements (a generic illustration of the
   cleanup-attribute pattern behind these helpers appears just before the
   commit list below).

 - Fix compilation if !CONFIG_RV_REACTORS

    Fix the warning by always keeping the LTL monitor variable static.

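A generic illustration of the pattern behind the lock-guard and __free()
conversion mentioned above, using the plain compiler cleanup attribute that
the kernel's guard()/__free() helpers are built on. The functions and types
here are illustrative userspace stand-ins, not the RV code itself.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void unlock_mutex(pthread_mutex_t **m) { pthread_mutex_unlock(*m); }
    static void free_buf(char **p) { free(*p); }

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static int do_work(void)
    {
            /* Both resources are released automatically on every return
             * path, which is what lets goto-based unwinding be removed. */
            pthread_mutex_lock(&lock);
            pthread_mutex_t *guard __attribute__((cleanup(unlock_mutex))) = &lock;

            char *buf __attribute__((cleanup(free_buf))) = malloc(64);
            if (!buf)
                    return -1;      /* mutex released by the cleanup handler */

            snprintf(buf, 64, "guarded work done");
            puts(buf);
            return 0;               /* buf freed, then mutex unlocked */
    }

    int main(void) { return do_work() ? 1 : 0; }
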
* tag 'trace-rv-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  rv: Fix compilation if !CONFIG_RV_REACTORS
  rv: Convert to use __free
  rv: Convert to use lock guard
  rv: Add explicit lockdep context for reactors
  rv: Make rv_reacting_on() static
  rv: Pass va_list to reactors
  selftests/verification: Add initial RV tests
  selftest/ftrace: Generalise ftracetest to use with RV
2025-12-05 10:17:00 -08:00
Linus Torvalds
0771cee974 ftrace fixes for v6.19:
- Fix regression of pid filtering of function graph tracer
 
   When the function graph tracer allowed multiple instances of
   graph tracing using subops, the filtering by pid broke.
 
   The ftrace_ops->private that was used for pid filtering wasn't
   updated on creation.
 
   The wrong function entry callback was used when pid filtering was
   enabled when the function graph tracer started, which meant that
   the pid filtering wasn't happening.
 
 - Remove no longer needed ftrace_trace_task()
 
   With PID filtering working via ftrace_pids_enabled() and fgraph_pid_func(),
   the coarse-grained ftrace_trace_task() check in graph_entry() is obsolete.
 
   It was only a fallback for uninitialized op->private (now fixed), and its
   removal ensures consistent PID filtering with standard function tracing.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaS90FhQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qrqMAQDbU53VhvZ6rE0pNvu0Tlk+LDCu3gxg
 F2wisWr65389OgD/VFLTVRjCZh1iY7FFWjAPGRCMbetljmMgK5vpH6XSigA=
 =VKaD
 -----END PGP SIGNATURE-----

Merge tag 'ftrace-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull ftrace updates from Steven Rostedt:

 - Fix regression of pid filtering of function graph tracer

   When the function graph tracer allowed multiple instances of graph
   tracing using subops, the filtering by pid broke.

   The ftrace_ops->private that was used for pid filtering wasn't
   updated on creation.

   The wrong function entry callback was used when pid filtering was
   enabled when the function graph tracer started, which meant that
   the pid filtering wasn't happening.

 - Remove no longer needed ftrace_trace_task()

   With PID filtering working via ftrace_pids_enabled() and
   fgraph_pid_func(), the coarse-grained ftrace_trace_task()
   check in graph_entry() is obsolete.

   It was only a fallback for uninitialized op->private (now fixed),
   and its removal ensures consistent PID filtering with standard
   function tracing.

* tag 'ftrace-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  fgraph: Remove coarse PID filtering from graph_entry()
  fgraph: Check ftrace_pids_enabled on registration for early filtering
  fgraph: Initialize ftrace_ops->private for function graph ops
2025-12-05 10:13:04 -08:00
Linus Torvalds
69c5079b49 tracing updates for v6.19:
- Merge branch shared with kprobes on extending trace options
 
   The trace options were defined by a 32 bit variable. This limits the
   tracing instances to have a total of 32 different options. As that limit
   has been hit, and more options are being added, increase the option mask
   to a 64 bit number, doubling the number of options available.
 
   As this is required for the kprobe topic branches as well as the tracing
   topic branch, a separate branch was created and merged into both.
 
 - Make trace_user_fault_read() available for the rest of tracing
 
   The function trace_user_fault_read() is used by trace_marker file read to
   allow reading user space to be done fast and without locking or
   allocations. Make this available so that the system call trace events can
   use it too.
 
 - Have system call trace events read user space values
 
   Now that the system call trace events callbacks are called in a faultable
   context, take advantage of this and read the user space buffers for
   various system calls. For example, show the path name of the openat system
   call instead of just showing the pointer to that path name in user space.
   Also show the contents of the buffer of the write system call. Several
   system call trace events are updated to make tracing into a light weight
   strace tool for all applications in the system.
 
 - Update perf system call tracing to do the same
 
 - And add a config option and a syscall_user_buf_size file to control the size of the buffer
 
   Limit the amount of data that can be read from user space. The default
   size is 63 bytes but that can be expanded to 165 bytes.
 
 - Allow the persistent ring buffer to print system calls normally
 
   The persistent ring buffer prints trace events by their type and ignores
   the print_fmt. This is because the print_fmt may change from kernel to
   kernel. As the system call output is fixed by the system call ABI itself,
   there's no reason to limit that. This makes reading the system call events
   in the persistent ring buffer much nicer and easier to understand.
 
 - Add options to show text offset to function profiler
 
   The function profiler that counts the number of times a function is hit
   currently lists all functions by their name and offset. But this becomes
   ambiguous when there are several functions with the same name. Add a
   tracing option that changes the output to be that of _text+offset
   instead. Now a user space tool can use this information to map the
   _text+offset to the unique function it is counting.
 
 - Report bad dynamic event command
 
   If a bad command is passed to the dynamic_events file, report it properly
   in the error log.
 
 - Clean up tracer options
 
   Clean up the tracer option code a bit, by removing some useless code and
   also using switch statements instead of a series of if statements.
 
 - Have tracing options be instance specific
 
   Tracers can have their own options (function tracer, irqsoff tracer,
   function graph tracer, etc). But now that the same tracer can be enabled
   in multiple trace instances, their options are still global. The API is
   per instance, thus changing one affects other instances. This isn't even
   consistent, as the options take effect differently depending on when a
   tracer started in an instance.  Make the options for instances only affect
   the instance they are changed under.
 
 - Optimize pid_list lock contention
 
   Whenever the pid_list is read, it uses a spin lock. This happens at every
   sched switch. Taking the lock at sched switch can be removed by instead
   using a seqlock counter.
 
 - Clean up the trace trigger structures
 
   The trigger code uses two different structures to implement a single
   trigger. This was due to trying to reuse code for the two different types
   of triggers (always on trigger, and count limited trigger). But by adding
   a single field to one structure, the other structure could be absorbed
   into the first structure, making the code easier to understand.
 
 - Create a bulk garbage collector for trace triggers
 
   If user space has triggers for several hundreds of events and then removes
   them, it can take several seconds to complete. This is because each
   removal calls the slow tracepoint_synchronize_unregister() that can take
   hundreds of milliseconds to complete. Instead, create a helper thread that
   will do the clean up. When a trigger is removed, it will create the
   kthread if it isn't already created, and then add the trigger to a llist.
   The kthread will take the items off the llist, call
   tracepoint_synchronize_unregister(), and then remove the items it took
   off. It will then check if there are more items to free before sleeping.
 
   This lets user space remove all of these triggers in less than a
   second.
 
 - Allow function tracing of some of the tracing infrastructure code
 
   Because the tracing code can cause recursion issues if it is traced by the
   function tracer the entire tracing directory disables function tracing.
   But not all of tracing causes issues if it is traced. Namely, the event
   tracing code. Add a config that enables some of the tracing code to be
   traced to help in debugging it. Note, when this is enabled, it does add
   noise to general function tracing, especially if events are enabled as
   well (which is a common case).
 
 - Add boot-time backup instance for persistent buffer
 
   The persistent ring buffer is used mostly for kernel crash analysis in the
   field. One issue is that if there's a crash, the data in the persistent
   ring buffer must be read before tracing can begin using it. This slows
   down the boot process. Once tracing starts in the persistent ring buffer,
   the old data must be freed and the addresses no longer match and old
   events can't be in the buffer with new events.
 
   Create a way to create a backup buffer that copies the persistent ring
   buffer at boot up. Then after a crash, the always on tracer can begin
   immediately as well as the normal boot process while the crash analysis
   tooling uses the backup buffer. After the backup buffer is finished being
   read, it can be removed.
 
 - Enable function graph args and return address options at the same time
 
   Currently, when reading of arguments in the function graph tracer is
   enabled, the option to record the parent function in the entry event can
   not be enabled. Update the code so that it can.
 
 - Add new struct_offset() helper macro
 
   Add a new macro that takes a pointer to a structure and a name of one of
   its members and it will return the offset of that member. This allows the
   ring buffer code to simplify the following:
 
   From:  size = struct_size(entry, buf, cnt - sizeof(entry->id));
     To:  size = struct_offset(entry, id) + cnt;
 
   There should be other simplifications that this macro can help out with as
   well.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaS9xqxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qj6tAQD4MR1lsE3XpH09asO4CDDfhbtRSQVD
 o8bVKVihWx/j5gD/XezjqE2Q2+DO6dhnsQY6pbtNdXoKgaMuQJGA+dvPsQc=
 =HilC
 -----END PGP SIGNATURE-----

Merge tag 'trace-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing updates from Steven Rostedt:

 - Extend tracing option mask to 64 bits

   The trace options were defined by a 32 bit variable. This limits the
   tracing instances to have a total of 32 different options. As that
   limit has been hit, and more options are being added, increase the
   option mask to a 64 bit number, doubling the number of options
   available.

   As this is required for the kprobe topic branches as well as the
   tracing topic branch, a separate branch was created and merged into
   both.

 - Make trace_user_fault_read() available for the rest of tracing

   The function trace_user_fault_read() is used by trace_marker file
   read to allow reading user space to be done fast and without locking
   or allocations. Make this available so that the system call trace
   events can use it too.

 - Have system call trace events read user space values

   Now that the system call trace events callbacks are called in a
   faultable context, take advantage of this and read the user space
   buffers for various system calls. For example, show the path name of
   the openat system call instead of just showing the pointer to that
   path name in user space. Also show the contents of the buffer of the
   write system call. Several system call trace events are updated to
   make tracing into a light weight strace tool for all applications in
   the system.

 - Update perf system call tracing to do the same

 - And add a config option and a syscall_user_buf_size file to control
   the size of the buffer

   Limit the amount of data that can be read from user space. The
   default size is 63 bytes but that can be expanded to 165 bytes.

 - Allow the persistent ring buffer to print system calls normally

   The persistent ring buffer prints trace events by their type and
   ignores the print_fmt. This is because the print_fmt may change from
   kernel to kernel. As the system call output is fixed by the system
   call ABI itself, there's no reason to limit that. This makes reading
   the system call events in the persistent ring buffer much nicer and
   easier to understand.

 - Add options to show text offset to function profiler

   The function profiler that counts the number of times a function is
    hit currently lists all functions by their name and offset. But this
   becomes ambiguous when there are several functions with the same
   name.

   Add a tracing option that changes the output to be that of
   '_text+offset' instead. Now a user space tool can use this
   information to map the '_text+offset' to the unique function it is
   counting.

 - Report bad dynamic event command

   If a bad command is passed to the dynamic_events file, report it
   properly in the error log.

 - Clean up tracer options

   Clean up the tracer option code a bit, by removing some useless code
   and also using switch statements instead of a series of if
   statements.

 - Have tracing options be instance specific

   Tracers can have their own options (function tracer, irqsoff tracer,
   function graph tracer, etc). But now that the same tracer can be
   enabled in multiple trace instances, their options are still global.
   The API is per instance, thus changing one affects other instances.
    This isn't even consistent, as the options take effect differently
    depending on when a tracer started in an instance. Make the options
    for instances only affect the instance they are changed under.

 - Optimize pid_list lock contention

   Whenever the pid_list is read, it uses a spin lock. This happens at
   every sched switch. Taking the lock at sched switch can be removed by
    instead using a seqlock counter (a generic userspace sketch of the
    seqlock read/retry pattern appears just before the commit list below).

 - Clean up the trace trigger structures

   The trigger code uses two different structures to implement a single
    trigger. This was due to trying to reuse code for the two different
   types of triggers (always on trigger, and count limited trigger). But
   by adding a single field to one structure, the other structure could
    be absorbed into the first structure, making the code easier to
   understand.

 - Create a bulk garbage collector for trace triggers

   If user space has triggers for several hundreds of events and then
   removes them, it can take several seconds to complete. This is
   because each removal calls tracepoint_synchronize_unregister() that
   can take hundreds of milliseconds to complete.

   Instead, create a helper thread that will do the clean up. When a
   trigger is removed, it will create the kthread if it isn't already
   created, and then add the trigger to a llist. The kthread will take
   the items off the llist, call tracepoint_synchronize_unregister(),
    and then remove the items it took off. It will then check if there
    are more items to free before sleeping.

    This lets user space remove all of these triggers in less than a
    second (a generic userspace sketch of this batching idea appears just
    before the commit list below).

 - Allow function tracing of some of the tracing infrastructure code

   Because the tracing code can cause recursion issues if it is traced
    by the function tracer, the entire tracing directory disables function
   tracing. But not all of tracing causes issues if it is traced.
   Namely, the event tracing code. Add a config that enables some of the
   tracing code to be traced to help in debugging it. Note, when this is
   enabled, it does add noise to general function tracing, especially if
   events are enabled as well (which is a common case).

 - Add boot-time backup instance for persistent buffer

   The persistent ring buffer is used mostly for kernel crash analysis
   in the field. One issue is that if there's a crash, the data in the
   persistent ring buffer must be read before tracing can begin using
   it. This slows down the boot process. Once tracing starts in the
    persistent ring buffer, the old data must be freed, as the addresses
   no longer match and old events can't be in the buffer with new
   events.

   Create a way to create a backup buffer that copies the persistent
   ring buffer at boot up. Then after a crash, the always on tracer can
   begin immediately as well as the normal boot process while the crash
   analysis tooling uses the backup buffer. After the backup buffer is
   finished being read, it can be removed.

 - Enable function graph args and return address options at the same
   time

    Currently, when reading of arguments in the function graph tracer
   is enabled, the option to record the parent function in the entry
   event can not be enabled. Update the code so that it can.

 - Add new struct_offset() helper macro

   Add a new macro that takes a pointer to a structure and a name of one
   of its members and it will return the offset of that member. This
   allows the ring buffer code to simplify the following:

   From:  size = struct_size(entry, buf, cnt - sizeof(entry->id));
     To:  size = struct_offset(entry, id) + cnt;

    There should be other simplifications that this macro can help out
    with as well (a hedged sketch of one possible definition appears just
    before the commit list below)

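On the pid_list lock-contention item above, here is a minimal userspace
sketch of the seqlock idea: readers sample a sequence counter, read the data
locklessly, and retry if a writer was active in between. The structure and
field names are hypothetical, not the kernel's trace_pid_list, and the real
code uses the kernel's seqcount primitives with proper memory ordering.

    #include <stdatomic.h>
    #include <stdbool.h>

    #define MAX_PIDS 32768

    struct pid_set {
            atomic_uint seq;        /* odd value means a writer is active */
            unsigned long bits[MAX_PIDS / (8 * sizeof(unsigned long))];
    };

    /* Read side (the sched-switch hot path): no lock, just sample/retry. */
    static bool pid_set_test(struct pid_set *s, unsigned int pid)
    {
            unsigned int start;
            bool ret;

            do {
                    while ((start = atomic_load(&s->seq)) & 1)
                            ;       /* writer in progress, wait it out */
                    ret = (s->bits[pid / (8 * sizeof(unsigned long))]
                           >> (pid % (8 * sizeof(unsigned long)))) & 1;
            } while (atomic_load(&s->seq) != start);

            return ret;
    }

    /* Write side (rare): bump the counter around the update. */
    static void pid_set_add(struct pid_set *s, unsigned int pid)
    {
            atomic_fetch_add(&s->seq, 1);           /* counter becomes odd */
            s->bits[pid / (8 * sizeof(unsigned long))] |=
                    1UL << (pid % (8 * sizeof(unsigned long)));
            atomic_fetch_add(&s->seq, 1);           /* counter becomes even */
    }
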
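On the bulk trigger garbage-collection item above, here is a minimal
userspace sketch of the batching idea: producers push items to free onto a
lock-free list, and a collector detaches the whole list at once so the
expensive synchronization step is paid once per batch rather than once per
item. The names and callbacks are hypothetical, not the tracing code's.

    #include <stdatomic.h>
    #include <stddef.h>

    struct gc_node {
            struct gc_node *next;
            /* the trigger data to be freed would hang off this node */
    };

    static _Atomic(struct gc_node *) gc_list;

    /* Producer side: O(1) lock-free push, cheap enough for any caller. */
    static void gc_queue(struct gc_node *n)
    {
            struct gc_node *old = atomic_load(&gc_list);

            do {
                    n->next = old;
            } while (!atomic_compare_exchange_weak(&gc_list, &old, n));
    }

    /*
     * Collector thread: detach the whole batch in one atomic exchange, pay
     * the expensive synchronization (tracepoint_synchronize_unregister()
     * in the kernel) once per batch, then free every queued item.
     */
    static void gc_drain(void (*synchronize)(void),
                         void (*free_item)(struct gc_node *))
    {
            struct gc_node *batch = atomic_exchange(&gc_list, NULL);

            if (!batch)
                    return;

            synchronize();
            while (batch) {
                    struct gc_node *next = batch->next;

                    free_item(batch);
                    batch = next;
            }
    }
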
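On the struct_offset() item above, here is one plausible way such a macro
could be built on top of offsetof() and typeof(); this is a hedged sketch,
not necessarily the definition that landed in overflow.h, and the example
struct layout is invented for illustration.

    #include <stddef.h>             /* offsetof(); typeof() is GCC/Clang/C23 */

    /*
     * Offset of 'member' within the struct that 'ptr' points to. Like
     * struct_size(), it takes a pointer rather than spelling the type name.
     */
    #define struct_offset(ptr, member)  offsetof(typeof(*(ptr)), member)

    struct raw_data_entry {         /* hypothetical layout for illustration */
            unsigned short type;
            unsigned int   id;
            char           buf[];   /* 'cnt' bytes of payload follow 'id' */
    };

    static size_t event_size(struct raw_data_entry *entry, size_t cnt)
    {
            /* With this layout this equals the struct_size() form quoted
             * above: offsetof(buf) + (cnt - sizeof(entry->id)). */
            return struct_offset(entry, id) + cnt;
    }
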
* tag 'trace-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (42 commits)
  overflow: Introduce struct_offset() to get offset of member
  function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously
  tracing: Add boot-time backup of persistent ring buffer
  ftrace: Allow tracing of some of the tracing code
  tracing: Use strim() in trigger_process_regex() instead of skip_spaces()
  tracing: Add bulk garbage collection of freeing event_trigger_data
  tracing: Remove unneeded event_mutex lock in event_trigger_regex_release()
  tracing: Merge struct event_trigger_ops into struct event_command
  tracing: Remove get_trigger_ops() and add count_func() from trigger ops
  tracing: Show the tracer options in boot-time created instance
  ftrace: Avoid redundant initialization in register_ftrace_direct
  tracing: Remove unused variable in tracing_trace_options_show()
  fgraph: Make fgraph_no_sleep_time signed
  tracing: Convert function graph set_flags() to use a switch() statement
  tracing: Have function graph tracer option sleep-time be per instance
  tracing: Move graph-time out of function graph options
  tracing: Have function graph tracer option funcgraph-irqs be per instance
  trace/pid_list: optimize pid_list->lock contention
  tracing: Have function graph tracer define options per instance
  tracing: Have function tracer define options per instance
  ...
2025-12-05 09:51:37 -08:00
Linus Torvalds
36492b7141 Detect unused tracepoints for v6.19:
If a tracepoint is defined but never used (TRACE_EVENT() created but no
 trace_<tracepoint>() called), it can take 5K or more of memory
 each. This can add up as there are around a hundred unused tracepoints with
 various configs. That is 500K of wasted memory.
 
 Add a make build parameter of "UT=1" to have the build warn if an unused
 tracepoint is detected in the build. This allows detection of unused
 tracepoints to be upstream so that outreachy and the mentoring project can
 have new developers look for fixing them, without having these warnings
 suddenly show up when someone upgrades their kernel. When all known unused
 tracepoints are removed, then the "UT=1" build parameter can be removed and
 unused tracepoints will always warn. This will catch new unused tracepoints
 after the current ones have been removed.
 
 - Separate out elf functions from sorttable.c
 
   Move out the ELF parsing functions from sorttable.c so that the tracing
   tooling can use it.
 
 - Add a tracepoint verifier tool to the build process
 
   If "UT=1" is added to the kernel command line, any unused tracepoints will
   trigger a warning at build time.
 
 - Do not warn about unused tracepoints for tracepoints that are exported
 
 There are several cases where a tracepoint is created by the kernel and used
 by modules. Since there's no easy way to detect whether these are truly
 unused when the users are in modules, if a tracepoint is exported, assume it
 will eventually be used by a module. Note, there are not many exported
 tracepoints, so ignoring them should not be a problem.
 
 - Have building of modules also detect unused tracepoints
 
   Do not only check the main vmlinux for unused tracepoints, also check
   modules. If a module is defining a tracepoint it should be using it.
 
 - Add the tracepoint-update program to the ignore file
 
   The new tracepoint-update program needs to be ignored by git.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaS9iLxQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qk4mAP96T/IPPjox1Fd7r/Dpm+JNfYom8AZ8
 WGNL06+aEKRWZwEAqc+u/9k3r964k+pKQ7qwL3ZslG2ALSOdKbFXHpsPpw8=
 =R/qK
 -----END PGP SIGNATURE-----

Merge tag 'tracepoints-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull unused tracepoints update from Steven Rostedt:
 "Detect unused tracepoints.

  If a tracepoint is defined but never used (TRACE_EVENT() created but
   no trace_<tracepoint>() called), it can take 5K or more of
  memory each. This can add up as there are around a hundred unused
  tracepoints with various configs. That is 500K of wasted memory.

  Add a make build parameter of "UT=1" to have the build warn if an
  unused tracepoint is detected in the build. This allows detection of
  unused tracepoints to be upstream so that outreachy and the mentoring
  project can have new developers look for fixing them, without having
  these warnings suddenly show up when someone upgrades their kernel.

  When all known unused tracepoints are removed, then the "UT=1" build
  parameter can be removed and unused tracepoints will always warn. This
  will catch new unused tracepoints after the current ones have been
  removed.

  Summary:

   - Separate out elf functions from sorttable.c

     Move out the ELF parsing functions from sorttable.c so that the
     tracing tooling can use it.

   - Add a tracepoint verifier tool to the build process

     If "UT=1" is added to the kernel command line, any unused
     tracepoints will trigger a warning at build time.

   - Do not warn about unused tracepoints for tracepoints that are
     exported

      There are several cases where a tracepoint is created by the kernel
      and used by modules. Since there's no easy way to detect whether
      these are truly unused when the users are in modules, if a
      tracepoint is exported, assume it will eventually be used by a
      module. Note, there are not many exported tracepoints, so ignoring
      them should not be a problem.

   - Have building of modules also detect unused tracepoints

     Do not only check the main vmlinux for unused tracepoints, also
     check modules. If a module is defining a tracepoint it should be
     using it.

   - Add the tracepoint-update program to the ignore file

     The new tracepoint-update program needs to be ignored by git"

* tag 'tracepoints-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  scripts: add tracepoint-update to the list of ignores files
  tracing: Add warnings for unused tracepoints for modules
  tracing: Allow tracepoint-update.c to work with modules
  tracepoint: Do not warn for unused event that is exported
  tracing: Add a tracepoint verification check at build time
  sorttable: Move ELF parsing into scripts/elf-parse.[ch]
2025-12-05 09:37:41 -08:00
Linus Torvalds
5779de8d36 rtla updates for v6.19:
- Officially add Tomas Glozar as a maintainer to RTLA tool
 
 - Add for_each_monitored_cpu() helper
 
   In multiple places, RTLA tools iterate over the list of CPUs running
   tracer threads.
 
   Use a single helper instead of repeating the for/if combination.
 
 - Remove unused variable option_index in argument parsing
 
   RTLA tools use getopt_long() for argument parsing. For its last
   argument, an unused variable "option_index" is passed.
 
   Remove the variable and pass NULL to getopt_long() to shorten
   the naturally long parsing functions, and make them more readable.
 
 - Fix unassigned nr_cpus after code consolidation
 
   In recent code consolidation, timerlat tool cleanup, previously
   implemented separately for each tool, was moved to a common function
   timerlat_free().
 
   The cleanup relies on nr_cpus being set. This was not done in the new
   function, leaving the variable uninitialized.
 
   Initialize the variable properly, and remove silencing of compiler
   warning for uninitialized variables.
 
 - Stop tracing on user latency in BPF mode
 
   Despite the name, rtla-timerlat's -T/--thread option sets timerlat's
   stop_tracing_total_us option, which also stops tracing on
   return-from-user latency, not only on thread latency.
 
   Implement the same behavior also in BPF sample collection stop tracing
   handler to avoid a discrepancy and restore correspondence of behavior
   with the equivalent option of cyclictest.
 
 - Fix threshold actions always triggering
 
   A bug in threshold action logic caused the action to execute even
   if tracing did not stop because of threshold.
 
   Fix the logic to stop correctly.
 
 - Fix a few minor issues in tests
 
   Extend tests that were shown to need it to 5s, fix osnoise test
   calling timerlat by mistake, and use new, more reliable output
   checking in timerlat's "top stop at failed action" test.
 
 - Do not print usage on argument parsing error
 
   RTLA prints the entire usage message on encountering errors in
   argument parsing, like a malformed CPU list.
 
   The usage message has gotten too long. Instead of printing it,
   use newly added fatal() helper function to simply exit with
   the error message, excluding the usage.
 
 - Fix unintuitive -C/--cgroup interface
 
   "-C cgroup" and "--cgroup cgroup" are invalid syntax, despite that
   being a common way to specify an option with an argument. Moreover,
   using them fails silently and no cgroup is set.
 
   Create new helper function to unify the handling of all such options
   and allow all of:
 
   -Xsomething
   -X=something
   -X something
 
   as well as the equivalent for the long option.
 
 - Fix -a overriding -t argument filename
 
   Fix a bug where -a following -t custom_file.txt overrides the custom
   filename with the default timerlat_trace.txt.
 
 - Stop tracing correctly on multiple events at once
 
   In some race scenarios, RTLA BPF sample collection might send multiple
   stop tracing events via the BPF ringbuffer at once.
 
   Check that the number of events is != 0 instead of == 1 to cover
   this scenario and stop tracing properly.
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaS9bxBQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qhrgAP0a/AtsL9+IFXAK5JK8aO1XWApVyK9n
 48FRZWu/jrupuAD7BO+EHazmPEourNaUqYPeuymwxT+4O47RH1Q/aasLQwo=
 =RvNH
 -----END PGP SIGNATURE-----

Merge tag 'trace-tools-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull rtla trace tooling updates from Steven Rostedt:

 - Officially add Tomas Glozar as a maintainer to RTLA tool

 - Add for_each_monitored_cpu() helper

   In multiple places, RTLA tools iterate over the list of CPUs running
   tracer threads.

   Use a single helper instead of repeating the for/if combination.

 - Remove unused variable option_index in argument parsing

   RTLA tools use getopt_long() for argument parsing. For its last
   argument, an unused variable "option_index" is passed.

   Remove the variable and pass NULL to getopt_long() to shorten the
   naturally long parsing functions, and make them more readable.

 - Fix unassigned nr_cpus after code consolidation

   In recent code consolidation, timerlat tool cleanup, previously
   implemented separately for each tool, was moved to a common function
   timerlat_free().

   The cleanup relies on nr_cpus being set. This was not done in the new
   function, leaving the variable uninitialized.

   Initialize the variable properly, and remove silencing of compiler
   warning for uninitialized variables.

 - Stop tracing on user latency in BPF mode

   Despite the name, rtla-timerlat's -T/--thread option sets timerlat's
   stop_tracing_total_us option, which also stops tracing on
   return-from-user latency, not only on thread latency.

   Implement the same behavior also in BPF sample collection stop
   tracing handler to avoid a discrepancy and restore correspondence of
   behavior with the equivalent option of cyclictest.

 - Fix threshold actions always triggering

   A bug in threshold action logic caused the action to execute even if
   tracing did not stop because of threshold.

   Fix the logic to stop correctly.

 - Fix a few minor issues in tests

   Extend tests that were shown to need it to 5s, fix osnoise test
   calling timerlat by mistake, and use new, more reliable output
   checking in timerlat's "top stop at failed action" test.

 - Do not print usage on argument parsing error

   RTLA prints the entire usage message on encountering errors in
   argument parsing, like a malformed CPU list.

   The usage message has gotten too long. Instead of printing it, use
   the newly added fatal() helper function to simply exit with the error
   message, excluding the usage.

 - Fix unintuitive -C/--cgroup interface

   "-C cgroup" and "--cgroup cgroup" are invalid syntax, despite that
   being a common way to specify an option with an argument. Moreover,
   using them fails silently and no cgroup is set.

   Create a new helper function to unify the handling of all such options
   and allow all of the following (see the sketch after this bullet list):

     -Xsomething
     -X=something
     -X something

   as well as the equivalent for the long option.

 - Fix -a overriding -t argument filename

   Fix a bug where -a following -t custom_file.txt overrides the custom
   filename with the default timerlat_trace.txt.

 - Stop tracing correctly on multiple events at once

   In some race scenarios, RTLA BPF sample collection might send
   multiple stop tracing events via the BPF ringbuffer at once.

   Check that the number of events is != 0 instead of == 1 to cover
   this scenario and stop tracing properly.
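
 For illustration, a helper along these lines can accept all three
 spellings of an option argument (a hedged sketch with hypothetical names,
 not the actual rtla code; it assumes the option was declared with
 optional_argument, so getopt_long() only fills optarg for the attached
 forms):

     #include <getopt.h>

     /* Return the argument of the option getopt_long() just parsed:
      * attached forms ("-Xarg", "-X=arg", "--opt=arg") come via optarg,
      * the detached form ("-X arg") by consuming the next argv word. */
     static const char *option_argument(int argc, char **argv)
     {
             if (optarg)
                     return optarg[0] == '=' ? optarg + 1 : optarg;
             if (optind < argc && argv[optind][0] != '-')
                     return argv[optind++];
             return NULL;
     }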

* tag 'trace-tools-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  rtla/timerlat: Exit top main loop on any non-zero wait_retval
  rtla/tests: Don't rely on matching ^1ALL
  rtla: Fix -a overriding -t argument
  rtla: Fix -C/--cgroup interface
  tools/rtla: Replace osnoise_hist_usage("...") with fatal("...")
  tools/rtla: Replace osnoise_top_usage("...") with fatal("...")
  tools/rtla: Replace timerlat_hist_usage("...") with fatal("...")
  tools/rtla: Replace timerlat_top_usage("...") with fatal("...")
  tools/rtla: Add fatal() and replace error handling pattern
  rtla/tests: Fix osnoise test calling timerlat
  rtla/tests: Extend action tests to 5s
  tools/rtla: Fix --on-threshold always triggering
  rtla/timerlat_bpf: Stop tracing on user latency
  tools/rtla: Fix unassigned nr_cpus
  tools/rtla: Remove unused optional option_index
  tools/rtla: Add for_each_monitored_cpu() helper
  MAINTAINERS: Add Tomas Glozar as a maintainer to RTLA tool
2025-12-05 09:34:01 -08:00
Linus Torvalds
ed1b409137 hardening updates for v6.19-rc1
- string: Add missing kernel-doc return descriptions (Kriish Sharma)
 
 - Update some mis-typed allocations
 
 - Enable GCC diagnostic context for value-tracking warnings
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRSPkdeREjth1dHnSE2KwveOeQkuwUCaS9E5QAKCRA2KwveOeQk
 u5lYAQDEXFBD3+X+k9LNuPS/FLpz5sEI0SOI4lD8xDEjhtmygAD+LVV8yRf6ajPA
 5O2f4hbKnP5+4XHwSiG+CV7QpAgHHwo=
 =6GEw
 -----END PGP SIGNATURE-----

Merge tag 'hardening-v6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull hardening updates from Kees Cook:

 - string: Add missing kernel-doc return descriptions (Kriish Sharma)

 - Update some mis-typed allocations

   These correct some accidentally wrong types used in allocations (that
   didn't affect the resulting size) that never got picked up from the
   batch I sent a few months ago.

 - Enable GCC diagnostic context for value-tracking warnings

   This results in better GCC diagnostics for the value range tracking,
   so we can get better visibility into where those values are coming
   from when we get out-of-bounds warnings at compile time.

* tag 'hardening-v6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  kbuild: Enable GCC diagnostic context for value-tracking warnings
  string: Add missing kernel-doc return descriptions
  media: iris: Cast iris_hfi_gen2_get_instance() allocation type
  drm/plane: Remove const qualifier from plane->modifiers allocation type
  comedi: Adjust range_table_list allocation type
2025-12-05 09:11:02 -08:00
Linus Torvalds
3ee37abbbd pstore update for v6.19-rc1
- pstore/ram: Update module parameters from platform data (Tzung-Bi Shih)
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRSPkdeREjth1dHnSE2KwveOeQkuwUCaS9EPwAKCRA2KwveOeQk
 uyEDAQDZTrI547b000g8gjAWpQpQWB32wkNZOP4ANafAGQXLDwD9G8YxKQFRX+HJ
 mfkvgK8JqOiPjsnvbPJv/F2VdBaQHAk=
 =FDkt
 -----END PGP SIGNATURE-----

Merge tag 'pstore-v6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull pstore update from Kees Cook:

 - pstore/ram: Update module parameters from platform data (Tzung-Bi Shih)

* tag 'pstore-v6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  pstore/ram: Update module parameters from platform data
2025-12-05 09:08:13 -08:00
Linus Torvalds
5d45c729ed configfs changes for v6.19
-----BEGIN PGP SIGNATURE-----
 
 iQJKBAABCgA0FiEEEsH5R1a/fCoV1sAS4bgaPnkoY3cFAmku+SoWHGEuaGluZGJv
 cmdAa2VybmVsLm9yZwAKCRDhuBo+eShjd5XpEACgVpU27v59kWhw+vIPQeNfrCzo
 L8Is/w4a0IkBbaeLvp7JhjshQ9ViEuWFJE7un9RVxOm0YC9sisGZYAfNks71nFYE
 /AAwocY4MrCLcDHwtRm5N2eykz6DtpFKorl6Go2WEUN5ye5ulDO7cpMTG5knKKb2
 CiaifWx/oRqsqQxdeA53c8fnlGKJ/R0KJXDwslj2G4tGnyz1YwbrpQdT3QNYjSWb
 hxyoet36Mprylrrfn9LBnGJuCwCXmAcxWAn/vtGwr7SDoL0o4XhcVfcnHblclDvr
 ZVcKWKpesLfqNjBeO0GsBMabPKn6wZvscvtPNh09x57MxfYLPLXnsD+rMmeMOd0P
 X3Wv7aJjvBWsgSOONVI2gRizNilYq8lnzE1BdPMenlxDk8DIh2blGeA2SzZmpvM/
 8tkpv7x/hYKXmcxGyaUPcYIqMCnXkbVHDGI05DJLCRttF21XIDPpQBuVpApMETzD
 nBAZO7sVatprmEi/+n4C8rCu7B5VuSSFW5Q/eO2QeVJmXIoPPG86b78sedIzIRHO
 rH9ox3HWgDTwi4GuBJmf7Qn8/lurS6QncVZCI91cXUnOhf1juACSRMVOOvCP9lio
 M7Dmd3U2QkkkE6h/TwM2o7wBBrzlCLRPL7TeF9ft8m29g+SU9jR8N0/fo2yJUA4F
 EZ9xxiwzNLkLTkqBtg==
 =TV16
 -----END PGP SIGNATURE-----

Merge tag 'configfs-for-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/a.hindborg/linux

Pull configfs updates from Andreas Hindborg:
 "Two commits changing constness of the configfs vtable pointers. We
  plan to follow up with changes at call sites down the road"

* tag 'configfs-for-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/a.hindborg/linux:
  configfs: Constify ct_item_ops in struct config_item_type
  configfs: Constify ct_group_ops in struct config_item_type
2025-12-05 08:59:41 -08:00
Steven Rostedt
d3042cbe84 ktest.pl: Fix uninitialized var in config-bisect.pl
The error path of copying the old config used the wrong variable in the
error message:

 $ mkdir /tmp/build
 $ ./tools/testing/ktest/config-bisect.pl -b /tmp/build config-good /tmp/config-bad
 $ chmod 0 /tmp/build
 $ ./tools/testing/ktest/config-bisect.pl -b /tmp/build config-good /tmp/config-bad good
 cp /tmp/build//.config config-good.tmp ... [0 seconds] FAILED!
 Use of uninitialized value $config in concatenation (.) or string at ./tools/testing/ktest/config-bisect.pl line 744.
 failed to copy  to config-good.tmp

When it should have shown:

 failed to copy /tmp/build//.config to config-good.tmp

Cc: stable@vger.kernel.org
Cc: John 'Warthog9' Hawley <warthog9@kernel.org>
Fixes: 0f0db065999cf ("ktest: Add standalone config-bisect.pl program")
Link: https://patch.msgid.link/20251203180924.6862bd26@gandalf.local.home
Reported-by: "John W. Krahn" <jwkrahn@shaw.ca>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2025-12-03 18:25:18 -05:00
Steven Rostedt
b1e7a590a0 ring-buffer: Add helper functions for allocations
The allocation of the per CPU buffer descriptor, the buffer page
descriptors and the buffer page data itself can be pretty ugly:

  kzalloc_node(ALIGN(sizeof(struct buffer_page), cache_line_size()),
               GFP_KERNEL, cpu_to_node(cpu));

And the data pages:

  page = alloc_pages_node(cpu_to_node(cpu),
                          GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO, order);
  if (!page)
	return NULL;
  bpage->page = page_address(page);
  rb_init_page(bpage->page);

Add helper functions to make the code easier to read.

This does make all allocations of the data page (bpage->page) allocated
with the __GFP_RETRY_MAYFAIL flag (and not just the bulk allocator). Which
is actually better, as allocating the data page for the ring buffer tracing
should try hard but not trigger the OOM killer.
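
As a rough sketch of the shape such helpers can take (the names here are
illustrative, not necessarily the ones used by this patch; the caller still
initializes the returned data page with rb_init_page() as before):

  static struct buffer_page *rb_alloc_buffer_page(int cpu)
  {
          return kzalloc_node(ALIGN(sizeof(struct buffer_page), cache_line_size()),
                              GFP_KERNEL, cpu_to_node(cpu));
  }

  static void *rb_alloc_page_data(int cpu, int order)
  {
          struct page *page;

          page = alloc_pages_node(cpu_to_node(cpu),
                                  GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO,
                                  order);
          return page ? page_address(page) : NULL;
  }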

Link: https://lore.kernel.org/all/CAHk-=wjMMSAaqTjBSfYenfuzE1bMjLj+2DLtLWJuGt07UGCH_Q@mail.gmail.com/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251125121153.35c07461@gandalf.local.home
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-02 15:49:35 -05:00
Gabriele Monaco
bbaacdc339 rv: Fix compilation if !CONFIG_RV_REACTORS
The kernel test robot spotted a compilation error if reactors are
disabled.

Fix the warning by keeping the LTL monitor variable always static.

Cc: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251113150618.185479-2-gmonaco@redhat.com
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511131948.vxi5mdjU-lkp@intel.com/
Fixes: 4f739ed19d22 ("rv: Pass va_list to reactors")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-02 12:33:37 -05:00
Nam Cao
b30f635bb6 rv: Convert to use __free
Convert to use __free to tidy up the code.
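
For illustration, the general __free() pattern looks like this (a generic
example, not the rv code itself):

  #include <linux/cleanup.h>
  #include <linux/slab.h>

  static int example(void)
  {
          /* freed automatically with kfree() when 'buf' goes out of scope */
          char *buf __free(kfree) = kzalloc(64, GFP_KERNEL);

          if (!buf)
                  return -ENOMEM;

          /* use buf; no explicit kfree() on any return path */
          return 0;
  }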

Signed-off-by: Nam Cao <namcao@linutronix.de>
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Link: https://lore.kernel.org/r/62854e2fcb8f8dd2180a98a9700702dcf89a6980.1763370183.git.namcao@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-12-02 07:28:32 +01:00
Nam Cao
8db3790c4d rv: Convert to use lock guard
Convert to use lock guard to tidy up the code.
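
For illustration, the lock-guard pattern looks like this (a generic example,
not the rv code itself):

  #include <linux/mutex.h>

  static DEFINE_MUTEX(example_lock);

  static void example(void)
  {
          /* the mutex is released automatically at the end of the scope */
          guard(mutex)(&example_lock);

          /* critical section; no explicit mutex_unlock() needed */
  }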

Signed-off-by: Nam Cao <namcao@linutronix.de>
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Link: https://lore.kernel.org/r/dbefeb868093c40d4b29fd6b57294a6aa011b719.1763370183.git.namcao@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-12-02 07:28:20 +01:00
Matthew Wilcox (Oracle)
faf3c92352 mm: fix vma_start_write_killable() signal handling
If we get a signal, we need to restore the vm_refcnt.  We don't think that
the refcount can actually be decremented to zero here as it requires the
VMA to be detached, and vma_mark_detached() uses TASK_UNINTERRUPTIBLE.
However, that's a bit subtle, so handle it as if the refcount was zero at
the start of this function.

Link: https://lkml.kernel.org/r/20251128040100.3022561-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Reported-by: syzbot+5b19bad23ac7f44bf8b8@syzkaller.appspotmail.com
Fixes: 2197bb60f890 ("mm: add vma_start_write_killable()")
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:11 -08:00
Youngjun Park
b60a3ef784 mm/swapfile: use plist_for_each_entry in __folio_throttle_swaprate
The loop breaks immediately after finding the first swap device and
never modifies the list. Replace plist_for_each_entry_safe() with
plist_for_each_entry() and remove the unused next variable.

Link: https://lkml.kernel.org/r/20251127100303.783198-3-youngjun.park@lge.com
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Acked-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:11 -08:00
Youngjun Park
f9e82f99b3 mm/swapfile: fix list iteration when next node is removed during discard
Patch series "mm/swapfile: fix and cleanup swap list iterations", v2.

This series fixes a potential list iteration issue in swap_sync_discard()
when devices are removed, and includes a cleanup for
__folio_throttle_swaprate().


This patch (of 2):

When the next node is removed from the plist (e.g.  by swapoff),
plist_del() makes the node point to itself, causing the iteration to loop
on the same entry indefinitely.

Add a plist_node_empty() check to detect this case and restart iteration,
allowing swap_sync_discard() to continue processing remaining swap devices
that still have pending discard entries.

Additionally, switch from swap_avail_lock/swap_avail_head to
swap_lock/swap_active_head so that iteration is only affected by swapoff
operations rather than frequent availability changes, reducing exceptional
condition checks and lock contention.
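
A hedged sketch of the restart pattern described above (simplified and with
details omitted; the in-tree code differs):

  struct swap_info_struct *si;

  spin_lock(&swap_lock);
restart:
  plist_for_each_entry(si, &swap_active_head, list) {
          /* a node removed by swapoff points to itself after plist_del();
           * detect that and start over instead of spinning on it forever */
          if (plist_node_empty(&si->list))
                  goto restart;

          /* issue any pending discard entries for si here */
  }
  spin_unlock(&swap_lock);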

Link: https://lkml.kernel.org/r/20251127100303.783198-1-youngjun.park@lge.com
Link: https://lkml.kernel.org/r/20251127100303.783198-2-youngjun.park@lge.com
Fixes: 686ea517f471 ("mm, swap: do not perform synchronous discard during allocation")
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Suggested-by: Kairui Song <kasong@tencent.com>
Acked-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:10 -08:00
Lorenzo Stoakes
12f0cd3933 fs/proc/task_mmu.c: fix make_uffd_wp_huge_pte() huge pte handling
make_uffd_wp_huge_pte() should return after handling a huge_pte_none()
pte.

Link: https://lkml.kernel.org/r/66178124-ebdf-4e23-b8ca-ed3eb8030c81@lucifer.local
Fixes: 03bfbc3ad6e4 ("mm: remove is_hugetlb_entry_[migration, hwpoisoned]()")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Closes: https://lkml.kernel.org/r/dc483db3-be4d-45f7-8b40-a28f5d8f5738@suse.cz
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:10 -08:00
Breno Leitao
ce2bba8956 mm/kfence: add reboot notifier to disable KFENCE on shutdown
During system shutdown, KFENCE can cause IPI synchronization issues if it
remains active through the reboot process.  To prevent this, register a
reboot notifier that disables KFENCE and cancels any pending timer work
early in the shutdown sequence.

This is only necessary when CONFIG_KFENCE_STATIC_KEYS is enabled, as this
configuration sends IPIs that can interfere with shutdown.  Without static
keys, no IPIs are generated and KFENCE can safely remain active.

The notifier uses maximum priority (INT_MAX) to ensure KFENCE shuts down
before other subsystems that might still depend on stable memory
allocation behavior.

This fixes a late kexec CSD lockup[1] when kfence is trying to IPI a CPU
that is busy in an IRQ-disabled context printing characters to the console.
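
A hedged sketch of the approach (illustrative only; kfence_disable() stands
in for whatever helpers the patch uses to turn the static key off and cancel
the timer work):

  #include <linux/notifier.h>
  #include <linux/reboot.h>

  static int kfence_reboot_notify(struct notifier_block *nb,
                                  unsigned long action, void *data)
  {
          kfence_disable();       /* assumed helper: stop sending IPIs */
          return NOTIFY_DONE;
  }

  static struct notifier_block kfence_reboot_nb = {
          .notifier_call = kfence_reboot_notify,
          .priority      = INT_MAX,       /* run before other shutdown notifiers */
  };

  static int __init kfence_reboot_setup(void)
  {
          return register_reboot_notifier(&kfence_reboot_nb);
  }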

Link: https://lkml.kernel.org/r/20251127-kfence-v2-1-daeccb5ef9aa@debian.org
Link: https://lkml.kernel.org/r/20251126-kfence-v1-1-5a6e1d7c681c@debian.org
Link: https://lore.kernel.org/all/sqwajvt7utnt463tzxgwu2yctyn5m6bjwrslsnupfexeml6hkd@v6sqmpbu3vvu/ [1]
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:10 -08:00
Chen Ridong
f3b566d726 memcg: remove inc/dec_lruvec_kmem_state helpers
The dec_lruvec_kmem_state helper is unused by any caller and can be safely
removed.  Meanwhile, the inc_lruvec_kmem_state helper is only referenced
by shadow_lru_isolate, so retaining these two helpers is unnecessary.  This
patch removes both helper functions to eliminate redundant code.

Link: https://lkml.kernel.org/r/20251126020435.1511637-1-chenridong@huaweicloud.com
Signed-off-by: Chen Ridong <chenridong@huawei.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lu Jialin <lujialin4@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:10 -08:00
Ankit Khushwaha
0384c8ea96 selftests/mm/uffd: initialize char variable to Null
In "uffd-stress.c" & "uffd-unit-tests.c". address of char variable having
garbage value (uninitialized) is passed to 'write' syscall triggers
warning.

	uffd-stress.c:246:39: warning: variable 'c' is uninitialized when
	passed  as a const pointer argument here
	[-Wuninitialized-const-pointer]

	uffd-unit-tests.c:581:31: warning: variable 'c' is uninitialized
	when passed as a const pointer argument here
	[-Wuninitialized-const-pointer]

The fix is to initialize the char variable to '\0' to prevent writing a
garbage value.

Link: https://lkml.kernel.org/r/20251126160830.52124-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:09 -08:00
Geert Uytterhoeven
f65372cd7a mm: fix DEBUG_RODATA_TEST indentation in Kconfig
Most of the DEBUG_RODATA_TEST section is indented by four spaces instead
of the customary single TAB.

Link: https://lkml.kernel.org/r/74f39b1bffc6ed802088cb3e7d17b4c82330e8b3.1764058676.git.geert@linux-m68k.org
Fixes: 2959a5f726f6 ("mm: add arch-independent testcases for RODATA")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Jinbum Park <jinb.park7@gmail.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:09 -08:00
Lorenzo Stoakes
9ea35a25d5 mm: introduce VMA flags bitmap type
It is useful to transition to using a bitmap for VMA flags so we can avoid
running out of flags, especially for 32-bit kernels which are constrained
to 32 flags, necessitating some features to be limited to 64-bit kernels
only.

By doing so, we remove any constraint on the number of VMA flags moving
forwards no matter the platform and can decide in future to extend beyond
64 if required.

We start by declaring an opaque type, vma_flags_t (which resembles the
mm_struct flags of type mm_flags_t), setting it to precisely the same size
as vm_flags_t, and placing it in union with vm_flags in the VMA declaration.

We additionally update struct vm_area_desc equivalently placing the new
opaque type in union with vm_flags.

This change therefore does not impact the size of struct vm_area_struct or
struct vm_area_desc.

In order for the change to be iterative and to avoid impacting
performance, we designate VM_xxx declared bitmap flag values as those
which must exist in the first system word of the VMA flags bitmap.

We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(),
vma_flags_overwrite_word_once(),
vma_flags_set_word() and vma_flags_clear_word() in order to allow us to
update the existing vm_flags_*() functions to utilise these helpers.

This is a stepping stone towards converting users to the VMA flags bitmap
and behaves precisely as before.

By doing this, we can eliminate the existing private vma->__vm_flags field
in the vma->vm_flags union and replace it with the newly introduced opaque
type vma_flags_t, which we name 'flags', so we refer to the new bitmap field
as vma->flags.

We update vma_flag_[test, set]_atomic() to account for the change also.

We adapt vm_flags_reset_once() to only clear those bits above the first
system word providing write-once semantics to the first system word (which
it is presumed the caller requires - and in all current use cases this is
so).

As we currently only specify that the VMA flags bitmap size is equal to
BITS_PER_LONG number of bits, this is a noop, but is defensive in
preparation for a future change that increases this.

We additionally update the VMA userland test declarations to implement the
same changes there.

Finally, we update the rust code to reference vma->vm_flags on update
rather than vma->__vm_flags which has been removed.  This is safe for now,
albeit it is implicitly performing a const cast.

Once we introduce flag helpers we can improve this more.

No functional change intended.
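
A rough sketch of the resulting shape (field and type names follow the
description above; this is illustrative, not the exact patch):

  typedef struct {
          /* sized to BITS_PER_LONG for now, so struct sizes are unchanged */
          DECLARE_BITMAP(__vma_flags, BITS_PER_LONG);
  } vma_flags_t;

  struct vm_area_struct {
          /* ... */
          union {
                  const vm_flags_t vm_flags;      /* legacy word-sized view */
                  vma_flags_t flags;              /* new bitmap view */
          };
          /* ... */
  };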

Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:09 -08:00
Lorenzo Stoakes
4c613f518f tools/testing/vma: eliminate dependency on vma->__vm_flags
The userland VMA test code relied on an internal implementation detail -
the existence of vma->__vm_flags to directly access VMA flags.  There is
no need to do so when we have the vm_flags_*() helper functions available.

This is ugly, and a subsequent commit will eliminate this field altogether,
so this usage would shortly become broken anyway.

This patch has us utilise the helper functions instead.

Link: https://lkml.kernel.org/r/6275c53a6bb20743edcbe92d3e130183b47d18d0.1764064557.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:09 -08:00
Lorenzo Stoakes
58eac97a8b mm: simplify and rename mm flags function for clarity
The __mm_flags_set_word() function is slightly ambiguous - we use 'set' to
refer to setting individual bits (such as in mm_flags_set()) but here we
use it to refer to overwriting the value altogether.

Rename it to __mm_flags_overwrite_word() to eliminate this ambiguity.

We additionally simplify the functions, eliminating unnecessary
bitmap_xxx() operations (the compiler would have optimised these out but
it's worth being as clear as we can be here).

Link: https://lkml.kernel.org/r/8f0bc556e1b90eca8ea5eba41f8d5d3f9cd7c98a.1764064557.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:08 -08:00
Lorenzo Stoakes
2b6a3f061f mm: declare VMA flags by bit
Patch series "initial work on making VMA flags a bitmap", v3.

We are in the rather silly situation that we are running out of VMA flags
as they are currently limited to a system word in size.

This leads to absurd situations where we limit features to 64-bit
architectures only because we simply do not have the ability to add a flag
for 32-bit ones.

This is very constraining and leads to hacks or, in the worst case, simply
an inability to implement features we want for entirely arbitrary reasons.

This also of course gives us something of a Y2K type situation in mm where
we might eventually exhaust all of the VMA flags even on 64-bit systems.

This series lays the groundwork for getting away from this limitation by
establishing VMA flags as a bitmap whose size we can increase in future
beyond 64 bits if required.

This is necessarily a highly iterative process given the extensive use of
VMA flags throughout the kernel, so we start by performing basic steps.

Firstly, we declare VMA flags by bit number rather than by value,
retaining the VM_xxx fields but in terms of these newly introduced
VMA_xxx_BIT fields.

While we are here, we use sparse annotations to ensure that, when dealing
with VMA bit number parameters, we cannot be passed values which are not
declared as such - providing some useful type safety.

We then introduce an opaque VMA flag type, much like the opaque mm_struct
flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags
field"), which we establish in union with vma->vm_flags (but still set at
system word size meaning there is no functional or data type size change).

We update the vm_flags_xxx() helpers to use this new bitmap, introducing
sensible helpers to do so.

This series lays the foundation for further work to expand the use of
bitmap VMA flags and eventually eliminate these arbitrary restrictions.


This patch (of 4):

In order to lay the groundwork for VMA flags being a bitmap rather than a
system word in size, we need to be able to consistently refer to VMA flags
by bit number rather than value.

Take this opportunity to do so in an enum, which is additionally useful for
tooling to extract metadata from.

This additionally makes it very clear which bits are being used for what
at a glance.

We use the VMA_ prefix for the bit values as it is logical to do so since
these reference VMAs.  We consistently suffix with _BIT to make it clear
what the values refer to.

We declare bit values even when the flags that use them would not be
enabled by config options as this is simply clearer and clearly defines
what bit numbers are used for what, at no additional cost.

We declare a sparse-bitwise type vma_flag_t which ensures that users can't
pass around invalid VMA flags by accident and prepares for future work
towards VMA flags being a bitmap where we want to ensure bit values are
type safe.

To make life easier, we declare some macro helpers - DECLARE_VMA_BIT()
allows us to avoid duplication in the enum bit number declarations (and
maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to
assist with declaration of flags.

Unfortunately we can't declare both in the enum, as we run into issues with
logic in the kernel requiring that flags are preprocessor definitions, and
additionally we cannot have a macro which declares another macro so we
must define each flag macro directly.

Additionally, update the VMA userland testing vma_internal.h header to
include these changes.

We also have to fix the parameters to the vma_flag_*_atomic() functions
since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will
complain otherwise.

We have to update some rather silly if-deffery found in mm/task_mmu.c
which would otherwise break.

Finally, we update the rust binding helper as now it cannot auto-detect
the flags at all.
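
A hedged sketch of the kind of declarations described (the macro bodies here
are guesses for illustration, not the in-tree definitions):

  typedef unsigned int __bitwise vma_flag_t;

  #define DECLARE_VMA_BIT(name, nr) VMA_##name##_BIT = ((__force vma_flag_t)(nr))

  enum {
          DECLARE_VMA_BIT(READ, 0),
          DECLARE_VMA_BIT(WRITE, 1),
          DECLARE_VMA_BIT(EXEC, 2),
  };

  /* each flag remains a preprocessor definition, expressed via its bit */
  #define INIT_VM_FLAG(name)      BIT((__force int)VMA_##name##_BIT)
  #define VM_READ                 INIT_VM_FLAG(READ)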

Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Alice Ryhl <aliceryhl@google.com>	[rust]
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Gary Guo <gary@garyguo.net>
Cc: Gregory Price <gourry@gourry.net>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:08 -08:00
Chu Guangqing
8f4338b114 zram: fix a spelling mistake
The spelling of the word "relases" is incorrect; it should be "releases".

Link: https://lkml.kernel.org/r/20251125020522.1913-1-chuguangqing@inspur.com
Signed-off-by: Chu Guangqing <chuguangqing@inspur.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:08 -08:00
fujunjie
a493c7a650 mm/page_alloc: optimize lowmem_reserve max lookup using its semantic monotonicity
calculate_totalreserve_pages() currently finds the maximum
lowmem_reserve[j] for a zone by scanning the full forward range [j =
zone_idx ..  MAX_NR_ZONES).  However, for a given zone i, the
lowmem_reserve[j] array (for j > i) is naturally expected to form a
monotonically non-decreasing sequence in j, not as an implementation
detail, but as a consequence that naturally arises from the semantics of
lowmem_reserve[].

For zone "i", lowmem_reserve[j] expresses how many pages in zone i must
effectively be kept in reserve when deciding whether an allocation class
that may allocate from zones up to j is allowed to fall back into i.  It
protects less flexible allocation classes (which cannot use higher zones)
from being starved by more flexible ones.

Viewed from this semantics, it is natural to expect a partial ordering in
j: as j increases, the allocation class gains access to a strictly larger
set of fallback zones.  Therefore lowmem_reserve[j] is expected to be
monotonically non-decreasing in j: more flexible allocation classes must
not be allowed to deplete low zones more aggressively than less flexible
ones.

In other words, if lowmem_reserve[j] were ever observed to *decrease* as j
grows, that would be unexpected from the reserve semantics' point of view
and would likely indicate a semantic change or a misconfiguration.

The current implementation in setup_per_zone_lowmem_reserve() reflects
this policy by accumulating managed pages from higher zones and applying
the configured ratio, which results in a non-decreasing sequence.  This
patch makes calculate_totalreserve_pages() rely on that monotonicity
explicitly and finds the maximum reserve value by scanning backward and
stopping at the first non-zero entry.  This avoids unnecessary iteration
and reflects the conceptual model more directly.  No functional behavior
changes.

To maintain this assumption explicitly, a comment is added next to
setup_per_zone_lowmem_reserve() documenting the monotonicity expectation
and noting that calculate_totalreserve_pages() relies on it.
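
A hedged sketch of the backward lookup described above (simplified; names
follow the text):

  long max_reserve = 0;
  int j;

  /* lowmem_reserve[] is non-decreasing in j, so when scanning backwards
   * the first non-zero entry is already the maximum */
  for (j = MAX_NR_ZONES - 1; j >= zone_idx; j--) {
          if (zone->lowmem_reserve[j]) {
                  max_reserve = zone->lowmem_reserve[j];
                  break;
          }
  }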

Link: https://lkml.kernel.org/r/tencent_EB0FED91B01B1F8B6DAEE96719C5F5797F07@qq.com
Signed-off-by: fujunjie <fujunjie1@qq.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:08 -08:00
Jiayuan Chen
3cf41edc20 mm/vmscan: skip increasing kswapd_failures when reclaim was boosted
We have a colocation cluster used for deploying both offline and online
services simultaneously.  In this environment, we encountered a
scenario where direct memory reclamation was triggered due to kswapd
not running.

1. When applications start up, rapidly consume memory, or experience
   network traffic bursts, the kernel reaches steal_suitable_fallback(),
   which sets watermark_boost and subsequently wakes kswapd.

2. In the core logic of kswapd thread (balance_pgdat()), when reclaim is
   triggered by watermark_boost, the maximum priority is 10. Higher
   priority values mean less aggressive LRU scanning, which can result in
   no pages being reclaimed during a single scan cycle:

   if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
       raise_priority = false;

3. Additionally, many of our pods are configured with memory.low, which
   prevents memory reclamation in certain cgroups, further increasing the
   chance of failing to reclaim memory.

4. This eventually causes pgdat->kswapd_failures to continuously
   accumulate, exceeding MAX_RECLAIM_RETRIES, and consequently kswapd
   stops working.  At this point, the system's available memory is still
   significantly above the high watermark -- it's inappropriate for kswapd
   to stop under these conditions.

The final observable issue is that a brief period of rapid memory
allocation causes kswapd to stop running, ultimately triggering direct
reclaim and making the applications unresponsive.

This problem leading to direct memory reclamation has been a
long-standing issue in our production environment.  We initially held
the simple assumption that it was caused by applications allocating
memory too rapidly for kswapd to keep up with reclamation.  However,
after we began monitoring kswapd's runtime behavior, we discovered a
different pattern:

kswapd initially exhibits very aggressive activity even when there is
still considerable free memory, but it subsequently stops running
entirely, even as memory levels approach the low watermark.

In summary, both boosted watermarks and memory.low increase the
probability of kswapd operation failures.

This patch specifically addresses the scenario involving boosted
watermarks by not incrementing kswapd_failures when reclamation fails. 
A more general solution, potentially addressing memory.low or other
cases, requires further discussion.
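
In code, the idea is roughly the following (a sketch of the described change,
not the exact hunk):

  /* in balance_pgdat(), where a fruitless run is normally recorded */
  if (!sc.nr_reclaimed && !nr_boost_reclaim)
          pgdat->kswapd_failures++;       /* boosted runs no longer count */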

Link: https://lkml.kernel.org/r/53de0b3ee0b822418e909db29bfa6513faff9d36@linux.dev
Link: https://lkml.kernel.org/r/20251024022711.382238-1-jiayuan.chen@linux.dev
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-29 10:41:07 -08:00
Steven Rostedt
f6ed9c5d31 overflow: Introduce struct_offset() to get offset of member
The trace_marker_raw file in tracefs takes a buffer from user space that
contains an id as well as a raw data string which is usually a binary
structure. The structure used has the following:

	struct raw_data_entry {
		struct trace_entry	ent;
		unsigned int		id;
		char			buf[];
	};

Since the passed in "cnt" variable covers both the size of buf and the
size of id, the code to allocate the location on the ring buffer had:

   size = struct_size(entry, buf, cnt - sizeof(entry->id));

Which is quite ugly and hard to understand. Instead, add a helper macro
called struct_offset() which then changes the above to a simple and easy
to understand:

   size = struct_offset(entry, id) + cnt;

This will likely come in handy for other use cases too.
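
A hedged sketch of what such a macro can boil down to (the in-tree version
may add type checking):

   /* offsetof() keyed off a pointer expression instead of a type name */
   #define struct_offset(ptr, member)   offsetof(typeof(*(ptr)), member)

Since buf immediately follows id in raw_data_entry, struct_offset(entry, id)
plus cnt covers the header up to id, the id itself, and cnt - sizeof(id)
bytes of buf, which matches what struct_size() computed before.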

Link: https://lore.kernel.org/all/CAHk-=whYZVoEdfO1PmtbirPdBMTV9Nxt9f09CK0k6S+HJD3Zmg@mail.gmail.com/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Link: https://patch.msgid.link/20251126145249.05b1770a@gandalf.local.home
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Kees Cook <kees@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-27 20:18:05 -05:00
Joel Granados
564195c1a3 sysctl: Wrap do_proc_douintvec with the public function proc_douintvec_conv
Make do_proc_douintvec static and export proc_douintvec_conv wrapper
function for external use. This is in keeping with the design in sysctl.c.
Update fs/pipe.c to use the new public API.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:38 +01:00
Joel Granados
30baaeb685 sysctl: Create pipe-max-size converter using sysctl UINT macros
Create a converter for the pipe-max-size proc_handler using the
SYSCTL_UINT_CONV_CUSTOM. Move SYSCTL_CONV_IDENTITY macro to the sysctl
header to make it available for pipe size validation. Keep returning
-EINVAL when (val == 0) by using a range checking converter and setting
the minimal valid value (extern1) to SYSCTL_ONE. Keep round_pipe_size by
passing it as the operation for SYSCTL_USER_TO_KERN_INT_CONV.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
4639faaa60 sysctl: Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c
Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c. Create
a non static wrapper function proc_doulongvec_minmax_conv that
forwards the custom convmul and convdiv argument values to the internal
do_proc_doulongvec_minmax. Remove unused linux/times.h include from
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
54932988c4 sysctl: Move jiffies converters to kernel/time/jiffies.c
Move integer jiffies converters (proc_dointvec{_,_ms_,_userhz_}jiffies
and proc_dointvec_ms_jiffies_minmax) to kernel/time/jiffies.c. Error
stubs for when CONFIG_PROC_SYSCTL is not defined are not reproduced
because all the jiffies converters go through proc_dointvec_conv which
is already stubbed. This is part of the greater effort to move sysctl
logic out of kernel/sysctl.c thereby reducing merge conflicts in
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
24a08eefdd sysctl: Move UINT converter macros to sysctl header
Move SYSCTL_USER_TO_KERN_UINT_CONV and SYSCTL_UINT_CONV_CUSTOM macros to
include/linux/sysctl.h. No need to embed sysctl_kern_to_user_uint_conv
in a macro as it will not need a custom kernel pointer operation. This
is a preparation commit to enable jiffies converter creation outside
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
e2e5dac304 sysctl: Move INT converter macros to sysctl header
Move direction macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}) and the
integer converter macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}_INT_CONV,
SYSCTL_INT_CONV_CUSTOM) into include/linux/sysctl.h. This is a
preparation commit to enable jiffies converter creation outside
kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
c5b4c183f7 sysctl: Allow custom converters from outside sysctl
The new non-static proc_dointvec_conv forwards a custom converter
function to do_proc_dointvec from outside the sysctl scope. Rename the
do_proc_dointvec call points so any future changes to proc_dointvec_conv
are propagated in sysctl.c. This is a preparation commit that allows the
integer jiffie converter functions to move out of kernel/sysctl.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:45:37 +01:00
Joel Granados
1aa53326e1 sysctl: remove __user qualifier from stack_erasing_sysctl buffer argument
The buffer arg in proc handler functions has been void* (no __user
qualifier) since commit 32927393dc1c ("sysctl: pass kernel pointers to
->proc_handler"). The __user qualifier was erroneously brought back in
commit 0df8bdd5e3b3 ("stackleak: move stack_erasing sysctl to
stackleak.c"). This fixes the error by removing the __user qualifier.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202510221719.3ggn070M-lkp@intel.com/
Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:44:53 +01:00
Joel Granados
c3102febf4 sysctl: Create macro for user-to-kernel uint converter
Replace sysctl_user_to_kern_uint_conv function with
SYSCTL_USER_TO_KERN_UINT_CONV macro that accepts u_ptr_op parameter for
value transformation. Replacing sysctl_kern_to_user_uint_conv is not
needed as it will only be used from within sysctl.c. This is a
preparation commit for creating a custom converter in fs/pipe.c. No
Functional changes are intended.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
0c1d2dc7cc sysctl: Add optional range checking to SYSCTL_UINT_CONV_CUSTOM
Add k_ptr_range_check parameter to SYSCTL_UINT_CONV_CUSTOM macro to
enable range validation using table->extra1/extra2. Replace
do_proc_douintvec_minmax_conv with do_proc_uint_conv_minmax generated
by the updated macro.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
49d3288c1d sysctl: Create unsigned int converter using new macro
Pass sysctl_{user_to_kern,kern_to_user}_uint_conv (unsigned integer
uni-directional converters) to the new SYSCTL_UINT_CONV_CUSTOM macro
to create do_proc_douintvec_conv's replacement (do_proc_uint_conv).

This is a preparation commit to use the unsigned integer converter from
outside sysctl. No functional change is intended.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
54e77495a7 sysctl: Add optional range checking to SYSCTL_INT_CONV_CUSTOM
Extend the SYSCTL_INT_CONV_CUSTOM macro with a k_ptr_range_check
parameter to conditionally generate range validation code. When enabled,
validation is done against table->extra1 (min) and table->extra2 (max)
bounds before assignment. Add base minmax and ms_jiffies_minmax
converter instances that utilize the range checking functionality.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
796c481a4b sysctl: Create integer converters with one macro
New SYSCTL_INT_CONV_CUSTOM macro creates "bi-directional" converters
from a user-to-kernel and a kernel-to-user functions. Replace integer
versions of do_proc_*_conv functions with the ones from the new macro.
Rename "_dointvec_" to just "_int_" as these converters are not applied
to vectors and the "do" is already in the name.

Move the USER_HZ validation directly into proc_dointvec_userhz_jiffies().

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
2dc164a48e sysctl: Create converter functions with two new macros
Eight converter functions are created using two new macros
(SYSCTL_USER_TO_KERN_INT_CONV & SYSCTL_KERN_TO_USER_INT_CONV); they are
called from four pre-existing converter functions: do_proc_dointvec_conv
and do_proc_dointvec{,_userhz,_ms}_jiffies_conv. The function names
generated by the macros are differentiated by a string suffix passed as
the first macro argument.

The SYSCTL_USER_TO_KERN_INT_CONV macro first executes the u_ptr_op
operation, then checks for overflow, assigns sign (-, +) and finally
writes to the kernel var with WRITE_ONCE; it always returns an -EINVAL
when an overflow is detected. The SYSCTL_KERN_TO_USER_INT_CONV uses
READ_ONCE, casts to unsigned long, then executes the k_ptr_op before
assigning the value to the user space buffer.

The overflow check is always done against MAX_INT after applying
{k,u}_ptr_op. This approach avoids rounding or precision errors that
might occur when using the inverse operations.
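
A hedged sketch of the shape such a macro can take (illustrative only; the
real macro differs in detail):

  #define SYSCTL_USER_TO_KERN_INT_CONV(suffix, u_ptr_op)                     \
  static int do_user_to_kern_int##suffix(bool neg, unsigned long *u_ptr,     \
                                         int *k_ptr)                         \
  {                                                                          \
          unsigned long val = *u_ptr;                                        \
                                                                             \
          u_ptr_op(&val);                 /* e.g. msecs -> jiffies */        \
          if (val > INT_MAX)                                                 \
                  return -EINVAL;         /* overflow after conversion */    \
          WRITE_ONCE(*k_ptr, neg ? -(int)val : (int)val);                    \
          return 0;                                                          \
  }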

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
551bf18450 sysctl: Discriminate between kernel and user converter params
Rename converter parameter to indicate data flow direction: "lvalp" to
"u_ptr" indicating a user space parsed value pointer. "valp" to "k_ptr"
indicating a kernel storage value pointer. This facilitates the
identification of discrepancies between direction (copy to kernel or
copy to user space) and the modified variable. This is a preparation
commit for when the converter functions are exposed to the rest of the
kernel.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
5412f5b13d sysctl: Indicate the direction of operation with macro names
Replace the "write" integer parameter with SYSCTL_USER_TO_KERN() and
SYSCTL_KERN_TO_USER() that clearly indicate data flow direction in
sysctl operations.

"write" originates in proc_sysctl.c (proc_sys_{read,write}) and can take
one of two values: "0" or "1" when called from proc_sys_read and
proc_sys_write respectively. When write has a value of zero, data is
"written" to a user space buffer from a kernel variable (usually
ctl_table->data). Whereas when write has a value greater than zero, data
is "written" to an internal kernel variable from a user space buffer.
Remove this ambiguity by introducing macros that clearly indicate the
direction of the "write".

The write mode names in sysctl_writes_mode are left unchanged as these
directly relate to the sysctl_write_strict file in /proc/sys where the
word "write" unambiguously refers to writing to a file.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
610c9b6efb sysctl: Remove superfluous __do_proc_* indirection
Remove "__" from __do_proc_do{intvec,uintvec,ulongvec_minmax} internal
functions and delete their corresponding do_proc_do* wrappers. These
indirections are unnecessary as they do not add extra logic nor do they
indicate a layer separation.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
ee581c0e3a sysctl: Remove superfluous tbl_data param from "dovec" functions
Remove superfluous tbl_data param from do_proc_douintvec{,_r,_w}
and __do_proc_do{intvec,uintvec,ulongvec_minmax}. There is no need to
pass it as it is always contained within the ctl_table struct.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Joel Granados
6ca07a9b63 sysctl: Replace void pointer with const pointer to ctl_table
* Replace the void* data in the converter functions with a const struct
  ctl_table* table, as it was only used to forward values from
  ctl_table->extra{1,2}.
* Remove the void* data in the do_proc_* functions as they already had a
  pointer to the ctl_table.
* Remove min/max structures do_proc_do{uint,int}vec_minmax_conv_param;
  the min/max values get passed directly in ctl_table.
* Keep min/max initialization in extra{1,2} in proc_dou8vec_minmax.
* The do_proc_douintvec was adjusted outside sysctl.c as it is exported
  to fs/pipe.c.

Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-11-27 15:43:20 +01:00
Christophe JAILLET
f7f7809869 configfs: Constify ct_item_ops in struct config_item_type
Make 'ct_item_ops' const in struct config_item_type.
This allows constification of many structures which hold some function
pointers.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/f43cb57418a7f59e883be8eedc7d6abe802a2094.1761390472.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
2025-11-27 12:03:27 +01:00
Christophe JAILLET
f2f36500a6 configfs: Constify ct_group_ops in struct config_item_type
Make 'ct_group_ops' const in struct config_item_type.
This allows constification of many structures which hold some function
pointers.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/6b720cf407e8a6d30f35beb72e031b2553d1ab7e.1761390472.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
2025-11-27 12:03:27 +01:00
Shengming Hu
c264534c39 fgraph: Remove coarse PID filtering from graph_entry()
With PID filtering working via ftrace_pids_enabled() and fgraph_pid_func,
the coarse-grained ftrace_trace_task() check in graph_entry() is obsolete.

It was only a fallback for uninitialized op->private (now fixed), and its
removal ensures consistent PID filtering with standard function tracing.

Also remove unused ftrace_trace_task() definition from trace.h.

Cc: <wang.yaxin@zte.com.cn>
Cc: <mhiramat@kernel.org>
Cc: <mark.rutland@arm.com>
Cc: <mathieu.desnoyers@efficios.com>
Cc: <zhang.run@zte.com.cn>
Cc: <yang.yang29@zte.com.cn>
Link: https://patch.msgid.link/20251126173552333XoJZN20143fWbsdTEtWoU@zte.com.cn
Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:41:35 -05:00
Shengming Hu
1650a1b6cb fgraph: Check ftrace_pids_enabled on registration for early filtering
When registering ftrace_graph, check if ftrace_pids_enabled is active.
If enabled, assign entryfunc to fgraph_pid_func to ensure filtering
is performed before executing the saved original entry function.

Cc: stable@vger.kernel.org
Cc: <wang.yaxin@zte.com.cn>
Cc: <mhiramat@kernel.org>
Cc: <mark.rutland@arm.com>
Cc: <mathieu.desnoyers@efficios.com>
Cc: <zhang.run@zte.com.cn>
Cc: <yang.yang29@zte.com.cn>
Link: https://patch.msgid.link/20251126173331679XGVF98NLhyLJRdtNkVZ6w@zte.com.cn
Fixes: df3ec5da6a1e7 ("function_graph: Add pid tracing back to function graph tracer")
Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:41:16 -05:00
Shengming Hu
b5d6d3f73d fgraph: Initialize ftrace_ops->private for function graph ops
The ftrace_pids_enabled(op) check relies on op->private being properly
initialized, but fgraph_ops's underlying ftrace_ops->private was left
uninitialized. This caused ftrace_pids_enabled() to always return false,
effectively disabling PID filtering for function graph tracing.

Fix this by copying src_ops->private to dst_ops->private in
fgraph_init_ops(), ensuring PID filter state is correctly propagated.
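
A simplified sketch of the fix described above (surrounding code
omitted):

  /* in fgraph_init_ops(): propagate the PID filter state */
  dst_ops->private = src_ops->private;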

Cc: stable@vger.kernel.org
Cc: <wang.yaxin@zte.com.cn>
Cc: <mhiramat@kernel.org>
Cc: <mark.rutland@arm.com>
Cc: <mathieu.desnoyers@efficios.com>
Cc: <zhang.run@zte.com.cn>
Cc: <yang.yang29@zte.com.cn>
Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering")
Link: https://patch.msgid.link/20251126172926004y3hC8QyU4WFOjBkU_UxLC@zte.com.cn
Signed-off-by: Shengming Hu <hu.shengming@zte.com.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:38:21 -05:00
pengdonglin
f83ac7544f function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously
Currently, the funcgraph-args and funcgraph-retaddr features are
mutually exclusive. This patch resolves this limitation by allowing
funcgraph-retaddr to have an args array.

To verify the change, use perf to trace vfs_write with both options
enabled:

Before:
 # perf ftrace -G vfs_write --graph-opts args,retaddr
   ......
   down_read() { /* <-n_tty_write+0xa3/0x540 */
     __cond_resched(); /* <-down_read+0x12/0x160 */
     preempt_count_add(); /* <-down_read+0x3b/0x160 */
     preempt_count_sub(); /* <-down_read+0x8b/0x160 */
   }

After:
 # perf ftrace -G vfs_write --graph-opts args,retaddr
   ......
   down_read(sem=0xffff8880100bea78) { /* <-n_tty_write+0xa3/0x540 */
     __cond_resched(); /* <-down_read+0x12/0x160 */
     preempt_count_add(val=1); /* <-down_read+0x3b/0x160 */
     preempt_count_sub(val=1); /* <-down_read+0x8b/0x160 */
   }

Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Xiaoqin Zhang <zhangxiaoqin@xiaomi.com>
Link: https://patch.msgid.link/20251125093425.2563849-1-dolinux.peng@gmail.com
Signed-off-by: pengdonglin <pengdonglin@xiaomi.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:30 -05:00
Masami Hiramatsu (Google)
20e7168326 tracing: Add boot-time backup of persistent ring buffer
Currently, the persistent ring buffer instance needs to be read before
it can be used. This means we have to wait for user space to boot up and
dump the persistent ring buffer. However, in that case we can not start
tracing on it from the kernel cmdline.

To solve this limitation, this adds an option which allows creating a
trace instance as a backup of the persistent ring buffer at boot.
If the user specifies trace_instance=<BACKUP>=<PERSIST_RB> then the
<BACKUP> instance is made as a copy of the <PERSIST_RB> instance.

For example, the below kernel cmdline records all syscalls, scheduler
and interrupt events on the persistent ring buffer `boot_map` but
before starting the tracing, it makes a `backup` instance from the
`boot_map`. Thus, the `backup` instance has the previous boot events.

'reserve_mem=12M:4M:trace trace_instance=boot_map@trace,syscalls:*,sched:*,irq:* trace_instance=backup=boot_map'

As you can see, this just makes a copy of the entire reserved area and
makes a backup instance on it. So you can release (or shrink) the
backup instance after using it, to save memory.

  /sys/kernel/tracing/instances # free
                total        used        free      shared  buff/cache   available
  Mem:        1999284       55704     1930520       10132       13060     1914628
  Swap:             0           0           0
  /sys/kernel/tracing/instances # rmdir backup/
  /sys/kernel/tracing/instances # free
                total        used        free      shared  buff/cache   available
  Mem:        1999284       40640     1945584       10132       13060     1929692
  Swap:             0           0           0

Note: since there is no reason to make a copy of an empty buffer, this
backup only accepts a persistent ring buffer as the original instance.
Also, since this backup is based on vmalloc(), it does not support
user-space mmap().

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/176377150002.219692.9425536150438129267.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:30 -05:00
Steven Rostedt
f93a7d0cac ftrace: Allow tracing of some of the tracing code
There are times when tracing the tracing infrastructure can be useful
for debugging the tracing code. Currently, all files in the tracing
directory have their functions set to "notrace".

Add a new config option FUNCTION_SELF_TRACING that will allow some of the
files in the tracing infrastructure to be traced. It requires a config
option to enable because it will add noise to the function tracer if
events and other tracing features are enabled. Tracing functions and
events together is quite common, so not tracing the event code should be
the default.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Link: https://patch.msgid.link/20251120181514.736f2d5f@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:30 -05:00
Steven Rostedt
400ddf1dbe tracing: Use strim() in trigger_process_regex() instead of skip_spaces()
The function trigger_process_regex() is called by a few functions, but
only one of them calls strim() on the buffer passed to it. That leaves
the other callers not trimming the end of the buffer passed in, which is
a little inconsistent.

Remove the strim() from event_trigger_regex_write() and have
trigger_process_regex() use strim() instead of skip_spaces(). The buff
variable is not passed in as const, so it can be modified.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Link: https://patch.msgid.link/20251125214032.323747707@kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:30 -05:00
Steven Rostedt
61d445af0a tracing: Add bulk garbage collection of freeing event_trigger_data
The event trigger data requires a full tracepoint_synchronize_unregister()
call before freeing. That call can take 100s of milliseconds to complete.
In order to allow for bulk freeing of the trigger data, it can not call
tracepoint_synchronize_unregister() for every individual trigger data
being freed.

Create a kthread the first time a trigger data is freed, and have it use
the lockless llist to get the list of data to free, run
tracepoint_synchronize_unregister() and then free everything in the list.

By freeing hundreds of event_trigger_data elements together, it only
requires two runs of the synchronization function, and not hundreds of
runs. This speeds up the operation by orders of magnitude (milliseconds
instead of several seconds).
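
A minimal sketch of the bulk-free pattern described above, using the
kernel's lockless llist; the names here are illustrative, not the ones
used in the patch:

  static LLIST_HEAD(trigger_free_list);

  static int trigger_free_thread(void *data)
  {
          while (!kthread_should_stop()) {
                  struct llist_node *batch;

                  /* (sleep here until entries have been queued) */
                  batch = llist_del_all(&trigger_free_list);
                  if (!batch)
                          continue;

                  /* one expensive synchronization covers the whole batch */
                  tracepoint_synchronize_unregister();

                  /* walk 'batch' and kfree() each event_trigger_data */
          }
          return 0;
  }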

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Link: https://patch.msgid.link/20251125214032.151674992@kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:30 -05:00
Steven Rostedt
78c7051394 tracing: Remove unneeded event_mutex lock in event_trigger_regex_release()
In event_trigger_regex_release(), the only code is:

	mutex_lock(&event_mutex);
	if (file->f_mode & FMODE_READ)
		seq_release(inode, file);
	mutex_unlock(&event_mutex);

	return 0;

There's nothing special about the file->f_mode or the seq_release() that
requires any locking. Remove the unnecessary locks.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Link: https://patch.msgid.link/20251125214031.975879283@kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:29 -05:00
Steven Rostedt
b052d70f7c tracing: Merge struct event_trigger_ops into struct event_command
Now that there's pretty much a one-to-one mapping between the struct
event_trigger_ops and struct event_command, there's no reason to have two
different structures. Merge the function pointers of event_trigger_ops
into event_command.

There's one exception in trace_events_hist.c for the
event_hist_trigger_named_ops. This has special logic for the init and free
function pointers for "named histograms". In this case, allocate the
cmd_ops of the event_trigger_data and set it to the proper init and free
functions, which are used to initialize and free the event_trigger_data
respectively. Have the free function and the init function (on failure)
free the cmd_ops of the data element.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251125200932.446322765@kernel.org
Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:29 -05:00
Steven Rostedt
bdafb4d4cb tracing: Remove get_trigger_ops() and add count_func() from trigger ops
The struct event_command has a callback function called get_trigger_ops().
This callback returns the "trigger_ops" to use for the trigger. These ops
define the trigger function, how to init the trigger, how to print the
trigger and how to free it.

The only reason there's a callback function to get these ops is because
some triggers have two types of operations. One is an "always on"
operation, and the other is a "count down" operation used when a user
passes in a parameter to say how many times the trigger should execute.
For example:

  echo stacktrace:5 > events/kmem/kmem_cache_alloc/trigger

It will trigger the stacktrace for the first 5 times the kmem_cache_alloc
event is hit.

Instead of having two different trigger_ops, since the only difference
between them is the trigger itself (the print, init and free functions are
all the same), just use a single ops that the event_command points to and
add a function field to the trigger_ops to have a count_func.

When a trigger is added to an event, if there's a count attached to it and
the trigger ops has the count_func field, the data allocated to represent
this trigger will have a new flag set called COUNT.

Then when the trigger executes, it will check if the COUNT data flag is
set, and if so, it will call the ops count_func(). If that returns false,
it returns without executing the trigger.

This removes the need for duplicate event_trigger_ops structures.
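
Roughly, the dispatch described above could look like this (the flag and
field names are illustrative):

  /* in the trigger invocation path */
  if ((data->flags & EVENT_TRIGGER_FL_COUNT) &&
      !data->cmd_ops->count_func(data))
          return;                  /* count exhausted, skip the trigger */

  data->cmd_ops->trigger(data);    /* run the actual trigger */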

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251125200932.274566147@kernel.org
Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:29 -05:00
Masami Hiramatsu (Google)
23c0e9cc76 tracing: Show the tracer options in boot-time created instance
Since tracer_init_tracefs_work_func() only updates the tracer options
for the global_trace, the instances created by the kernel cmdline
do not have those options.

Fix this by updating the tracer options for those boot-time created
instances so that the options are shown.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/176354112555.2356172.3989277078358802353.stgit@mhiramat.tok.corp.google.com
Fixes: 428add559b69 ("tracing: Have tracer option be instance specific")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:29 -05:00
Menglong Dong
7a6735cc9b ftrace: Avoid redundant initialization in register_ftrace_direct
The FTRACE_OPS_FL_INITIALIZED flag is cleared in register_ftrace_direct,
which can cause the ops to be initialized again by ftrace_ops_init() even
if it is already initialized. This is not a big deal, but let's still
fix it.

Link: https://patch.msgid.link/20251110121808.1559240-1-dongml2@chinatelecom.cn
Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface")
Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:28 -05:00
Steven Rostedt
49c1364c7c tracing: Remove unused variable in tracing_trace_options_show()
The flags and opts used in tracing_trace_options_show() now come directly
from the trace array "current_trace_flags" and not the current_trace. The
variable "trace" was still being assigned to tr->current_trace but never
used. This caused a warning in clang.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251117120637.43ef995d@gandalf.local.home
Reported-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Tested-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Closes: https://lore.kernel.org/all/aRtHWXzYa8ijUIDa@black.igk.intel.com/
Fixes: 428add559b692 ("tracing: Have tracer option be instance specific")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:28 -05:00
Steven Rostedt
ac87b220a6 fgraph: Make fgraph_no_sleep_time signed
The variable fgraph_no_sleep_time changed from being a boolean to being a
counter. A check is made to make sure that it never goes below zero, but
because the variable is unsigned, the check always fails even if it does
go below zero.

Make the variable a signed int so that the below-zero check works.
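
In general terms, the pitfall is simply that an unsigned variable can
never compare as negative:

  unsigned int counter = 0;

  counter--;           /* wraps around to UINT_MAX, never goes negative */
  if (counter < 0)     /* always false for an unsigned type */
          WARN_ON(1);  /* so this sanity check can never fire */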

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251125104751.4c9c7f28@gandalf.local.home
Fixes: 5abb6ccb58f0 ("tracing: Have function graph tracer option sleep-time be per instance")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/all/aR1yRQxDmlfLZzoo@stanley.mountain/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-26 15:13:28 -05:00
Richard Weinberger
84a8d467cc pagemap: update BUDDY flag documentation
Since v4.6 the BUDDY flag is set for _all_ pages in the block
and no longer just for the first one.
This change was introduced by:
commit 832fc1de01ae ("/proc/kpageflags: return KPF_BUDDY for "tail" buddy pages")

Strictly speaking, this was an ABI change, but as nobody has noticed
since 2016, let's just update the documentation.

Link: https://lkml.kernel.org/r/20251122211920.3410371-1-richard@nod.at
Signed-off-by: Richard Weinberger <richard@nod.at>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
b7dd80f8f9 mm: swap: remove scan_swap_map_slots() references from comments
The scan_swap_map_slots() helper has been removed, but several comments
still referred to it in swap allocation and reclaim paths.  This patch
cleans up those outdated references and reflows the affected comment
blocks to match kernel coding style.

Link: https://lkml.kernel.org/r/20251031065011.40863-6-youngjun.park@lge.com
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
4c239d5f59 mm: swap: change swap_alloc_slow() to void
swap_alloc_slow() does not need to return a bool, as all callers handle
allocation results via the entry parameter.  Update the function signature
and remove return statements accordingly.

Link: https://lkml.kernel.org/r/20251031065011.40863-5-youngjun.park@lge.com
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
f1bae15c6a mm, swap: remove redundant comment for read_swap_cache_async
The function now manages get/put_swap_device() internally, making the
comment explaining this behavior to callers unnecessary.

Link: https://lkml.kernel.org/r/20251031065011.40863-4-youngjun.park@lge.com
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
68f78bf55b mm, swap: use SWP_SOLIDSTATE to determine if swap is rotational
The current non-rotational check is unreliable, as the device's
rotational status can be changed by a user via sysfs.

Use the more reliable SWP_SOLIDSTATE flag, which is set at swapon time,
to ensure the nr_rotate_swap count remains consistent.  It is also
simpler and easier to read.
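
The idea, roughly (simplified; the exact code in the patch may differ):

  /* decide rotational vs. non-rotational from the swapon-time flag */
  if (!(si->flags & SWP_SOLIDSTATE))
          atomic_inc(&nr_rotate_swap);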

Link: https://lkml.kernel.org/r/20251031065011.40863-3-youngjun.park@lge.com
Fixes: 81a0298bdfab ("mm, swap: don't use VMA based swap readahead if HDD is used as swap")
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
cb65082a0a mm, swap: fix memory leak in setup_clusters() error path
Patch series "mm: swap: small fixes and comment cleanups", v2.

This series provides a few small fixes and cleanups for the swap code.

The first patch fixes a memory leak in an error path that was recently
introduced. The subsequent patches include minor logic adjustments and
the removal of redundant comments.


This patch (of 5):

setup_clusters() could leak 'cluster_info' memory if an error occurred on
a path that did not jump to the 'err_free' label.

This patch simplifies the error handling by removing the goto label and
instead calling free_cluster_info() on all error exit paths.

The new logic is safe, as free_cluster_info() already handles NULL pointer
inputs.

Link: https://lkml.kernel.org/r/20251031065011.40863-1-youngjun.park@lge.com
Link: https://lkml.kernel.org/r/20251031065011.40863-2-youngjun.park@lge.com
Fixes: 07adc4cf1ecd ("mm, swap: implement dynamic allocation of swap table")
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Reviewed-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Youngjun Park
c230719523 mm/swap: fix wrong plist empty check in swap_alloc_slow()
swap_alloc_slow() was checking `si->avail_list` instead of
`next->avail_list` when verifying if the next swap device is still in the
list, which could cause unnecessary restarts during allocation.
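
The gist of the fix as a diff-style fragment (context simplified):

  -       if (plist_node_empty(&si->avail_list))    /* wrong device checked */
  +       if (plist_node_empty(&next->avail_list))  /* check the next device */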

Link: https://lkml.kernel.org/r/20251119114136.594108-1-youngjun.park@lge.com
Fixes: 8e689f8ea45f ("mm/swap: do not choose swap device according to numa node")
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Acked-by: Kairui Song <kasong@tencent.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:56 -08:00
Dan Carpenter
a9ce09b157 mm/damon/tests/sysfs-kunit: fix use after free on error path
Re-order these frees to avoid dereferencing "sysfs_target" after it has
been freed.

Link: https://lkml.kernel.org/r/aSBq5uSPIqsqH8zO@stanley.mountain
Fixes: ee131696794c ("mm/damon/tests/sysfs-kunit: handle alloc failures on damon_sysfs_test_add_targets()")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Gregory Price
348ced3da5 hugetlb: add __read_mostly to sysctl_hugetlb_shm_group
sysctl bits are mostly-read values.

Link: https://lkml.kernel.org/r/20251121194859.265259-2-gourry@gourry.net
Signed-off-by: Gregory Price <gourry@gourry.net>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Lorenzo Stoakes
ccf9eb326b tools/testing/vma: add missing stub
vm_flags_reset() is not available in the userland VMA tests, so add a stub
which const-casts vma->vm_flags and avoids the upcoming removal of the
vma->__vm_flags field.

Link: https://lkml.kernel.org/r/4aff8bf7-d367-4ba3-90ad-13eef7a063fa@lucifer.local
Fixes: c5c67c1de357 ("tools/testing/vma: eliminate dependency on vma->__vm_flags")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Yuwen Chen
04d31610a7 zram: fix the issue that the write-back limits might overflow
When the page size exceeds 4KB, if bd_wb_limit is set to a value that is
not aligned with the page size, it will cause a numerical wrap-around
issue for bd_wb_limit.  For example, when the page size is set to 16KB and
bd_wb_limit is set to 3, after one write-back operation, the value of
bd_wb_limit will become -1.  More seriously, since bd_wb_limit is an
unsigned number, its value may become as large as 2^64 - 1.

The core reason for this problem is that the unit of bd_wb_limit is 4KB. 
For example, when a write-back occurs on a system with a page size of
16KB, 4 needs to be subtracted from bd_wb_limit.  This operation takes
place in the zram_account_writeback_submit function.

This patch fixes the issue by limiting bd_wb_limit to be an integer
multiple of PAGE_SIZE / 4096.
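
One way to express that constraint (illustrative only; the variable and
helper used in the patch may differ):

  /* keep bd_wb_limit an integer multiple of PAGE_SIZE / 4096 */
  bd_wb_limit = rounddown(bd_wb_limit, PAGE_SIZE / 4096);

With 16KB pages the step size is 4, so a single page write-back can no
longer push the counter below zero.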

Link: https://lkml.kernel.org/r/tencent_5936CFE72BAB2BA76887BB69DCC1B5E67C05@qq.com
Fixes: 1d69a3f8ae77 ("zram: idle writeback fixes and cleanup")
Signed-off-by: Yuwen Chen <ywen.chen@foxmail.com>
Acked-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Richard Chang <richardycc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Matthew Wilcox (Oracle)
ecf371b2ca mm: tweak __vma_enter_locked()
Move the commentary on how __vma_enter_locked() behaves from the body of
__vma_start_write() to the head of __vma_enter_locked() and merge it with
the existing documentation.  Also add a call to
mmap_assert_write_locked().

Link: https://lkml.kernel.org/r/20251119042639.3937024-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Xie Yuanbin
31807483d3 mm/memory-failure: remove the selection of RAS
commit 97f0b13452198290799f ("tracing: add trace event for
memory-failure") introduces the selection of RAS in memory-failure.  That
commit only adds a tracing feature; in reality, there is no dependency
between memory-failure and RAS.  RAS increases the size of the bzImage by
8k, which is significant for embedded devices.

Move the memory-failure tracing code from ras_event.h to
memory-failure.h and remove the selection of RAS.

Link: https://lkml.kernel.org/r/20251119095943.67125-1-xieyuanbin1@huawei.com
Signed-off-by: Xie Yuanbin <xieyuanbin1@huawei.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Borislav Petkov <bp@alien8.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Chunyan Zhang
519912bdae dt-bindings: riscv: Add Svrsw60t59b extension description
Add a description for the Svrsw60t59b extension (PTE Reserved for SW
bits 60:59), which was ratified recently in riscv-non-isa/riscv-iommu.

Link: https://lkml.kernel.org/r/20251113072806.795029-7-zhangchunyan@iscas.ac.cn
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Chunyan Zhang
c64da3950c riscv: mm: add userfaultfd write-protect support
The Svrsw60t59b extension allows freeing the PTE reserved bits 60 and 59
for software; this patch uses bit 60 for uffd-wp tracking.

Additionally, for tracking the uffd-wp state as a PTE swap bit, we borrow
bit 4, which is not involved in swap entry computation.

Link: https://lkml.kernel.org/r/20251113072806.795029-6-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Chunyan Zhang
2a3ebad4db riscv: mm: add soft-dirty page tracking support
The Svrsw60t59b extension allows freeing the PTE reserved bits 60 and 59
for software; this patch uses bit 59 for soft-dirty.

To add swap PTE soft-dirty tracking, we borrow bit 3, which is available
for swap PTEs on RISC-V systems.
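
For illustration, the bit assignments described above amount to roughly
the following (the macro names are assumptions, not necessarily the ones
used in the patch):

  #define _PAGE_SOFT_DIRTY      (1UL << 59)  /* Svrsw60t59b SW bit 59 */
  #define _PAGE_SWP_SOFT_DIRTY  (1UL << 3)   /* borrowed swap PTE bit 3 */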

Link: https://lkml.kernel.org/r/20251113072806.795029-5-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Chunyan Zhang
59f6acb4be riscv: add RISC-V Svrsw60t59b extension support
The Svrsw60t59b extension allows freeing the PTE reserved bits 60 and 59
for software to use.

Link: https://lkml.kernel.org/r/20251113072806.795029-4-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:55 -08:00
Chunyan Zhang
f59c0924d6 mm: userfaultfd: add pgtable_supports_uffd_wp()
Some platforms can customize the PTE/PMD entry uffd-wp bit, making it
unavailable even if the architecture provides the resource.  This patch
adds a macro API, pgtable_supports_uffd_wp(), that allows architectures
to define their specific implementations to check whether the uffd-wp bit
is available on the device the kernel is running on.

This patch also removes "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and
"ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of pgtable_supports_uffd_wp()
and uffd_supports_wp_marker() checks respectively, which default to
IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) and
"IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) &&
IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP)" if not overridden by the
architecture; no change in behavior is expected.
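
When an architecture does not provide its own definition, the default
described above boils down to something like (the exact guard may
differ):

  #ifndef pgtable_supports_uffd_wp
  #define pgtable_supports_uffd_wp() \
          IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP)
  #endif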

Link: https://lkml.kernel.org/r/20251113072806.795029-3-zhangchunyan@iscas.ac.cn
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor.dooley@microchip.com>
Cc: Conor Dooley <conor@kernel.org>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Chunyan Zhang
277a1ae387 mm: softdirty: add pgtable_supports_soft_dirty()
Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15.

This patchset adds support for the Svrsw60t59b [1] extension, which is
now ratified, and also adds soft-dirty and userfaultfd write-protect
tracking for RISC-V.

Patches 1 and 2 add macros that allow architectures to define their own
checks for whether the soft-dirty / uffd_wp PTE bits are available; in
other words, for RISC-V, whether the Svrsw60t59b extension is supported
on the device the kernel is running on.  Patches 1-2 also remove "ifdef
CONFIG_MEM_SOFT_DIRTY", "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and
"ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of checks which, if not
overridden by the architecture, result in no change in behavior.

This patchset has been tested with the kselftest mm suite, in which
soft-dirty, madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run
and pass, and no regressions are observed in any of the other tests.


This patch (of 6):

Some platforms can customize the PTE/PMD entry soft-dirty bit, making it
unavailable even if the architecture provides the resource.

Add an API with which architectures can define their specific
implementations to detect whether the soft-dirty bit is available on the
device the kernel is running on.

This patch removes "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of
pgtable_supports_soft_dirty() checks that default to
IS_ENABLED(CONFIG_MEM_SOFT_DIRTY); if not overridden by the architecture,
no change in behavior is expected.

We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(),
so we will never run into VM_SOFTDIRTY checks.
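
Sketched the same way, the default check described here amounts to
something like (the exact guard may differ):

  #ifndef pgtable_supports_soft_dirty
  #define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)
  #endif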

[lorenzo.stoakes@oracle.com: fix VMA selftests]
  Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local
Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn
Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn
Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1]
Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Conor Dooley <conor@kernel.org>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Alexandre Ghiti <alexghiti@rivosinc.com>
Cc: Andrew Jones <ajones@ventanamicro.com>
Cc: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Vishal Moola (Oracle)
d85b653f2c mm/vmalloc: cleanup gfp flag use in new_vmap_block()
The only caller, vb_alloc(), passes GFP_KERNEL into new_vmap_block() which
is a subset of GFP_RECLAIM_MASK.  Since there's no reason to use this mask
here, remove it.

Link: https://lkml.kernel.org/r/20251121094405.40628-5-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Vishal Moola (Oracle)
75f20b1744 mm/vmalloc: cleanup large_gfp in vm_area_alloc_pages()
Now that we have already checked for unsupported flags, we can use the
helper function to set the necessary gfp flags for the large order
allocation optimization.

Link: https://lkml.kernel.org/r/20251121094405.40628-4-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Vishal Moola (Oracle)
bb4d3c7686 mm/vmalloc: add a helper to optimize vmalloc allocation gfps
vm_area_alloc_pages() attempts to use different gfp flags as a way to
optimize allocations.  This has been done inline which makes things harder
to read.

Add a helper function to make the code more readable.

Link: https://lkml.kernel.org/r/20251121094405.40628-3-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Vishal Moola (Oracle)
07003531e0 mm/vmalloc: warn on invalid vmalloc gfp flags
Patch series "make vmalloc gfp flags usage more apparent", v4.

We should do a better job at enforcing gfp flags for vmalloc.  Right now,
we have a kernel-doc for __vmalloc_node_range(), and hope callers pass in
supported flags.  If a caller were to pass in an unsupported flag, we may
BUG, silently clear it, or completely ignore it.

If we are more proactive about enforcing gfp flags, we can make sure
callers know when they may be asking for unsupported behavior.

This patchset lets vmalloc control the incoming gfp flags, and cleans up
some hard to read gfp code.


This patch (of 4):

Vmalloc explicitly supports a list of flags, but we never enforce them. 
vmalloc has been trying to handle unsupported flags by clearing and
setting flags wherever necessary.  This is messy and makes the code harder
to understand, when we could simply check for a supported input
immediately instead.

Define a helper mask and function telling callers they have passed in
invalid flags, and clear those unsupported vmalloc flags.
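
A minimal sketch of the idea, assuming a hypothetical supported-flags
mask (the names and the exact mask are not taken from the patch):

  /* hypothetical mask of gfp flags that vmalloc is willing to honor */
  #define GFP_VMALLOC_SUPPORTED  (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT | \
                                  __GFP_NOFAIL | __GFP_ZERO | __GFP_NOWARN)

  static gfp_t vmalloc_fix_flags(gfp_t flags)
  {
          gfp_t invalid = flags & ~GFP_VMALLOC_SUPPORTED;

          /* tell the caller, then clear what vmalloc cannot honor */
          WARN_ONCE(invalid, "vmalloc: unsupported gfp flags %pGg\n", &invalid);
          return flags & GFP_VMALLOC_SUPPORTED;
  }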

Link: https://lkml.kernel.org/r/20251121094405.40628-1-vishal.moola@gmail.com
Link: https://lkml.kernel.org/r/20251121094405.40628-2-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Acked-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Shakeel Butt
c1bd09994c memcg: remove __lruvec_stat_mod_folio
__lruvec_stat_mod_folio() is already safe against irqs, so there is no
need to have a separate interface (i.e.  lruvec_stat_mod_folio) which
wraps calls to it with irq disabling and reenabling.  Let's rename
__lruvec_stat_mod_folio() to lruvec_stat_mod_folio().

Link: https://lkml.kernel.org/r/20251110232008.1352063-5-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Shakeel Butt
5b3eb779a2 memcg: remove __mod_lruvec_state
__mod_lruvec_state() is already safe against irqs, so there is no need to
have a separate interface (i.e.  mod_lruvec_state) which wraps calls to it
with irq disabling and reenabling.  Let's rename __mod_lruvec_state() to
mod_lruvec_state().

Link: https://lkml.kernel.org/r/20251110232008.1352063-4-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Shakeel Butt
469241fe76 memcg: remove __mod_lruvec_kmem_state
__mod_lruvec_kmem_state() is already safe against irqs, so there is no
need to have a separate interface (i.e.  mod_lruvec_kmem_state) which
wraps calls to it with irq disabling and reenabling.  Let's rename
__mod_lruvec_kmem_state() to mod_lruvec_kmem_state().

Link: https://lkml.kernel.org/r/20251110232008.1352063-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:54 -08:00
Shakeel Butt
7e44d00a13 memcg: use mod_node_page_state to update stats
Patch series "memcg: cleanup the memcg stats interfaces".

The memcg stats are safe against irq (and nmi) context and thus do not
require disabling irqs.  However, some stats which are also maintained at
the node level use an irq-unsafe interface and thus still require the
users to disable irqs or use interfaces which explicitly disable irqs.
Let's move the memcg code to the irq-safe node level stats function,
which is already optimized for architectures with HAVE_CMPXCHG_LOCAL (all
major ones), so there will not be any performance penalty for its usage.


This patch (of 4):

The memcg stats are safe against irq (and nmi) context and thus do not
require disabling irqs.  However, some code paths for memcg stats also
update the node level stats and use an irq-unsafe interface, thus
requiring the users to disable irqs.  However, node level stats, on
architectures with HAVE_CMPXCHG_LOCAL (all major ones), have an interface
which does not require irq disabling.  Let's move the memcg stats code to
start using that interface for node level stats.
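
The gist of the change, as a diff-style fragment:

  -       __mod_node_page_state(pgdat, idx, val);  /* caller must disable irqs */
  +       mod_node_page_state(pgdat, idx, val);    /* irq-safe with HAVE_CMPXCHG_LOCAL */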

Link: https://lkml.kernel.org/r/20251110232008.1352063-1-shakeel.butt@linux.dev
Link: https://lkml.kernel.org/r/20251110232008.1352063-2-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Peng Li
3e700b715e selftests/mm: gup_test: fix comment regarding origin of FOLL_WRITE
The copied 'FOLL_WRITE' definition is located in mm's mm_types.h, not
mm.h, so fix the comment.

Link: https://lkml.kernel.org/r/20251117154012.197499-2-peng8420.li@gmail.com
Signed-off-by: Peng Li <peng8420.li@gmail.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Peng Li
218fbfad16 selftests/mm: gup_test: stop testing FOLL_TOUCH
commit 0f20bba1688b ("mm/gup: explicitly define and check internal GUP
flags, disallow FOLL_TOUCH") marked FOLL_TOUCH as a GUP-internal flag.

This causes a warning to fire when running gup_test, for example:

    $ ./gup_test -L -r 100 -z

dmesg:
    WARNING: CPU: 1 PID: 117 at mm/gup.c:2512 is_valid_gup_args+0x66/0x8c

Therefore, remove the "FOLL_TOUCH" test code from gup_test.c.

Link: https://lkml.kernel.org/r/20251117154012.197499-1-peng8420.li@gmail.com
Signed-off-by: Peng Li <peng8420.li@gmail.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Balbir Singh
cab812d9c9 mm/huge_memory.c: introduce folio_split_unmapped
Unmapped was added as a parameter to __folio_split() and related call
sites to support splitting of folios already in the midst of a migration. 
This special case arose for device private folio migration since during
migration there could be a disconnect between source and destination on
the folio size.

Introduce folio_split_unmapped() to handle this special case.  Also
refactor code and add __folio_freeze_and_split_unmapped() helper that is
common to both __folio_split() and folio_split_unmapped().

This in turn removes the special casing introduced by the unmapped
parameter in __folio_split().

[balbirs@nvidia.com: v2]
  Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com
[balbirs@nvidia.com: fix clang-20 build]
  Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com
[akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir]
Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Suggested-by: Zi Yan <ziy@nvidia.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Baolin Wang
8826f09616 mm: shmem: allow fallback to smaller large orders for tmpfs mmap() access
After commit 69e0a3b49003 ("mm: shmem: fix the strategy for the tmpfs
'huge=' options"), we have fixed the large order allocation strategy for
tmpfs, which always tries PMD-sized large folios first, and if that fails,
falls back to smaller large folios.  For tmpfs large folio allocation via
mmap(), we should maintain the same strategy as well.  Let's unify the
large order allocation strategy for tmpfs.

There is no functional change for large folio allocation of anonymous shmem.

Link: https://lkml.kernel.org/r/283a0bdfd6ac7aa334a491422bcae70919c572bd.1763008453.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
1b1a4e4d67 zram: read slot block idx under slot lock
Read the slot's block idx under the slot-lock.  We release the slot-lock
for the bdev read so, technically, the slot can still get freed in the
meantime, but at least we will read the bdev block (page) that holds the
previously known slot data, not the slot->handle bdev block, which can be
anything at that point.

Link: https://lkml.kernel.org/r/20251122074029.3948921-7-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
e87ddea345 zram: rework bdev block allocation
First, writeback bdev ->bitmap bits are set only from one context, as we
can have only a single task performing writeback, so we cannot race with
anything else.  Remove the retry path.

Second, we always check the ZRAM_WB flag to distinguish written-back
slots, so we should not confuse a 0 bdev block index with a 0 handle.  We
can use the first bdev block (bit 0) for writeback as well.

While at it, give the functions slightly more accurate names; we don't
alloc/free anything there, we reserve a block for async writeback or
release the block.

Link: https://lkml.kernel.org/r/20251122074029.3948921-6-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
a4f506c569 zram: drop wb_limit_lock
We don't need wb_limit_lock.  Writeback limit setters take an exclusive
write zram init_lock, while wb_limit modifications happen only from a
single task and under zram read init_lock.  No concurrent wb_limit
modifications are possible (we permit only one post-processing task at a
time).  Add lockdep assertions to wb_limit mutators.

While at it, fixup coding styles.

Link: https://lkml.kernel.org/r/20251122074029.3948921-5-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
7c929664fd zram: take write lock in wb limit store handlers
Device attr write handlers should take the zram init_lock for writing.
While at it, fixup coding styles.

Link: https://lkml.kernel.org/r/20251122074029.3948921-4-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
e828cccb72 zram: add writeback batch size device attr
Introduce writeback_batch_size device attribute so that the maximum number
of in-flight writeback bio requests can be configured at run-time
per-device.  This essentially enables batched bio writeback.

Link: https://lkml.kernel.org/r/20251122074029.3948921-3-senozhatsky@chromium.org
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Reviewed-by: Brian Geffon <bgeffon@google.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Yuwen Chen <ywen.chen@foxmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:53 -08:00
Sergey Senozhatsky
f405066a1f zram: introduce writeback bio batching
Patch series "zram: introduce writeback bio batching", v6.

As writeback is becoming more and more common the longstanding limitations
of zram writeback throughput are becoming more visible.  Introduce
writeback bio batching so that multiple writeback bios can be processed
simultaneously.


This patch (of 6):

As was stated in a comment [1], single-page writeback IO is not
efficient, but it works.  It's time to address this throughput limitation
as writeback becomes used more often.  Introduce batched (multiple) bio
writeback support to take advantage of parallel request processing and
better request scheduling.

The approach used in this patch doesn't use a dedicated kthread like in
[2], or blk-plug like in [3].  A dedicated kthread adds complexity, which
can be avoided.  Apart from that, not all zram setups use writeback, so
having numerous per-device kthreads (on systems that create multiple zram
devices) hanging around is not the most optimal thing to do.  blk-plug,
on the other hand, works best when requests are sequential, which doesn't
particularly fit zram writeback IO patterns: zram writeback IO patterns
are expected to be random, due to how bdev block reservation/release are
handled.  The blk-plug approach also works in cycles: idle IO, while zram
sets up requests in a batch, is followed by bursts of IO, when zram
submits the entire batch.

Instead we use a batch of requests and submit a new bio as soon as one of
the in-flight requests completes.

For the time being, the writeback batch size (the maximum number of in-flight
bio requests) is set to 32 for all devices.  A follow-up patch adds a
writeback_batch_size device attribute, so the batch size becomes run-time
configurable.
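
A minimal sketch of the batching idea (not the actual zram code; all names
below are illustrative): keep up to a batch-sized number of bios in flight
and submit the next bio as soon as any in-flight bio completes.

#define WB_BATCH_SIZE   32

static atomic_t wb_inflight = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(wb_wait);

static void wb_end_io(struct bio *bio)
{
        bio_put(bio);
        /* A slot freed up, let the submitter push the next bio. */
        if (atomic_dec_return(&wb_inflight) < WB_BATCH_SIZE)
                wake_up(&wb_wait);
}

/* Called by the single post-processing task for each writeback bio. */
static void wb_submit(struct bio *bio)
{
        /* Block only while the whole batch is already in flight. */
        wait_event(wb_wait, atomic_read(&wb_inflight) < WB_BATCH_SIZE);
        atomic_inc(&wb_inflight);
        bio->bi_end_io = wb_end_io;
        submit_bio(bio);
}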

Link: https://lkml.kernel.org/r/20251122074029.3948921-1-senozhatsky@chromium.org
Link: https://lkml.kernel.org/r/20251122074029.3948921-2-senozhatsky@chromium.org
Link: https://lore.kernel.org/all/20181203024045.153534-6-minchan@kernel.org/ [1]
Link: https://lore.kernel.org/all/20250731064949.1690732-1-richardycc@google.com/ [2]
Link: https://lore.kernel.org/all/tencent_78FC2C4FE16BA1EBAF0897DB60FCD675ED05@qq.com/ [3]
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Co-developed-by: Yuwen Chen <ywen.chen@foxmail.com>
Co-developed-by: Richard Chang <richardycc@google.com>
Suggested-by: Minchan Kim <minchan@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: Richard Chang <richardycc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Wei Yang
9e01407708 mm/khugepaged: unify SCAN_PMD_NONE and SCAN_PMD_NULL into SCAN_NO_PTE_TABLE
The current hugepage collapse scan results include two separate values,
SCAN_PMD_NONE and SCAN_PMD_NULL, which are handled identically by the
consuming code.

To reduce confusion and improve long-term maintenance, this commit merges
these two functionally equivalent states into a single, clearer
identifier: SCAN_NO_PTE_TABLE.

Link: https://lkml.kernel.org/r/20251114030028.7035-4-richard.weiyang@gmail.com
Suggested-by: "David Hildenbrand (Red Hat)" <david@kernel.org>
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Nico Pache <npache@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Wei Yang
f1040f8898 mm/khugepaged: continue to collapse on SCAN_PMD_NONE
SCAN_PMD_NONE means the current pmd is empty, but we can still continue
collapsing the next pmd range.

Link: https://lkml.kernel.org/r/20251114030028.7035-3-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Wei Yang
eaa4c8063f mm/khugepaged: remove redundant clearing of struct collapse_control
Patch series "unify PMD scan results and remove redundant cleanup", v2.

This small series addresses two minor cleanup opportunities in the
hugepage collapse logic.

The initial motivation arose during a code review of madvise_collapse(),
where it was noted that the function was missing a handler for
SCAN_PMD_NONE.  This oversight exposed the inconsistent handling of
SCAN_PMD_NULL and SCAN_PMD_NONE.

Since both scan results are functionally identical (they indicate the
absence of a PTE table), the primary patch unifies them into a single,
clearer identifier, SCAN_NO_PTE_TABLE.

The series also takes the opportunity to remove a redundant clearing of
the struct collapse_control.


This patch (of 3):

The struct collapse_control is being unnecessarily cleared twice
during the huge page collapse process.

Both hpage_collapse_scan_file() and hpage_collapse_scan_pmd() currently
perform a clear operation on this structure.

Remove the redundant clear operation.

Link: https://lkml.kernel.org/r/20251114030028.7035-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251114030028.7035-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Nico Pache <npache@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Qi Zheng
46156dba32 mm: thp: reparent the split queue during memcg offline
Similar to list_lru, the split queue is relatively independent and does
not need to be reparented along with objcg and LRU folios (holding the objcg
lock and lru lock).  So let's apply the same mechanism as list_lru to
reparent the split queue separately when the memcg goes offline.

This is also a preparation for reparenting LRU folios.

Link: https://lkml.kernel.org/r/8703f907c4d1f7e8a2ef2bfed3036a84fa53028b.1762762324.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Muchun Song
776bde7caf mm: thp: use folio_batch to handle THP splitting in deferred_split_scan()
The maintenance of the folio->_deferred_list is intricate because it's
reused in a local list.

Here are some peculiarities:

   1) When a folio is removed from its split queue and added to a local
      on-stack list in deferred_split_scan(), the ->split_queue_len isn't
      updated, leading to an inconsistency between it and the actual
      number of folios in the split queue.

   2) When the folio is split via split_folio() later, it's removed from
      the local list while holding the split queue lock. At this time,
      the lock is not needed as it is not protecting anything.

   3) To handle the race condition with a third-party freeing or migrating
      the preceding folio, we must ensure there's always one safe (with
      raised refcount) folio before by delaying its folio_put(). More
      details can be found in commit e66f3185fa04 ("mm/thp: fix deferred
      split queue not partially_mapped"). It's rather tricky.

We can use the folio_batch infrastructure to handle this clearly.  In this
case, ->split_queue_len will be consistent with the real number of folios
in the split queue.  If list_empty(&folio->_deferred_list) returns false,
it's clear the folio must be in its split queue (not in a local list
anymore).

In the future, we will reparent LRU folios during memcg offline to
eliminate dying memory cgroups, which requires reparenting the split queue
to its parent first.  So this patch prepares for using
folio_split_queue_lock_irqsave() as the memcg may change then.
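
A simplified, conceptual sketch of the folio_batch approach (not the actual
deferred_split_scan() code): the batch holds a reference on each folio taken
off the split queue, so ->split_queue_len can be decremented immediately and
no "keep one safe folio ahead" trick is needed.

static void split_scan_sketch(struct deferred_split *queue)
{
        struct folio_batch fbatch;
        struct folio *folio, *next;
        unsigned int i;

        folio_batch_init(&fbatch);

        spin_lock_irq(&queue->split_queue_lock);
        list_for_each_entry_safe(folio, next, &queue->split_queue,
                                 _deferred_list) {
                if (!folio_try_get(folio))
                        continue;
                list_del_init(&folio->_deferred_list);
                queue->split_queue_len--;
                if (!folio_batch_add(&fbatch, folio))
                        break;          /* batch is full */
        }
        spin_unlock_irq(&queue->split_queue_lock);

        for (i = 0; i < folio_batch_count(&fbatch); i++) {
                folio = fbatch.folios[i];
                if (folio_trylock(folio)) {
                        split_folio(folio);     /* may fail; ignored here */
                        folio_unlock(folio);
                }
        }
        folio_batch_release(&fbatch);           /* drops the references */
}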

Link: https://lkml.kernel.org/r/59cb6b6fb5ffcff9d23b81890b252960139ad8e7.1762762324.git.zhengqi.arch@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Muchun Song
ad7c7f4576 mm: thp: introduce folio_split_queue_lock and its variants
In future memcg removal, the binding between a folio and a memcg may
change, making the split lock within the memcg unstable when held.

A new approach is required to reparent the split queue to its parent. 
This patch starts introducing a unified way to acquire the split lock for
future work.

It's a code-only refactoring with no functional changes.

Link: https://lkml.kernel.org/r/a31a90bcac04dc754f775e87ae3205be3170b571.1762762324.git.zhengqi.arch@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nico Pache <npache@redhat.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Muchun Song
fd603ae11e mm: thp: replace folio_memcg() with folio_memcg_charged()
Patch series "reparent the THP split queue", v6.

In the future, we will reparent LRU folios during memcg offline to
eliminate dying memory cgroups, which requires reparenting the THP split
queue to its parent memcg.

Similar to list_lru, the split queue is relatively independent and does
not need to be reparented along with objcg and LRU folios (holding the objcg
lock and lru lock).  Therefore, we can apply the same mechanism as
list_lru to reparent the split queue first when the memcg goes offline.

The first three patches in this series are separated from the series
"Eliminate Dying Memory Cgroup" [1], mainly to do some cleanup and
preparatory work.

The last patch reparents the THP split queue to its parent memcg during
memcg offline.


This patch (of 4):

folio_memcg_charged() is intended for use when the caller only needs to know
whether the folio is charged to a memcg and does not need the returned memcg
pointer itself.  It is more efficient than folio_memcg().  Therefore, replace
folio_memcg() with folio_memcg_charged().
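
A sketch of the substitution (the caller below is illustrative, not a
specific in-tree function):

static bool folio_is_charged_sketch(struct folio *folio)
{
        /* Previously: return folio_memcg(folio) != NULL; */
        return folio_memcg_charged(folio);
}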

Link: https://lkml.kernel.org/r/56624d537520e33e5a6b3755238b3dfb959a52ee.1762762324.git.zhengqi.arch@bytedance.com
Link: https://lore.kernel.org/all/20250415024532.26632-1-songmuchun@bytedance.com/ [1]
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Lorenzo Stoakes
a3a3e215c9 mm: replace remaining pte_to_swp_entry() with softleaf_from_pte()
There are straggler invocations of pte_to_swp_entry() lying around;
replace all of these with the software leaf entry equivalent -
softleaf_from_pte().

With those removed, eliminate pte_to_swp_entry() altogether.

No functional change intended.

Link: https://lkml.kernel.org/r/d8ee5ccefe4c42d7c4fe1a2e46f285ac40421cd3.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Lorenzo Stoakes
93976a2034 mm: eliminate further swapops predicates
Having converted so much of the code base to software leaf entries, we can
mop up some remaining cases.

We replace is_pfn_swap_entry(), pfn_swap_entry_to_page(),
is_writable_device_private_entry(), is_device_exclusive_entry(),
is_migration_entry(), is_writable_migration_entry(),
is_readable_migration_entry(), swp_offset_pfn() and pfn_swap_entry_folio()
with softleaf equivalents.

No functional change intended.

Link: https://lkml.kernel.org/r/956bc9c031604811c0070d2f4bf2f1373f230213.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:52 -08:00
Lorenzo Stoakes
03bfbc3ad6 mm: remove is_hugetlb_entry_[migration, hwpoisoned]()
We do not need explicit helper functions for these; they add a
level of confusion and indirection when we can simply use software leaf
entry logic here instead and spell out the special huge_pte_none() case we
must consider.

No functional change intended.

Link: https://lkml.kernel.org/r/0e92d6924d3de88cd014ce1c53e20edc08fc152e.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
9ff30bb9ab mm: remove non_swap_entry() and use softleaf helpers instead
There is simply no need for the hugely confusing concept of 'non-swap'
swap entries now that we have the concept of softleaf entries and the
relevant softleaf_xxx() helpers.

Adjust all callers to use these instead and remove non_swap_entry()
altogether.

No functional change intended.

Link: https://lkml.kernel.org/r/2562093f37f4a9cffea0447058014485eb50aaaf.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
c0a80c2ce6 mm: remove remaining is_swap_pmd() users and is_swap_pmd()
Update copy_huge_pmd() and change_huge_pmd() to use
pmd_is_valid_softleaf() - as this checks for the only valid non-present
huge PMD states.

Also update mm/debug_vm_pgtable.c to explicitly test for a valid leaf PMD
entry (which it did not do before, which was incorrect), and have it test
against pmd_is_huge() and pmd_is_valid_softleaf() rather than
is_swap_pmd().

With these changes done there are no further users of is_swap_pmd(), so
remove it.

Link: https://lkml.kernel.org/r/1628b00b00c8498bbd2c20b82117ee87845fb738.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
15eabc898d mm: introduce pmd_is_huge() and use where appropriate
The leaf entry PMD case is confusing as only migration entries and device
private entries are valid at PMD level, not true swap entries.

We repeatedly perform checks of the form is_swap_pmd() || pmd_trans_huge(),
which is itself confusing - it implies that leaf entries at PMD level
exist and are different from huge entries.

Address this confusion by introducing pmd_is_huge(), which checks for either
case.  Sadly, due to header dependency issues (huge_mm.h is included very
early on in headers and cannot really rely on much else) we cannot use
pmd_is_valid_softleaf() here.

However, since these are the only valid, handled cases, the function
still achieves what it intends to do.

We then replace all instances of is_swap_pmd() || pmd_trans_huge() with
pmd_is_huge() invocations and adjust logic accordingly to accommodate
this.

No functional change intended.
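
A hedged sketch of what the helper boils down to, based on the description
above (the exact in-tree definition may differ):

static inline bool pmd_is_huge(pmd_t pmd)
{
        /*
         * Either a present transparent huge PMD, or a non-present,
         * non-none leaf entry (migration or device private entry).
         */
        return pmd_trans_huge(pmd) ||
               (!pmd_none(pmd) && !pmd_present(pmd));
}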

Link: https://lkml.kernel.org/r/00f79db3b15293cac8f7040a48d69c52d00117e4.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
0ac881efe1 mm: replace pmd_to_swp_entry() with softleaf_from_pmd()
Introduce softleaf_from_pmd() to do the equivalent operation for PMDs that
softleaf_from_pte() fulfils, and cascade changes through code base
accordingly, introducing helpers as necessary.

We are then able to eliminate pmd_to_swp_entry(),
is_pmd_migration_entry(), is_pmd_device_private_entry() and
is_pmd_non_present_folio_entry().

This further establishes the use of leaf operations throughout the code
base and further establishes the foundations for eliminating
is_swap_pmd().

No functional change intended.

[lorenzo.stoakes@oracle.com: check writable, not readable/writable, per Vlastimil]
  Link: https://lkml.kernel.org/r/cd97b6ec-00f9-45a4-9ae0-8f009c212a94@lucifer.local
Link: https://lkml.kernel.org/r/3fb431699639ded8fdc63d2210aa77a38c8891f1.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
5dfa791605 mm/huge_memory: refactor change_huge_pmd() non-present logic
Similar to copy_huge_pmd(), there is a large mass of open-coded logic for
the CONFIG_ARCH_ENABLE_THP_MIGRATION non-present entry case that does not
use thp_migration_supported() consistently.

Resolve this by separating out this logic and introduce
change_non_present_huge_pmd().

No functional change intended.

Link: https://lkml.kernel.org/r/451b85636ad711e307fdfbff19af699fdab4d05f.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
e244d82d02 mm/huge_memory: refactor copy_huge_pmd() non-present logic
Right now we are inconsistent in our use of thp_migration_supported():

static inline bool thp_migration_supported(void)
{
	return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

And simply having arbitrary and ugly #ifdef
CONFIG_ARCH_ENABLE_THP_MIGRATION blocks in code.

This is exhibited in copy_huge_pmd(), which inserts a large #ifdef
CONFIG_ARCH_ENABLE_THP_MIGRATION block and an if-branch that is difficult
to follow.

It's difficult to follow the logic of such a large function, and the
non-present PMD logic is clearly separate as it sits in a giant if-branch.

Therefore this patch both separates out the logic and utilises
thp_migration_supported().

No functional change intended.

Link: https://lkml.kernel.org/r/6eaadc23ed512d370ede65561e34e96241c54b9d.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
aa62204cb6 mm: avoid unnecessary use of is_swap_pmd()
PMD 'non-swap' swap entries are currently used for PMD-level migration
entries and device private entries.

To add to the confusion in this terminology, we use is_swap_pmd() in an
inconsistent way similar to how is_swap_pte() was being used - sometimes
adopting the convention that !pmd_none() && !pmd_present() implies a PMD
'swap' entry, sometimes not.

This patch handles the low-hanging fruit of cases where we can simply
substitute other predicates for is_swap_pmd().

No functional change intended.

Link: https://lkml.kernel.org/r/8a1704b36a009c18032d5bea4cb68e71448fbbe5.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
de4d6c9491 fs/proc/task_mmu: refactor pagemap_pmd_range()
Separate out THP logic so we can drop an indentation level and reduce the
amount of noise in this function.

We add pagemap_pmd_range_thp() for this purpose.

While we're here, convert the VM_BUG_ON() to a VM_WARN_ON_ONCE() at the
same time.

No functional change intended.

Link: https://lkml.kernel.org/r/f9ce7f3bb57e3627288225e23f2498cc5315f5ab.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:51 -08:00
Lorenzo Stoakes
fb410d8b89 mm: use leaf entries in debug pgtable + remove is_swap_pte()
Remove invocations of is_swap_pte() in mm/debug_vm_pgtable.c and use
softleaf_from_pte() and softleaf_is_swap() as necessary to replace this
usage.

We update the test code to use a 'true' swap entry throughout, so we are
guaranteed this is not a non-swap entry and all asserts continue to
operate correctly.

With this change in place, we no longer use is_swap_pte() anywhere, so
remove it.

Link: https://lkml.kernel.org/r/222f352e7a99191b4bdfa77e835f2fc0dd83fa72.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Lorenzo Stoakes
06fb61462b mm: eliminate is_swap_pte() when softleaf_from_pte() suffices
In cases where we can simply utilise the fact that softleaf_from_pte()
treats present entries as if they were none entries and thus eliminate
spurious uses of is_swap_pte(), do so.

No functional change intended.

Link: https://lkml.kernel.org/r/92ebab9567978155116804c67babc3c64636c403.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Lorenzo Stoakes
fb888710e2 mm: avoid unnecessary uses of is_swap_pte()
There's an established convention in the kernel that we treat PTEs as
containing swap entries (and the unfortunately named non-swap swap
entries) should they be neither empty (i.e.  pte_none() evaluating true)
nor present (i.e.  pte_present() evaluating true).

However, there is some inconsistency in how this is applied, as we also
have the is_swap_pte() helper which explicitly performs this check:

	/* check whether a pte points to a swap entry */
	static inline int is_swap_pte(pte_t pte)
	{
		return !pte_none(pte) && !pte_present(pte);
	}

As this represents a predicate, it is logical to assume that, in order
to establish that a PTE entry can correctly be manipulated as a
swap/non-swap entry, this predicate must first be checked.

But instead, we far more often utilise the established convention of
checking pte_none() / pte_present() before operating on entries as if they
were swap/non-swap.

This patch works towards correcting this inconsistency by removing all
uses of is_swap_pte() where we are already in a position where we perform
pte_none()/pte_present() checks anyway, or where it is otherwise clearly
logical to do so.

We also take advantage of the fact that pte_swp_uffd_wp() is only set on
swap entries.

Additionally, update comments referencing to is_swap_pte() and
non_swap_entry().

No functional change intended.
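
The resulting pattern, in sketch form (the caller is hypothetical, not a
specific in-tree function):

static void handle_pte_sketch(pte_t pte)
{
        if (pte_none(pte))
                return;         /* nothing mapped */

        if (pte_present(pte)) {
                /* Handle a present, hardware-visible mapping. */
                return;
        }

        /*
         * Neither none nor present: a swap or "non-swap" leaf entry;
         * no separate is_swap_pte() check is needed at this point.
         */
}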

Link: https://lkml.kernel.org/r/17fd6d7f46a846517fd455fadd640af47fcd7c55.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Lorenzo Stoakes
68aa2fdbf5 mm: introduce leaf entry type and use to simplify leaf entry logic
The kernel maintains leaf page table entries which contain either:

 - Nothing ('none' entries)
 - Present entries*
 - Everything else that will cause a fault which the kernel handles

* Present entries are either entries the hardware can navigate without a page
  fault, or special cases like NUMA hint protnone entries or PMDs with a
  cleared present bit, which contain hardware-valid entries modulo the
  present bit.

In the 'everything else' group we include swap entries, but we also
include a number of other things such as migration entries, device private
entries and marker entries.

Unfortunately this 'everything else' group expresses everything through a
swp_entry_t type, and these entries are referred to as swap entries even
though they may well not contain a...  swap entry.

This is compounded by the rather mind-boggling concept of a non-swap swap
entry (checked via non_swap_entry()) and the means by which we twist and
turn to satisfy this.

This patch lays the foundation for reducing this confusion.

We refer to 'everything else' as a 'software-defined leaf entry', or
'softleaf' for short.  And in fact we scoop up the 'none' entries into
this concept as well, so we are left with:

- Present entries.
- Softleaf entries (which may be empty).

This allows for radical simplification across the board - one can simply
convert any leaf page table entry to a leaf entry via softleaf_from_pte().

If the entry is present, we return an empty leaf entry, so it is assumed
the caller is aware that they must differentiate between the two
categories of page table entries, checking for the former via
pte_present().
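
To illustrate the intended calling pattern, a minimal sketch (only
softleaf_from_pte(), softleaf_t and pte_present() are taken from the
description above; softleaf_is_none() is an assumed predicate name):

	/* Sketch only, not code from the series. */
	static void example_handle_leaf_entry(pte_t pte)
	{
		softleaf_t entry;

		if (pte_present(pte)) {
			/* Hardware-navigable (or protnone-style) entry. */
			return;
		}

		/* Non-present: convert unconditionally, no predicate needed. */
		entry = softleaf_from_pte(pte);
		if (softleaf_is_none(entry))	/* assumed helper name */
			return;			/* empty ('none') entry */

		/* migration / device-private / marker / swap handling here */
	}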

As a result, we can eliminate a number of places where we would otherwise
need to use predicates to see if we can proceed with leaf page table entry
conversion and instead just go ahead and do it unconditionally.

We do so where we can, adjusting surrounding logic as necessary to
integrate the new softleaf_t logic as far as seems reasonable at this
stage.

We typedef swp_entry_t to softleaf_t for the time being until the
conversion can be complete, meaning everything remains compatible
regardless of which type is used.  We will eventually remove swp_entry_t
when the conversion is complete.

We introduce a new header file to keep things clear - leafops.h - this
imports swapops.h so it can directly replace swapops imports without
issue, and we do so in all the files that require it.

Additionally, add the new leafops.h file to the core mm MAINTAINERS entry.

Link: https://lkml.kernel.org/r/c879383aac77d96a03e4d38f7daba893cd35fc76.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Lorenzo Stoakes
c093cf4510 mm: correctly handle UFFD PTE markers
Patch series "mm: remove is_swap_[pte, pmd]() + non-swap entries,
introduce leaf entries", v3.

There's an established convention in the kernel that we treat leaf page
tables (so far at the PTE, PMD level) as containing 'swap entries' should
they be neither empty (i.e.  p**_none() evaluating true) nor present (i.e.
p**_present() evaluating true).

However, at the same time we also have helper predicates - is_swap_pte(),
is_swap_pmd() - which are inconsistently used.

This is problematic, as it is logical to assume that should somebody wish
to operate upon a page table swap entry they should first check to see if
it is in fact one.

It also implies that perhaps, in future, we might introduce a non-present,
none page table entry that is not a swap entry.

This series resolves this issue by systematically eliminating all use of
the is_swap_pte() and is_swap_pmd() predicates so we retain only the
convention that should a leaf page table entry be neither none nor present
it is a swap entry.

We also have the further issue that 'swap entry' is unfortunately a really
rather overloaded term and in fact refers to both entries for swap and for
other information such as migration entries, page table markers, and
device private entries.

We therefore have the rather 'unique' concept of a 'non-swap' swap entry.

This series therefore introduces the concept of 'software leaf entries',
of type softleaf_t, to eliminate this confusion.

A software leaf entry in this sense is any page table entry which is
non-present, and represented by the softleaf_t type.  That is - page table
leaf entries which are software-controlled by the kernel.

This includes 'none' or empty entries, which are simply represented by an
zero leaf entry value.

In order to maintain compatibility as we transition the kernel to this new
type, we simply typedef swp_entry_t to softleaf_t.
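
As a rough sketch of the transitional arrangement (exact placement and
spelling may differ in the series):

	typedef swp_entry_t softleaf_t;	/* temporary alias until the conversion completes */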

We introduce a number of predicates and helpers to interact with software
leaf entries in include/linux/leafops.h which, as it imports swapops.h,
can be treated as a drop-in replacement for swapops.h wherever leaf entry
helpers are used.

Since softleaf_from_[pte, pmd]() treats present entries as if they were
empty/none leaf entries, this allows for a great deal of simplification of
code throughout the code base, which this series takes advantage of.

We additionally change from swap entry to software leaf entry handling
where it makes sense to and eliminate functions from swapops.h where
software leaf entries obviate the need for the functions.


This patch (of 16):

PTE markers were previously only concerned with UFFD-specific logic - that
is, PTE entries with the UFFD WP marker set or those marked via
UFFDIO_POISON.

However since the introduction of guard markers in commit 7c53dfbdb024
("mm: add PTE_MARKER_GUARD PTE marker"), this has no longer been the case.

Issues have been avoided as guard regions are not permitted in conjunction
with UFFD, but it still leaves very confusing logic in place, most notably
the misleading and poorly named pte_none_mostly() and
huge_pte_none_mostly().

This predicate returns true for PTE entries that ought to be treated as
none, but only in certain circumstances, and on the assumption we are
dealing with H/W poison markers or UFFD WP markers.

This patch removes these functions and makes each invocation of these
functions instead explicitly check what it needs to check.

As part of this effort it introduces is_uffd_pte_marker() to explicitly
determine if a marker in fact is used as part of UFFD or not.

In the HMM logic we note that the only time we would need to check for a
fault is in the case of a UFFD WP marker, otherwise we simply encounter a
fault error (VM_FAULT_HWPOISON for H/W poisoned marker, VM_FAULT_SIGSEGV
for a guard marker), so only check for the UFFD WP case.
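
As a rough sketch of the shape of the change (the exact signature of
is_uffd_pte_marker() is assumed from the description above; pte_none() and
pte_none_mostly() are the existing helpers):

	/* Before: a single, misleading predicate. */
	if (pte_none_mostly(pte)) {
		/* markers silently treated as if the PTE were none */
	}

	/* After: each call site states explicitly what it treats as none. */
	if (pte_none(pte) || is_uffd_pte_marker(pte)) {
		/* none, or a marker that really is UFFD-related */
	}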

While we're here we also refactor code to make it easier to understand.

[akpm@linux-foundation.org: fix comment typo, per Mike]
Link: https://lkml.kernel.org/r/cover.1762812360.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/c38625fd9a1c1f1cf64ae8a248858e45b3dcdf11.1762812360.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Wei Yang
8a0e4bdddd mm/huge_memory: merge uniform_split_supported() and non_uniform_split_supported()
uniform_split_supported() and non_uniform_split_supported() share
significantly similar logic.

The only functional difference is that uniform_split_supported() includes
an additional check on the requested @new_order.

The reason for this check comes from the following two aspects:

  * some file systems or the swap cache support only order-0 folios
  * the behavioral difference between uniform/non-uniform split

The behavioral differences between uniform and non-uniform split are:

  * uniform split splits folio directly to @new_order
  * non-uniform split creates after-split folios with orders from
    folio_order(folio) - 1 to new_order.

This means for non-uniform split or !new_order split we should check the
file system and swap cache respectively.

This commit unifies the logic and merges the two functions into a single
combined helper, removing redundant code and simplifying the split
support checking mechanism.

Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com
Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages")
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Wei Yang
c467061fbb mm/huge_memory: introduce enum split_type for clarity
Patch series "mm/huge_memory: Define split_type and consolidate split
support checks", v3.

This two-patch series focuses on improving code clarity and removing
redundancy in the huge memory handling logic related to folio splitting.

The series is based on an original proposal to merge two significantly
identical functions that check folio split support[1].  During this
process, we found an opportunity to improve readability by explicitly
defining the split types.  

Patch 1: define split_type and use it
Patch 2: merge uniform_split_supported() and non_uniform_split_supported()


This patch (of 2):

We currently handle two distinct types of large folio splitting:
  * uniform split
  * non-uniform split

Differentiating between these types using a simple boolean variable is not
obvious and can harm code readability.

This commit introduces enum split_type to explicitly define these two
types.  Replacing the existing boolean variable with this enumeration
significantly improves code clarity and expressiveness when dealing with
folio splitting logic.

No functional change is expected.
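
A minimal sketch of the idea (the enumerator spellings are assumed, not
necessarily those used in the patch):

	enum split_type {
		SPLIT_TYPE_UNIFORM,	/* split the folio directly to @new_order */
		SPLIT_TYPE_NON_UNIFORM,	/* after-split orders folio_order() - 1 ... @new_order */
	};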

[akpm@linux-foundation.org: tweak layout, per David]
Link: https://lkml.kernel.org/r/20251106034155.21398-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251106034155.21398-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: "David Hildenbrand (Red Hat)" <david@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Ma Ke
fe9d31fd1a mm/hmm/test: fix error handling in dmirror_device_init
dmirror_device_init() calls device_initialize() which sets the device
reference count to 1, but fails to call put_device() when dev_set_name()
or cdev_device_add() fails.  This results in
memory leaks of struct device objects.  Additionally,
dmirror_device_remove() lacks the final put_device() call to properly
release the device reference.
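
The canonical pattern being restored looks roughly like this (sketch only;
the field names are assumed, this is not the literal fix):

	device_initialize(&mdevice->device);		/* refcount is now 1 */

	ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id);
	if (ret)
		goto err_put;

	ret = cdev_device_add(&mdevice->cdevice, &mdevice->device);
	if (ret)
		goto err_put;

	return 0;

err_put:
	put_device(&mdevice->device);	/* drop the reference taken by device_initialize() */
	return ret;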

Found by code review.

Link: https://lkml.kernel.org/r/20251108115346.6368-1-make24@iscas.ac.cn
Fixes: 6a760f58c792 ("mm/hmm/test: use char dev with struct device to get device node")
Signed-off-by: Ma Ke <make24@iscas.ac.cn>
Cc: Haoxiang Li <make24@iscas.ac.cn>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Mika Penttilä <mpenttil@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:50 -08:00
Zi Yan
50d0598cf2 mm/huge_memory: fix kernel-doc comments for folio_split() and related
try_folio_split_to_order(), folio_split, __folio_split(), and
__split_unmapped_folio() do not have correct kernel-doc comment format. 
Fix them.

[ziy@nvidia.com: kernel-doc fixup]
  Link: https://lkml.kernel.org/r/BE7AC5F3-9E64-4923-861D-C2C4E0CB91EB@nvidia.com
[ziy@nvidia.com: add newline to fix an error and a warning from docutils]
  Link: https://lkml.kernel.org/r/040B38C0-23C6-4AEA-B069-69AE6DAA828B@nvidia.com
Link: https://lkml.kernel.org/r/20251031162001.670503-4-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Zi Yan
689b898677 mm/memory-failure: improve large block size folio handling
Large block size (LBS) folios cannot be split to order-0 folios, only
down to min_order_for_folio().  Currently the split fails outright, which
is not optimal.  Split the folio to min_order_for_folio() instead, so
that after the split only the folio containing the poisoned page becomes
unusable.

For soft offline, do not split the large folio if its
min_order_for_folio() is not 0, since the folio is still accessible from
userspace and a premature split might lead to a performance loss.

Link: https://lkml.kernel.org/r/20251031162001.670503-3-ziy@nvidia.com
Signed-off-by: Zi Yan <ziy@nvidia.com>
Suggested-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Zi Yan
a7ef12c64f mm/huge_memory: add split_huge_page_to_order()
Patch series "Optimize folio split in memory failure", v5.

This patchset optimizes folio split operations in the memory failure code
by always splitting a folio to min_order_for_split() to minimize unusable
pages, even if min_order_for_split() is non-zero and the memory failure
code would eventually take the failed path for a successfully split folio.

This means that instead of making the entire original folio unusable, the
memory failure code would only make the after-split folio that has order
min_order_for_split() and contains the HWPoison page unusable.

For soft offline case, since the original folio is still accessible, no
split is performed if the folio cannot be split to order-0 to prevent
potential performance loss.

In addition, add split_huge_page_to_order() to improve code readability
and fix kernel-doc comment format for folio_split() and other related
functions.

Background
==========

This patchset is a follow-up of "[PATCH v3] mm/huge_memory: do not change
split_huge_page*() target order silently."[1] and [PATCH v4]
mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0
order[2], since both are separated out as hotfixes.  It improves how
memory failure code handles large block size (LBS) folios with
min_order_for_split() > 0.  By splitting a large folio containing HW
poisoned pages to min_order_for_split(), the after-split folios without HW
poisoned pages could be freed for reuse.  To achieve this, folio split
code needs to set has_hwpoisoned on after-split folios containing HW
poisoned pages and it is done in the hotfix in [2].

This patchset includes:
1. A patch adds split_huge_page_to_order(),
2. Patch 2 and Patch 3 of "[PATCH v2 0/3] Do not change split folio target
   order"[3],


This patch (of 3):

When the caller does not supply a list to
split_huge_page_to_list_to_order(), use split_huge_page_to_order()
instead.
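
The new helper is essentially a thin wrapper, roughly (sketch, not
verbatim):

	static inline int split_huge_page_to_order(struct page *page,
						   unsigned int new_order)
	{
		return split_huge_page_to_list_to_order(page, NULL, new_order);
	}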

Link: https://lkml.kernel.org/r/20251031162001.670503-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20251031162001.670503-2-ziy@nvidia.com
Link: https://lore.kernel.org/all/20251017013630.139907-1-ziy@nvidia.com/ [1]
Link: https://lore.kernel.org/all/20251023030521.473097-1-ziy@nvidia.com/ [2]
Link: https://lore.kernel.org/all/20251016033452.125479-1-ziy@nvidia.com/ [3]
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Wei Yang
d87f4a8f19 mm/huge_memory: only get folio_order() once during __folio_split()
Before the folio is actually split, its order stays the same, so it is
only necessary to call folio_order() once.

Also rename order to old_order to represent the original folio order.

Link: https://lkml.kernel.org/r/20251010141142.1349-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Barry Song <baohua@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Wei Yang
ac7756771a mm/khugepaged: unify pmd folio installation with map_anon_folio_pmd()
Currently we install the pmd folio with map_anon_folio_pmd() in
__do_huge_pmd_anonymous_page() and do_huge_zero_wp_pmd().  In
collapse_huge_page(), however, it is done with identical code except for
the statistics adjustment.

Unify the process by using map_anon_folio_pmd() to install the pmd folio.
Split it into map_anon_folio_pmd_pf() and map_anon_folio_pmd_nopf(), to be
used in the page fault and non-page-fault paths respectively.

No functional change is intended.

[akpm@linux-foundation.org: remove unneeded map_anon_folio_pmd_nopf() stub, per Wei & David]
Link: https://lkml.kernel.org/r/20251008095453.18772-3-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Wei Yang
2a1351cd41 mm/huge_memory: add pmd folio to ds_queue in do_huge_zero_wp_pmd()
We add the pmd folio to the ds_queue on the first page fault in
__do_huge_pmd_anonymous_page(), so that we can split it in case of memory
pressure.  This should be the same for a pmd folio installed during a wp
page fault.

Commit 1ced09e0331f ("mm: allocate THP on hugezeropage wp-fault") missed
adding it to the ds_queue, which means the system may not reclaim enough
memory under memory pressure even if the pmd folio is underused.

Move deferred_split_folio() into map_anon_folio_pmd() to make the pmd
folio installation consistent.

Link: https://lkml.kernel.org/r/20251008095453.18772-2-richard.weiyang@gmail.com
Fixes: 1ced09e0331f ("mm: allocate THP on hugezeropage wp-fault")
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: Usama Arif <usamaarif642@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Balbir Singh
c322874710 gpu/drm/nouveau: enable THP support for GPU memory migration
Enable MIGRATE_VMA_SELECT_COMPOUND support in nouveau driver to take
advantage of THP zone device migration capabilities.

Update migration and eviction code paths to handle compound page sizes
appropriately, improving memory bandwidth utilization and reducing
migration overhead for large GPU memory allocations.

[balbirs@nvidia.com: fix sparse error]
  Link: https://lkml.kernel.org/r/20251115003333.3516870-1-balbirs@nvidia.com
Link: https://lkml.kernel.org/r/20251001065707.920170-17-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Balbir Singh
271a7b2e3c selftests/mm/hmm-tests: new throughput tests including THP
Add new benchmark style support to test transfer bandwidth for zone device
memory operations.

Link: https://lkml.kernel.org/r/20251001065707.920170-16-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Matthew Brost
24c2c5b8ff selftests/mm/hmm-tests: partial unmap, mremap and anon_write tests
Add partial unmap test case which munmaps memory while in the device.

Add tests exercising mremap on faulted-in memory (CPU and GPU) at various
offsets and verify correctness.

Update anon_write_child to read device memory after fork verifying this
flow works in the kernel.

Both THP and non-THP cases are updated.

Link: https://lkml.kernel.org/r/20251001065707.920170-15-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Balbir Singh
519071529d selftests/mm/hmm-tests: new tests for zone device THP migration
Add new tests for migrating anon THP pages, including anon_huge,
anon_huge_zero and error cases involving forced splitting of pages during
migration.

Link: https://lkml.kernel.org/r/20251001065707.920170-14-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:49 -08:00
Balbir Singh
aa3ade4295 lib/test_hmm: add large page allocation failure testing
Add HMM_DMIRROR_FLAG_FAIL_ALLOC flag to simulate large page allocation
failures, enabling testing of split migration code paths.

This test flag allows validation of the fallback behavior when the
destination device cannot allocate compound pages.  This is useful for
testing the split migration functionality.

Link: https://lkml.kernel.org/r/20251001065707.920170-13-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
4265d67e40 mm/migrate_device: add THP splitting during migration
Implement migrate_vma_split_pages() to handle THP splitting during the
migration process when destination cannot allocate compound pages.

This addresses the common scenario where migrate_vma_setup() succeeds with
MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate
large pages during the migration phase.

Key changes:
- migrate_vma_split_pages(): Split already-isolated pages during migration
- Enhanced folio_split() and __split_unmapped_folio() with isolated
  parameter to avoid redundant unmap/remap operations

This provides a fallback mechanism to ensure migration succeeds even when
large page allocation fails at the destination.
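
The fallback decision is roughly of this shape (sketch only; the argument
list of migrate_vma_split_pages() is assumed, not taken from the patch):

	/* Source was collected as a compound page, but the destination
	 * could not allocate one: split and continue with base pages. */
	if ((src[i] & MIGRATE_PFN_COMPOUND) &&
	    !(dst[i] & MIGRATE_PFN_COMPOUND))
		migrate_vma_split_pages(migrate, i, addr, folio);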

[matthew.brost@intel.com: add THP splitting during migration]
  Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com
Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
56ef398996 mm/memremap: add driver callback support for folio splitting
When a zone device page is split (via a huge pmd folio split), the driver
callback for folio_split is invoked to let the device driver know that the
folio has been split into a smaller order.

Provide a default implementation, for drivers that do not supply this
callback, which copies the pgmap and mapping fields to the split folios.

Update the HMM test driver to handle the split.

Link: https://lkml.kernel.org/r/20251001065707.920170-11-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
775465fd26 lib/test_hmm: add zone device private THP test infrastructure
Enhance the hmm test driver (lib/test_hmm) with support for THP pages.

A new pool of free_folios() has now been added to the dmirror device,
which can be allocated when a request for a THP zone device private page
is made.

Add compound page awareness to the allocation function during normal
migration and fault based migration.  These routines also copy
folio_nr_pages() when moving data between system memory and device memory.

args.src and args.dst used to hold migration entries are now dynamically
allocated (as they need to hold HPAGE_PMD_NR entries or more).

Split and migrate support will be added in future patches in this series.

Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
4964099163 mm/memory/fault: add THP fault handling for zone device private pages
Implement CPU fault handling for zone device THP entries through
do_huge_pmd_device_private(), enabling transparent migration of
device-private large pages back to system memory on CPU access.

When the CPU accesses a zone device THP entry, the fault handler calls the
device driver's migrate_to_ram() callback to migrate the entire large page
back to system memory.

Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
a30b48bf1b mm/migrate_device: implement THP migration of zone device pages
MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during
migrate_vma_setup(), and MIGRATE_PFN_COMPOUND will cause device pages to
be migrated as compound pages during device pfn migration.

migrate_device code paths go through the collect, setup and finalize
phases of migration.

The entries in src and dst arrays passed to these functions still remain
at a PAGE_SIZE granularity.  When a compound page is passed, the first
entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set
(MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries
(HPAGE_PMD_NR - 1) are filled with 0's.  This representation allows for
the compound page to be split into smaller page sizes.
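
For a single PMD-sized folio the encoding is therefore roughly (sketch
fragment; migrate_pfn() already sets MIGRATE_PFN_VALID):

	src[0] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE | MIGRATE_PFN_COMPOUND;
	for (i = 1; i < HPAGE_PMD_NR; i++)
		src[i] = 0;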

migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP page
aware.  Two new helper functions migrate_vma_collect_huge_pmd() and
migrate_vma_insert_huge_pmd_page() have been added.

migrate_vma_collect_huge_pmd() can collect THP pages, but if for some
reason this fails, there is fallback support to split the folio and
migrate it.

migrate_vma_insert_huge_pmd_page() closely follows the logic of
migrate_vma_insert_page()

Support for splitting pages as needed for migration will follow in later
patches in this series.

Link: https://lkml.kernel.org/r/20251001065707.920170-8-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
022a12deda mm/migrate_device: handle partially mapped folios during collection
Extend migrate_vma_collect_pmd() to handle partially mapped large folios
that require splitting before migration can proceed.

During PTE walk in the collection phase, if a large folio is only
partially mapped in the migration range, it must be split to ensure the
folio is correctly migrated.

[matthew.brost@intel.com: handle partially mapped folios during split]
  Link: https://lkml.kernel.org/r/20251120230825.181072-1-matthew.brost@intel.com
Link: https://lkml.kernel.org/r/20251001065707.920170-7-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
1462872900 mm/huge_memory: implement device-private THP splitting
Add support for splitting device-private THP folios, enabling fallback
to smaller page sizes when large page allocation or migration fails.

Key changes:
- split_huge_pmd(): Handle device-private PMD entries during splitting
- Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
- Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
  don't support shared zero page semantics

Link: https://lkml.kernel.org/r/20251001065707.920170-6-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
65edfda6f3 mm/rmap: extend rmap and migration support device-private entries
Add device-private THP support to reverse mapping infrastructure, enabling
proper handling during migration and walk operations.

The key changes are:
- add_migration_pmd()/remove_migration_pmd(): Handle device-private
  entries during folio migration and splitting
- page_vma_mapped_walk(): Recognize device-private THP entries during
  VMA traversal operations

This change supports folio splitting and migration operations on
device-private entries.

[balbirs@nvidia.com: fix override of entry in remove_migration_pmd]
  Link: https://lkml.kernel.org/r/20251114012153.2634497-2-balbirs@nvidia.com
[balbirs@nvidia.com: follow pattern used in remove_migration_pte()]
  Link: https://lkml.kernel.org/r/20251115002835.3515194-1-balbirs@nvidia.com
Link: https://lkml.kernel.org/r/20251001065707.920170-5-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
368076f52e mm/huge_memory: add device-private THP support to PMD operations
Extend core huge page management functions to handle device-private THP
entries.  This enables proper handling of large device-private folios in
fundamental MM operations.

The following functions have been updated:

- copy_huge_pmd(): Handle device-private entries during fork/clone
- zap_huge_pmd(): Properly free device-private THP during munmap
- change_huge_pmd(): Support protection changes on device-private THP
- __pte_offset_map(): Add device-private entry awareness

Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:48 -08:00
Balbir Singh
3a5a065545 mm/zone_device: rename page_free callback to folio_free
Change page_free to folio_free to make the folio support for
zone device-private more consistent. The PCI P2PDMA callback
has also been updated to folio_free() as a result.

For drivers that do not support folios (yet), the folio is
converted back into a page via &folio->page and the page is
used as-is in the current callback implementations.
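
In other words, a driver-side shim can look roughly like this (sketch; the
callback name change is described above, the helper names here are made
up):

	static void mydrv_folio_free(struct folio *folio)
	{
		/* Transitional: fall back to the existing page-based path. */
		struct page *page = &folio->page;

		mydrv_page_free(page);
	}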

Link: https://lkml.kernel.org/r/20251001065707.920170-3-balbirs@nvidia.com
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:47 -08:00
Balbir Singh
d245f9b4ab mm/zone_device: support large zone device private folios
Patch series "mm: support device-private THP", v7.

This patch series introduces support for Transparent Huge Page (THP)
migration in zone device-private memory.  The implementation enables
efficient migration of large folios between system memory and
device-private memory.

Background

Current zone device-private memory implementation only supports PAGE_SIZE
granularity, leading to:
- Increased TLB pressure
- Inefficient migration between CPU and device memory

This series extends the existing zone device-private infrastructure to
support THP, leading to:
- Reduced page table overhead
- Improved memory bandwidth utilization
- Seamless fallback to base pages when needed

In my local testing (using lib/test_hmm) and a throughput test, the series
shows a 350% improvement in data transfer throughput and an 80%
improvement in latency.

These patches build on the earlier posts by Ralph Campbell [1]

Two new flags are added in vma_migration to select and mark compound
pages.  migrate_vma_setup(), migrate_vma_pages() and
migrate_vma_finalize() support migration of these pages when
MIGRATE_VMA_SELECT_COMPOUND is passed in as arguments.

The series also adds zone device awareness to (m)THP pages along with
fault handling of large zone device private pages.  The page vma walk and
the rmap code are also zone device aware.  Support has also been added for
folios that might need to be split in the middle of migration (when the
src and dst do not agree on MIGRATE_PFN_COMPOUND), which occurs when the
src side of the migration can migrate large pages but the destination has
not been able to allocate them.  The code supports and uses folio_split()
when migrating THP pages; this path is taken when
MIGRATE_VMA_SELECT_COMPOUND is not passed as an argument to
migrate_vma_setup().

The test infrastructure lib/test_hmm.c has been enhanced to support THP
migration.  A new ioctl to emulate failure of large page allocations has
been added to test the folio split code path.  hmm-tests.c has new test
cases for huge page migration and to test the folio split path.  A new
throughput test has been added as well.

The nouveau dmem code has been enhanced to use the new THP migration
capability.  

mTHP support:

The patches hard-code HPAGE_PMD_NR in a few places, but the code has been
kept generic to support various order sizes.  With additional refactoring,
support for different order sizes should be possible.

The future plan is to post enhancements to support mTHP with a rough
design as follows:

1. Add the notion of allowable thp orders to the HMM based test driver
2. For non PMD based THP paths in migrate_device.c, check to see if
   a suitable order is found and supported by the driver
3. Iterate across orders to check the highest supported order for migration
4. Migrate and finalize

The mTHP patches can be built on top of this series, the key design
elements that need to be worked out are infrastructure and driver support
for multiple ordered pages and their migration.

HMM support for large folios was added in 10b9feee2d0d ("mm/hmm:
populate PFNs from PMD swap entry").


This patch (of 16):

Add routines to support allocation of large order zone device folios,
along with helper functions for zone device folios to check whether a
folio is device private and to set zone device data.

When large folios are used, the existing page_free() callback in pgmap is
called when the folio is freed; this is true for both PAGE_SIZE and
higher-order pages.

Zone device private large folios do not support deferred split and scan
like normal THP folios.

Link: https://lkml.kernel.org/r/20251001065707.920170-1-balbirs@nvidia.com
Link: https://lkml.kernel.org/r/20251001065707.920170-2-balbirs@nvidia.com
Link: https://lore.kernel.org/linux-mm/20201106005147.20113-1-rcampbell@nvidia.com/ [1]
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Felix Kuehling <Felix.Kuehling@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: "Christian König" <christian.koenig@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:47 -08:00
Claudio Imbrenda
1452468447 KVM: s390: fix missing present bit for gmap puds
For hugetlbs, gmap puds have the present bit set.  For normal puds (which
point to ptes), the bit is not set.  This is in contrast to the normal
userspace puds, which always have the bit set for present pmds.

This causes issues when ___pte_offset_map() is modified to only check for
the present bit.

The solution to the problem is simply to always set the present bit for
present gmap pmds.

Link: https://lkml.kernel.org/r/20251028130150.57379-2-imbrenda@linux.ibm.com
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Link: https://lore.kernel.org/lkml/20251017144924.10034-1-borntraeger@linux.ibm.com/
Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Balbir Singh <balbirs@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lyude <lyude@redhat.com>
Cc: Mathew Brost <matthew.brost@intel.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-24 15:08:47 -08:00
Andrew Morton
87fcafc4e2 Merge branch 'mm-hotfixes-stable' into mm-stable in order to merge
"mm/huge_memory: only get folio_order() once during __folio_split()" into
mm-stable.
2025-11-24 15:07:34 -08:00
Kees Cook
7454048db2 kbuild: Enable GCC diagnostic context for value-tracking warnings
Enable GCC 16's upcoming "-fdiagnostics-show-context=N" option[1] to
provide enhanced diagnostic information for value-tracking warnings,
which displays the control flow chain leading to the diagnostic. This
covers our existing use of -Wrestrict and -Wstringop-overread, and
gets us closer to enabling -Warray-bounds, -Wstringop-overflow, and
-Wstringop-truncation, since we can now track the rationale for each
warning, letting us more quickly identify actual issues versus warnings
that have in the past looked like false positives. Fixes based on this
work have already been landing, e.g.:

  4a6f18f28627 ("net/mlx4_core: Avoid impossible mlx4_db_alloc() order value")
  8a39f1c870e9 ("ovl: Check for NULL d_inode() in ovl_dentry_upper()")
  e5f7e4e0a445 ("drm/amdgpu/atom: Work around vbios NULL offset false positive")

The context depth ("=N") provides the immediate decision path that led
to the problematic code location, showing conditional checks and branch
decisions that caused the warning. This will help us understand why
GCC's value-tracking analysis triggered the warning and makes it easier
to determine whether warnings are legitimate issues or false positives.

For example, an array bounds warning will now show the conditional
statements (like "if (i >= 4)") that established the out-of-bounds access
range, directly connecting the control flow to the warning location.
This is particularly valuable when GCC's interprocedural analysis can
generate warnings that are difficult to understand without seeing the
inferred control flow.
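
For illustration, a minimal, hypothetical C example of the pattern the
commit describes (not code from the kernel tree):

    /*
     * With -Warray-bounds enabled, GCC's value tracking can see that the
     * "if (i >= 4)" branch makes buf[i] out of bounds, and the new
     * -fdiagnostics-show-context output prints that decision path.
     */
    static int lookup(unsigned int i)
    {
        static const int buf[4] = { 1, 2, 3, 4 };

        if (i >= 4)
            return buf[i];  /* warning points back at the check above */

        return buf[i];
    }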

While my testing has shown that "=1" reports enough for finding
the origin of most bounds issues, I have used "=2" here just to be
conservative. Build time measurements with this option off, =1, and =2
are all within noise of each other, so there seems to be no harm in
"turning it up". If we need to, we can make this value configurable in
the future.

Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=6faa3cfe60ff9769d1bebfffdd2c7325217d7389 [1]
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://patch.msgid.link/20251121184342.it.626-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
2025-11-24 12:44:05 -08:00
Kriish Sharma
645b9ad2dc string: Add missing kernel-doc return descriptions
While running kernel-doc validation on linux-next, warnings were emitted
for functions in include/linux/string.h due to missing return value
documentation:

    Warning: include/linux/string.h:375 No description found for return value of 'kbasename'
    Warning: include/linux/string.h:560 No description found for return value of 'strstarts'

This patch adds the missing return value descriptions for both functions
and clears the related kernel-doc warnings.
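
For reference, the kind of kernel-doc Return: section being added looks
roughly like this (sketch, not the exact wording of the patch):

    /**
     * strstarts - does @str start with @prefix?
     * @str: string to examine
     * @prefix: prefix to look for
     *
     * Return: true if @str starts with @prefix, false otherwise.
     */
    static inline bool strstarts(const char *str, const char *prefix)
    {
        return strncmp(str, prefix, strlen(prefix)) == 0;
    }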

Signed-off-by: Kriish Sharma <kriish.sharma2006@gmail.com>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Link: https://patch.msgid.link/20251118184828.2621595-1-kriish.sharma2006@gmail.com
Signed-off-by: Kees Cook <kees@kernel.org>
2025-11-24 12:44:05 -08:00
Kees Cook
fbcc2150aa media: iris: Cast iris_hfi_gen2_get_instance() allocation type
In preparation for making the kmalloc family of allocators type aware,
we need to make sure that the returned type from the allocation matches
the type of the variable being assigned. (Before, the allocator would
always return "void *", which can be implicitly cast to any pointer type.)

The assigned type is "struct iris_inst *", but the returned type is
"struct iris_inst_hfi_gen2 *". The allocation is intentionally larger:
the first member of struct iris_inst_hfi_gen2 is a struct iris_inst, so
the layout is by design. Cast the allocation type to match the assignment.
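
A minimal sketch of the pattern (not the actual driver code; the function
name, kzalloc() call, and error handling here are assumptions):

    static struct iris_inst *get_instance_sketch(void)
    {
        struct iris_inst *inst;

        /* Allocate the larger container type; the cast documents that
         * callers work with the embedded first member, struct iris_inst. */
        inst = (struct iris_inst *)kzalloc(sizeof(struct iris_inst_hfi_gen2),
                                           GFP_KERNEL);
        if (!inst)
            return NULL;

        return inst;
    }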

Link: https://patch.msgid.link/20250426061526.work.106-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
2025-11-24 12:44:05 -08:00
Kees Cook
961c989c5f drm/plane: Remove const qualifier from plane->modifiers allocation type
In preparation for making the kmalloc family of allocators type aware,
we need to make sure that the returned type from the allocation matches
the type of the variable being assigned. (Before, the allocator would
always return "void *", which can be implicitly cast to any pointer type.)

The assigned type is "uint64_t *", but the returned type, while matching,
will be const qualified. As there is no general way to remove const
qualifiers, adjust the allocation type to match the assignment.

Link: https://patch.msgid.link/20250426061325.work.665-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
2025-11-24 12:43:52 -08:00
Kees Cook
5146f56dee comedi: Adjust range_table_list allocation type
In preparation for making the kmalloc family of allocators type aware,
we need to make sure that the returned type from the allocation matches
the type of the variable being assigned. (Before, the allocator would
always return "void *", which can be implicitly cast to any pointer type.)

The returned type is "struct comedi_lrange **", but the assigned type,
while technically matching, is const qualified. Since there is no general
way to remove const qualifiers, switch the returned type to match the
assigned type. No change in allocation size results.

Link: https://patch.msgid.link/20250426061015.work.971-kees@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
2025-11-24 12:43:28 -08:00
Crystal Wood
3138df6f0c rtla/timerlat: Exit top main loop on any non-zero wait_retval
Comparing wait_retval to exactly 1 will fail if more than one ring buffer
event was seen since the last call to timerlat_bpf_wait(), which can
happen in some race scenarios.  Exit the loop on any non-zero return
instead.

Signed-off-by: Crystal Wood <crwood@redhat.com>
Link: https://lore.kernel.org/r/20251112152529.956778-5-crwood@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Crystal Wood
61f1fd5d69 rtla/tests: Don't rely on matching ^1ALL
The timerlat "top stop at failed action" test was relying on "ALL" being
printed immediately after the "1" from the threshold action.  Besides being
fragile, this depends on stdbuf behavior, which is easy to miss when
recreating the test outside of the framework for debugging purposes.

Instead, use the expected/unexpected text mechanism from the
corresponding osnoise test.

Signed-off-by: Crystal Wood <crwood@redhat.com>
Link: https://lore.kernel.org/r/20251112152529.956778-2-crwood@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Ivan Pravdin
ddb6e42494 rtla: Fix -a overriding -t argument
When running rtla as

    `rtla <timerlat|osnoise> <top|hist> -t custom_file.txt -a 100`

the -a option overrides the trace output filename specified by the -t
option. Running the command above will create a <timerlat|osnoise>_trace.txt
file instead of custom_file.txt. Fix this by making sure that the -a
option does not override the trace output filename, even if it is passed
after the trace output filename has been specified.

Fixes: 173a3b014827 ("rtla/timerlat: Add the automatic trace option")
Signed-off-by: Ivan Pravdin <ipravdin.official@gmail.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/b6ae60424050b2c1c8709e18759adead6012b971.1762186418.git.ipravdin.official@gmail.com
[ use capital letter in subject, as required by tracing subsystem ]
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Ivan Pravdin
7b71f3a698 rtla: Fix -C/--cgroup interface
Currently, the user can only specify a cgroup for the tracer's threads in
the following ways:

    `-C[cgroup]`
    `-C[=cgroup]`
    `--cgroup[=cgroup]`

If the user tries to specify the cgroup as `-C [cgroup]` or
`--cgroup [cgroup]`, the parser silently fails and rtla's own cgroup is
used for the tracer threads.

To make the interface more user-friendly, allow the user to specify the
cgroup in the aforementioned ways, i.e. `-C [cgroup]` and
`--cgroup [cgroup]`, as well.
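
The usual way to accept both attached and space-separated optional
arguments with getopt_long() is to peek at the next argv entry when optarg
is unset; a standalone sketch of the idea (not the rtla parser itself):

    #include <getopt.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        static struct option long_opts[] = {
            { "cgroup", optional_argument, NULL, 'C' },
            { NULL, 0, NULL, 0 },
        };
        const char *cgroup = NULL;
        int c;

        while ((c = getopt_long(argc, argv, "C::", long_opts, NULL)) != -1) {
            switch (c) {
            case 'C':
                /* "-C foo" / "--cgroup foo": optarg is NULL, take argv[optind] */
                if (!optarg && optind < argc && argv[optind][0] != '-')
                    optarg = argv[optind++];
                cgroup = optarg;    /* may stay NULL: use rtla's own cgroup */
                break;
            }
        }
        printf("cgroup: %s\n", cgroup ? cgroup : "(current)");
        return 0;
    }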

Refactor identical logic between -t/--trace and -C/--cgroup into a
common function.

Change documentation to reflect this user interface change.

Fixes: a957cbc02531 ("rtla: Add -C cgroup support")
Signed-off-by: Ivan Pravdin <ipravdin.official@gmail.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/16132f1565cf5142b5fbd179975be370b529ced7.1762186418.git.ipravdin.official@gmail.com
[ use capital letter in subject, as required by tracing subsystem ]
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Costa Shulyupin
49c1579419 tools/rtla: Replace osnoise_hist_usage("...") with fatal("...")
A long time ago, when the usage help was short, it was a favor
to the user to show it on error. Now that the usage help has
become very long, it is too noisy to dump the complete help text
for each typo after the error message itself.

Replace osnoise_hist_usage("...") with fatal("...") on errors.

Remove the already unused 'usage' argument from osnoise_hist_usage().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251011082738.173670-6-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Costa Shulyupin
92b5b55e5e tools/rtla: Replace osnoise_top_usage("...") with fatal("...")
A long time ago, when the usage help was short, it was a favor
to the user to show it on error. Now that the usage help has
become very long, it is too noisy to dump the complete help text
for each typo after the error message itself.

Replace osnoise_top_usage("...") with fatal("...") on errors.

Remove the already unused 'usage' argument from osnoise_top_usage().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251011082738.173670-5-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Costa Shulyupin
8f4264e046 tools/rtla: Replace timerlat_hist_usage("...") with fatal("...")
A long time ago, when the usage help was short, it was a favor
to the user to show it on error. Now that the usage help has
become very long, it is too noisy to dump the complete help text
for each typo after the error message itself.

Replace timerlat_hist_usage("...\n") with fatal("...") on errors.

Remove the already unused 'usage' argument from timerlat_hist_usage().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251011082738.173670-4-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Costa Shulyupin
4e5e7210f9 tools/rtla: Replace timerlat_top_usage("...") with fatal("...")
A long time ago, when the usage help was short, it was a favor
to the user to show it on error. Now that the usage help has
become very long, it is too noisy to dump the complete help text
for each typo after the error message itself.

Replace timerlat_top_usage("...\n") with fatal("...") on errors.

Remove the already unused 'usage' argument from timerlat_top_usage().

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251011082738.173670-3-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Costa Shulyupin
8cbb25db81 tools/rtla: Add fatal() and replace error handling pattern
The code contains some technical debt in error handling,
which complicates the consolidation of duplicated code.

Introduce a fatal() function to replace the common pattern of
err_msg() followed by exit(EXIT_FAILURE), reducing the length of an
already long function.

Further patches using fatal() follow.
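
A minimal sketch of such a helper (the rtla version builds on its existing
err_msg() machinery; this standalone variant just prints to stderr):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Print an error message and terminate, replacing the repeated
     * "err_msg(...); exit(EXIT_FAILURE);" pattern at call sites. */
    static void __attribute__((noreturn)) fatal(const char *fmt, ...)
    {
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);

        exit(EXIT_FAILURE);
    }

A call site then shrinks to a single statement, e.g.
fatal("invalid argument\n"), instead of two.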

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251011082738.173670-2-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:27 +01:00
Tomas Glozar
34c170ae5c rtla/tests: Fix osnoise test calling timerlat
The osnoise test "top stop at failed action" mistakenly calls timerlat
instead of osnoise.

Fix it so that it calls the correct RTLA subcommand.

Fixes: 05b7e10687c6 ("tools/rtla: Add remaining support for osnoise actions")
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20251007095341.186923-3-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:30:00 +01:00
Tomas Glozar
d649e9f04c rtla/tests: Extend action tests to 5s
In non-BPF mode, it takes up to 1 second for RTLA to notice that tracing
has been stopped. That means that action tests cannot have a 1 second
duration, as the SIGALRM will be racing with the threshold overflow.

Previously, non-BPF mode actions were buggy and always executed
the action, even when stopping on duration or SIGINT, preventing
this issue from manifesting. Now that this has been fixed, the tests
have become flaky, and this has to be adjusted.

Fixes: 4e26f84abfbb ("rtla/tests: Add tests for actions")
Fixes: 05b7e10687c6 ("tools/rtla: Add remaining support for osnoise actions")
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20251007095341.186923-2-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-21 10:29:01 +01:00
Lorenzo Stoakes
c7ba92bcfe testing/selftests/mm: add soft-dirty merge self-test
Assert that we correctly merge VMAs containing VM_SOFTDIRTY flags now that
we correctly handle these as sticky.

In order to do so, we have to account for the fact that the pagemap
interface checks soft-dirty PTEs, and additionally that newly merged VMAs
are marked VM_SOFTDIRTY.

We do this by using unfaulted anon VMAs: establishing one and clearing
references on it, then establishing another and merging the two, before
checking that soft-dirty is propagated as expected.

We check that this functions correctly with mremap() and mprotect() as
sample cases, because adjacent newly mapped VMAs that get merged are
automatically marked soft-dirty by existing logic.

We are therefore exercising other means of merging VMAs.

Link: https://lkml.kernel.org/r/d5a0f735783fb4f30a604f570ede02ccc5e29be9.1763399675.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Andrey Vagin <avagin@gmail.com>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
Lorenzo Stoakes
6707915e03 mm: propagate VM_SOFTDIRTY on merge
Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2.

Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by
establishing a new VMA, or via merge) as implemented in __mmap_complete()
and do_brk_flags().

However, when performing a merge of existing mappings such as when
performing mprotect(), we may lose the VM_SOFTDIRTY flag.

Now that we have the concept of making VMA flags 'sticky' (that is, they
both don't prevent merge and, importantly, are propagated to merged VMAs),
this seems a sensible alternative to the existing special-casing of
VM_SOFTDIRTY.

We additionally add a self-test that demonstrates that this logic behaves
as expected.


This patch (of 2):

Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by
establishing a new VMA, or via merge) as implemented in __mmap_complete()
and do_brk_flags().

However, when performing a merge of existing mappings such as when
performing mprotect(), we may lose the VM_SOFTDIRTY flag.

This is because currently we simply ignore VM_SOFTDIRTY for the purposes
of merge, so one VMA may possess the flag and another not, and whichever
happens to be the target VMA will be the one upon which the merge is
performed which may or may not have VM_SOFTDIRTY set.

Now that we have the concept of 'sticky' VMA flags, let's make
VM_SOFTDIRTY one, which solves this issue.

Additionally update VMA userland tests to propagate changes.

[akpm@linux-foundation.org: update comments, per Lorenzo]
  Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local
Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Acked-by: Andrey Vagin <avagin@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
6e57c1ce81 Docs/mm/damon/maintainer-profile: fix grammatical errors
Fix a few grammatical errors on DAMON maintainer-profile.

Link: https://lkml.kernel.org/r/20251112154114.66053-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
7ad58e009d Docs/mm/damon/maintainer-profile: fix a typo on mm-untable link
Commit 0b473f9e6eac ("Docs/mm/damon/maintainer-profile: update for mm-new
tree") mistakenly omitted the space between a link and the next word.
Fix it.

Link: https://lkml.kernel.org/r/20251112154114.66053-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
675774adbe selftests/damon/sysfs.py: merge DAMON status dumping into commitment assertion
For each test case, sysfs.py makes changes to DAMON, dumps DAMON internal
status and asserts the expectation is met.  The dumping part should be the
same for all cases, so it is duplicated for each test case.  Which means
it is easy to make mistakes.  Actually a few of those duplicates are not
turning DAMON off in case of the dumping failure.  It makes following
selftests that need to turn DAMON on fails with -EBUSY.  Merge the status
dumping into commitment assertion with proper dumping failure handling, to
deduplicate and avoid the unnecessary following tests failures.

Link: https://lkml.kernel.org/r/20251112154114.66053-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
10e8c7ba64 mm/damon/tests/core-kunit: remove DAMON_MIN_REGION redefinition
A few DAMON core functions, including damon_set_regions(), were hard-coded
to use DAMON_MIN_REGION as their region management granularity.  To keep
unit test expectations simple and human-readable, the DAMON core layer
kunit test redefines DAMON_MIN_REGION to '1'.

A previous patch series [1] removed the hard-coded part but kept the
redefinition, and updated related function calls to explicitly use
DAMON_MIN_REGION.  Remove the now-unnecessary redefinition and update the
relevant function calls to pass the literal '1' instead of
DAMON_MIN_REGION.

Link: https://lkml.kernel.org/r/20251112154114.66053-7-sj@kernel.org
Link: https://lore.kernel.org/20250828171242.59810-1-sj@kernel.org [1]
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
09efc56a3b mm/damon/vaddr: consistently use only pmd_entry for damos_migrate
For page table walks, it is usual [1] to have only one pmd entry function.
The vaddr.c code for DAMOS_MIGRATE_{HOT,COLD} does not follow the pattern;
instead, it uses both pmd and pte entry functions for no particular
reason.  Refactor it to use only the pmd entry function, to make the code
under mm/ more consistent.
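
Schematically, the walk ends up with only a pmd_entry callback that
handles both the THP and per-PTE cases itself (the callback name below is
illustrative, not necessarily the one used in vaddr.c):

    /* A single pmd-level callback; no separate .pte_entry. */
    static const struct mm_walk_ops migrate_walk_ops = {
        .pmd_entry = damos_migrate_pmd_entry,
        .walk_lock = PGWALK_RDLOCK,
    };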

Link: https://lkml.kernel.org/r/20251112154114.66053-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Suggested-by: David Hildenbrand <david@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
f0eb046cd3 mm/damon/vaddr: use vm_normal_folio{,_pmd}() instead of damon_get_folio()
A few page table walk entry callback functions in vaddr.c use
damon_get_folio() with p{te,md}_pfn() to get the folio, and then
folio_put() it.  Simplify, and drop the unnecessary folio get/put, by
using vm_normal_folio() and its friends instead.

Note that this cleanup was suggested by David Hildenbrand during a review
of another patch series [1], and that patch was updated following the
suggestion.  This patch further applies the cleanup to DAMON code that was
merged before that patch.
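
Schematically, the change at each callback is of this shape (a sketch with
assumed locals, not the exact vaddr.c code):

    static void handle_pte_sketch(pte_t ptent, unsigned long addr,
                                  struct vm_area_struct *vma)
    {
        struct folio *folio;

        /*
         * Before: folio = damon_get_folio(pte_pfn(ptent)); ... folio_put(folio);
         * After: no reference juggling; NULL means no normal folio is mapped.
         */
        folio = vm_normal_folio(vma, addr, ptent);
        if (!folio)
            return;

        /* ... operate on the folio under the PTE lock ... */
    }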

Link: https://lkml.kernel.org/r/20251112154114.66053-5-sj@kernel.org
Link: https://lore.kernel.org/0cb3d5a5-683b-4dba-90a8-b45ab83eec53@redhat.com [1]
Signed-off-by: SeongJae Park <sj@kernel.org>
Suggested-by: David Hildenbrand <david@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
96549d56b8 mm/damon/vaddr: cleanup using pmd_trans_huge_lock()
Three pmd walk functions in vaddr.c use pmd_trans_huge() and
pmd_lock() to handle THPs.  Simplify the code by replacing the two
function calls with a single pmd_trans_huge_lock() call.

Note that this cleanup not only reduces the lines of code, but also
simplifies the code execution flow for the migration entries case, as
kindly explained [1] by Hugh, who suggested this cleanup.
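
The resulting shape of such a pmd walk callback is roughly (sketch only):

    static int pmd_entry_sketch(pmd_t *pmd, unsigned long addr,
                                unsigned long next, struct mm_walk *walk)
    {
        spinlock_t *ptl;

        /* Returns the held ptl only for a stable transparent huge pmd. */
        ptl = pmd_trans_huge_lock(pmd, walk->vma);
        if (ptl) {
            /* ... handle the huge mapping ... */
            spin_unlock(ptl);
            return 0;
        }

        /* ... fall back to per-PTE handling ... */
        return 0;
    }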

[sj@kernel.org: provide lvalue to pmd_present()]
  Link: https://lkml.kernel.org/r/20251117154415.11041-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251112154114.66053-4-sj@kernel.org
Link: https://lore.kernel.org/296c2b3f-6748-158f-b85d-2952165c0588@google.com [1]
Signed-off-by: SeongJae Park <sj@kernel.org>
Suggested-by: Hugh Dickins <hughd@google.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: kernel test robot <lkp@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
53298afe45 mm/damon: rename damos->filters to damos->core_filters
DAMOS filters that are handled by the ops layer are linked to
damos->ops_filters.  Owing to the ops_ prefix on the name, it is easy to
understand that the list is for ops-layer-handled filters.  The other
filters, which are handled by the core layer, are linked to
damos->filters.  Because of that name, it is easy to mistakenly assume the
list holds not only the core-layer-handled filters but all filters.  Avoid
such confusion by renaming the field to core_filters.

Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:01 -08:00
SeongJae Park
8b02baf373 mm/damon: rename damos core filter helpers to have word core
Patch series "mm/damon: misc cleanups".

Yet another batch of misc cleanups and refactoring for DAMON code, tests,
and documents.

The first two patches (1 and 2) rename DAMOS core-filters-related code for
readability.

The three following patches (3-5) refactor page table walk callback
functions in DAMON, as suggested by Hugh and David, and as I promised.

The next two patches (6 and 7) refactor the DAMON core layer kunit test
and the sysfs interface selftest to be simple and deduplicated.

The final two patches (8 and 9) fix up Sphinx and grammatical errors in
the documents.


This patch (of 9):

DAMOS filters handled by the core layer are called core filters, while
those handled by the ops layer are called ops filters.  They share the
same type but are managed in different places, since core filters are
evaluated before the ops filters.  They also have different helper
functions that depend on where they are managed.

The helper functions for ops filters have an '_ops_' keyword in their
names, so it is easy to know they are for ops filters.  Meanwhile, the
helper functions for core filters do not have a 'core' keyword in their
names.  This makes it easy to mistakenly use them for ops filters.
Indeed, there was such a bug.

To avoid future mistakes from similar confusion, rename the DAMOS core
filters helper functions to have the keyword 'core' in their names.

Link: https://lkml.kernel.org/r/20251112154114.66053-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251112154114.66053-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bill Wendling <morbo@google.com>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
Mehdi Ben Hadj Khelifa
1ec5d5810b selftests/mm/uffd: remove static address usage in shmem_allocate_area()
The current shmem_allocate_area() implementation uses a hardcoded virtual
base address (BASE_PMD_ADDR) as a hint for mmap() when creating
shmem-backed test areas.  This approach is fragile and may fail on systems
with ASLR or different virtual memory layouts, where the chosen address is
unavailable.

Replace the static base address with a dynamically reserved address range
obtained via mmap(NULL, ..., PROT_NONE).  The memfd-backed areas and their
alias are then mapped into that reserved region using MAP_FIXED,
preserving the original layout and aliasing semantics while avoiding
collisions with unrelated mappings.

This change improves robustness and portability of the test suite without
altering its behavior or coverage.
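
The reservation pattern being switched to looks roughly like this
(standalone sketch, not the test's actual code; sizes and names are made
up):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        size_t size = 16UL << 20;
        char *resv, *area;
        int memfd;

        /* Let the kernel pick a free range instead of hardcoding a base. */
        resv = mmap(NULL, size, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
        if (resv == MAP_FAILED)
            return 1;

        memfd = memfd_create("uffd-area", 0);
        if (memfd < 0 || ftruncate(memfd, size))
            return 1;

        /* MAP_FIXED replaces the PROT_NONE reservation in place. */
        area = mmap(resv, size, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_FIXED, memfd, 0);
        if (area == MAP_FAILED)
            return 1;

        memset(area, 0, size);
        munmap(area, size);
        close(memfd);
        return 0;
    }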

[mehdi.benhadjkhelifa@gmail.com: make cleanup code more clear, per Mike]
  Link: https://lkml.kernel.org/r/20251113142050.108638-1-mehdi.benhadjkhelifa@gmail.com
Link: https://lkml.kernel.org/r/20251111205739.420009-1-mehdi.benhadjkhelifa@gmail.com
Signed-off-by: Mehdi Ben Hadj Khelifa <mehdi.benhadjkhelifa@gmail.com>
Suggested-by: Mike Rapoport <rppt@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Hunter <david.hunter.linux@gmail.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
603f67eb91 mm/damon/tests/core-kunit: add damon_commit_target_regions() test
Add a new test for damon_commit_target_regions().

Link: https://lkml.kernel.org/r/20251111184415.141757-12-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
299a88f6ec mm/damon/tests/core-kunit: add damos_commit() test
Add a new unit test for damos_commit().

Link: https://lkml.kernel.org/r/20251111184415.141757-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
eec573b8dd mm/damon/tests/core-kunit: add damos_commit_dests() test
Add a new unit test for damos_commit_dests().

Link: https://lkml.kernel.org/r/20251111184415.141757-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
c1cefda776 mm/damon/core: pass migrate_dests to damos_commit_dests()
damos_commit_dests() receives 'struct damos' pointers, while it uses only
their ->migrate_dests fields.  This makes the code unnecessarily difficult
to read.  It also makes writing unit tests complicated.  Refactor the
function to receive pointers to the ->migrate_dests fields instead.
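
The refactor is essentially a signature change of the following shape
(sketch; the field type name struct damos_migrate_dests is assumed here
from the ->migrate_dests field being discussed):

    /* Before: int damos_commit_dests(struct damos *dst, struct damos *src); */

    /* After: callers pass the dests directly, which is also much easier
     * to exercise from a kunit test. */
    static int damos_commit_dests(struct damos_migrate_dests *dst,
                                  struct damos_migrate_dests *src);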

Link: https://lkml.kernel.org/r/20251111184415.141757-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
11bb980d41 mm/damon/tests/core-kunit: add damos_commit_quota() test
Add a new unit test for damos_commit_quota().

Link: https://lkml.kernel.org/r/20251111184415.141757-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
d9adfb8a28 mm/damon/tests/core-kunit: add damos_commit_quota_goals() test
Add a new unit test for damos_commit_quota_goals().

Link: https://lkml.kernel.org/r/20251111184415.141757-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
99f89debaf mm/damon/tests/core-kunit: add damos_commit_quota_goal() test
Add a new unit test for damos_commit_quota_goal().

Link: https://lkml.kernel.org/r/20251111184415.141757-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
3caf767e21 mm/damon/tests/core-kunit: add test cases to damos_test_commit_filter()
damos_test_commit_filter() is covering only a single test case.  Extend it
to cover multiple combinations of inputs.

Link: https://lkml.kernel.org/r/20251111184415.141757-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:44:00 -08:00
SeongJae Park
1b43b7950d mm/damon/tests/core-kunit: extend damos_test_commit_filter_for() for union fields
damos_commit_filter() also updates union fields of 'struct damos_filter'. 
Extend damos_test_commit_filter_for() to cover the expectations of the
union fields.

Link: https://lkml.kernel.org/r/20251111184415.141757-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
SeongJae Park
1968236f75 mm/damon/tests/core-kunit: split out damos_test_commit_filter() core logic
damos_test_commit_filter() is written for only a single test case.  Split
out the core logic of damos_test_commit_filter() as a general one so that
it can be reused for multiple test cases.

Link: https://lkml.kernel.org/r/20251111184415.141757-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
SeongJae Park
37104286f9 mm/damon/tests/core-kunit: remove dynamic allocs on damos_test_commit_filter()
Patch series "mm/damon/tests: add more tests for online parameters commit".

A DAMON feature called parameters "commit" allows DAMON API callers and
ABI users to update nearly every DAMON parameter while DAMON is running. 
This is being used for flexible DAMON use cases such as taking a snapshot
of the monitoring results with minimum overhead, or adjusting access-aware
system operations (DAMOS) for user-space driven auto-tuning or
investigations.

Compared to the usefulness of the feature and the size of its
implementation, the test coverage is pretty small.  Only the filter commit
part has a single test case, namely damos_test_commit_filter().  We
actually found and fixed a few bugs in the feature in the past.  The
single existing test was itself added to avoid reintroduction of a
previously found bug.

Add more unit tests for the feature.

First four patches (1-4) refactor and extend the existing test for DAMOS
filter commit for multiple test cases.

Next three patches (5-7) add tests for DAMOS quota commit.

Next two patches (8 and 9) refactor damos_commit_dests() for ease of code
reading and test writing, and implement a new unit test of the function
that is being refactored in a test-friendly way.

Final two patches (10 and 11) further add new unit tests for
damos_commit() and damon_commit_target_regions().


This patch (of 11):

damos_test_commit_filter() dynamically allocates test-purpose DAMOS
filters.  The allocation failure checks make the code longer, more
complicated, and difficult to extend for more test cases.  Refactor the
code to remove the dynamic allocation.

Link: https://lkml.kernel.org/r/20251111184415.141757-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251111184415.141757-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Matthew Wilcox (Oracle)
7370f8e1b3 mm: use vma_start_write_killable() in dup_mmap()
Allow waiting for the VMA write lock to be interrupted by fatal signals.
The explicit check for fatal_signal_pending() can be removed, as it is
performed within vma_start_write_killable().  This improves the latency of
killing the task, as we do not wait for the reader to finish before
checking for signals.
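
In dup_mmap() the conversion is roughly of this shape (sketch only; it
assumes vma_start_write_killable() returns 0 on success and -EINTR when a
fatal signal is pending, like the other *_killable lock helpers):

    /* Inside the VMA copy loop of dup_mmap() (labels/locals assumed). */
    for_each_vma(vmi, mpnt) {
        if (vma_start_write_killable(mpnt)) {
            retval = -EINTR;
            goto loop_out;
        }
        /* ... copy mpnt; no separate fatal_signal_pending() check ... */
    }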

Link: https://lkml.kernel.org/r/20251110203204.1454057-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Chris Li <chriscli@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Matthew Wilcox (Oracle)
2197bb60f8 mm: add vma_start_write_killable()
Patch series "vma_start_write_killable", v2.

When we added the VMA lock, we made a major oversight in not adding a
killable variant.  That can run us into trouble where a thread takes the
VMA lock for read (eg handling a page fault) and then goes out to lunch
for an hour (eg doing reclaim).  Another thread tries to modify the VMA,
taking the mmap_lock for write, then attempts to lock the VMA for write. 
That blocks on the first thread, and ensures that every other page fault
now tries to take the mmap_lock for read.  Because everything's in an
uninterruptible sleep, we can't kill the task, which makes me angry.

This patchset just adds vma_start_write_killable() and converts one caller
to use it.  Most users are somewhat tricky to convert, so expect follow-up
individual patches per call-site which need careful analysis to make sure
we've done proper cleanup.


This patch (of 2):

The vma can be held read-locked for a substantial period of time, eg if
memory allocation needs to go into reclaim.  It's useful to be able to
send fatal signals to threads which are waiting for the write lock.

Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Chris Li <chriscli@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Qi Zheng
3a47e8771c mm: vmstat: correct the comment above preempt_disable_nested()
The comment explaining why these paths use preempt_disable_nested() is in
__mod_zone_page_state(), not in __mod_node_page_state(), so the reference
should point readers to __mod_zone_page_state().  Just correct it.

Link: https://lkml.kernel.org/r/20251110084437.46701-1-qi.zheng@linux.dev
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Harry Yoo <harry.yoo@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Lorenzo Stoakes
2ab7f1bbaf mm/madvise: allow guard page install/remove under VMA lock
We only need to keep the page table stable, so we can perform this
operation under the VMA lock.  PTE installation is stabilised via the PTE
lock.

One caveat is that, if we prepare vma->anon_vma, we must hold the mmap
read lock.  We can account for this by adapting the VMA locking logic to
explicitly check for this case and prevent a VMA lock from being acquired
should it be the case.

This check is safe: while we might be raced on anon_vma installation, that
would simply make the check conservative.  There's no way for us to see an
anon_vma and then for it to be cleared, as doing so requires the mmap/VMA
write lock.

We abstract the VMA lock validity logic to is_vma_lock_sufficient() for
this purpose, and add prepares_anon_vma() to abstract the anon_vma logic.

In order to do this we need to have a way of installing page tables
explicitly for an identified VMA, so we export walk_page_range_vma() in an
unsafe variant - walk_page_range_vma_unsafe() and use this should the VMA
read lock be taken.

We additionally update the comments in madvise_guard_install() to more
accurately reflect the cases in which the logic may be reattempted,
specifically THP huge pages being present.

Link: https://lkml.kernel.org/r/cca1edbd99cd1386ad20556d08ebdb356c45ef91.1762795245.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Lorenzo Stoakes
f4af67ff4f mm: rename walk_page_range_mm()
Patch series "mm: perform guard region install/remove under VMA lock", v2.

There is no reason why we can't perform guard region operations under the
VMA lock, as long as we take proper precautions to ensure that we do so in
a safe manner.

This is fine, as VMA lock acquisition is always best-effort, so if we are
unable to do so, we can simply fall back to using the mmap read lock.

Doing so will reduce mmap lock contention for callers performing guard
region operations and help establish a precedent of trying to use the VMA
lock where possible.

As part of this change we perform a trivial rename of page walk functions
which bypass safety checks (i.e.  whether or not mm_walk_ops->install_pte
is specified) in order that we can keep naming consistent with the mm
walk.

This is because we need to expose a VMA-specific walk that still allows us
to install PTE entries.


This patch (of 2):

Make it clear we're referencing an unsafe variant of this function
explicitly.

This is laying the foundation for exposing more such functions and
maintaining a consistent naming scheme.

As a part of this change, rename check_ops_valid() to check_ops_safe() for
consistency.

Link: https://lkml.kernel.org/r/cover.1762795245.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/c684d91464a438d6e31172c9450416a373f10649.1762795245.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Shakeel Butt
bc8e51c05a mm: memcg: dump memcg protection info on oom or alloc failures
Currently the kernel dumps the memory state on OOM and allocation
failures.  One of the questions usually raised about those dumps is why
the kernel has not reclaimed the reclaimable memory instead of triggering
the OOM killer.  One potential reason is the use of the memory protection
provided by memcg.  So, let's also dump the memory protected by memcgs in
such reports to ease debugging.

Link: https://lkml.kernel.org/r/20251107234041.3632644-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Huacai Chen
05be028795 mm: remove unnecessary __GFP_HIGHMEM in __p*d_alloc_one_*()
__{pgd,p4d,pud,pmd,pte}_alloc_one_*() always allocate pages with GFP flag
GFP_PGTABLE_KERNEL/GFP_PGTABLE_USER.  These two macros are defined as
follows:

 #define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO)
 #define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

There is no __GFP_HIGHMEM in them, so we need not clear __GFP_HIGHMEM
explicitly.
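
So the simplification is of this form (illustrative only; the exact
allocator calls in the various __p*d_alloc_one_*() helpers differ):

    /* Before: page = alloc_pages(GFP_PGTABLE_USER & ~__GFP_HIGHMEM, 0);
     * After, since GFP_PGTABLE_USER never includes __GFP_HIGHMEM:      */
    page = alloc_pages(GFP_PGTABLE_USER, 0);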

Link: https://lkml.kernel.org/r/20251109021817.346181-1-chenhuacai@loongson.cn
Link: https://lkml.kernel.org/r/20251107095536.3101371-1-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Kevin Brodsky <kevin.brodsky@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:59 -08:00
Lorenzo Stoakes
c0ae966fac tools/testing/selftests/mm: add smaps visibility guard region test
Assert that we observe guard regions appearing in /proc/$pid/smaps as
expected, including when a split/merge is performed (with the expected
sticky behaviour).

Also add handling for file systems which don't sanely handle mmap() VMA
merging, so that we don't incorrectly encounter a test failure in this
situation.

Link: https://lkml.kernel.org/r/059e62b8c67e55e6d849878206a95ea1d7c1e885.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
89330ec897 tools/testing/selftests/mm: add MADV_COLLAPSE test case
To ensure the retract_page_tables() logic functions correctly with the
introduction of VM_MAYBE_GUARD, add a test to assert that madvise collapse
fails when guard regions are established in the collapsed range in all
cases.

Unfortunately we cannot differentiate between e.g. 
CONFIG_READ_ONLY_THP_FOR_FS not being set vs.  a file-backed VMA having
collapse correctly disallowed, so in each instance we will get an assert
pass here.

We add an additional check to see whether guard regions are preserved
across collapse in case of a bug causing the collapse to succeed, which
will give us more data to debug with should this occur in future.

Link: https://lkml.kernel.org/r/0748beeb864525b8ddfa51adad7128dd32eb3ac4.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
29bef05e6d tools/testing/vma: add VMA sticky userland tests
Modify the existing 'merge new' and 'merge existing' userland VMA tests to
assert that sticky VMA flags behave as expected.

We do so by generating every possible permutation of the manipulated VMAs
being sticky/not sticky, and asserting that VMA flags with this property
are retained upon merge.

Link: https://lkml.kernel.org/r/5e2c7244485867befd052f8afc8188be6a4be670.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
49e14dabed mm: set the VM_MAYBE_GUARD flag on guard region install
Now we have established the VM_MAYBE_GUARD flag and added the capacity to
set it atomically, do so upon MADV_GUARD_INSTALL.

The places where this flag is used currently and matter are:

* VMA merge - performed under mmap/VMA write lock, therefore excluding
  racing writes.

* /proc/$pid/smaps - can race the write; however, this isn't meaningful,
  as the flag write is performed at the point of the guard region being
  established, and thus an smaps reader can't reasonably expect to avoid
  races.  Due to atomicity, a reader will observe either the flag being
  set or not, so consistency will be maintained.

In all other cases the flag being set is irrelevant and atomicity
guarantees other flags will be read correctly.

Note that non-atomic updates of unrelated flags do not cause an issue with
this flag being set atomically, as writes of other flags are performed
under mmap/VMA write lock, and these atomic writes are performed under
mmap/VMA read lock, which excludes the write, avoiding RMW races.

Note that we do not encounter issues with KCSAN by adjusting this flag
atomically, as we are only updating a single bit in the flag bitmap and
therefore we do not need to annotate these changes.

We intentionally set this flag in advance of actually updating the page
tables, to ensure that any racing atomic read of this flag will only
return false prior to page tables being updated, to allow for
serialisation via page table locks.

Note that we set vma->anon_vma for anonymous mappings.  This is because
the expectation for anonymous mappings is that an anon_vma is established
should they possess any page table mappings.  This is also consistent with
what we were doing prior to this patch (unconditionally setting anon_vma
on guard region installation).

We also need to update retract_page_tables() to ensure that madvise(...,
MADV_COLLAPSE) doesn't incorrectly collapse file-backed ranges that
contain guard regions.

This was previously guarded by anon_vma being set to catch MAP_PRIVATE
cases, but the introduction of VM_MAYBE_GUARD necessitates that we check
this flag instead.

We utilise vma_flag_test_atomic() to do so - we first perform an
optimistic check, then after the PTE page table lock is held, we can check
again safely, as upon guard marker install the flag is set atomically
prior to the page table lock being taken to actually apply it.

So if the initial check fails either:

* Page table retraction acquires page table lock prior to VM_MAYBE_GUARD
  being set - guard marker installation will be blocked until page table
  retraction is complete.

OR:

* Guard marker installation acquires page table lock after setting
  VM_MAYBE_GUARD, which raced and didn't pick this up in the initial
  optimistic check, blocking page table retraction until the guard regions
  are installed - the second VM_MAYBE_GUARD check will prevent page table
  retraction.

Either way we're safe.
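
The double-check boils down to the following shape - a minimal sketch
following the description above, where vma_flag_test_atomic() is the helper
named in this series and the surrounding retraction code is paraphrased
rather than quoted:

    /* Optimistic, lockless check: cheap early-out when no guard regions
     * can possibly be present in this VMA. */
    if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD))
            return false;                   /* not retractable */

    spinlock_t *pml = pmd_lock(mm, pmd);    /* serialises with guard install */

    /* Authoritative re-check: the installer sets the flag *before* taking
     * this lock, so a racing installation is visible here. */
    if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD)) {
            spin_unlock(pml);               /* raced, skip retraction */
            return false;
    }

    /* ... safe to retract the page table ... */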

We refactor the retraction checks into a single
file_backed_vma_is_retractable(), as there doesn't seem to be any reason for
the checks to remain separated as they were before.

Note that VM_MAYBE_GUARD being set atomically remains correct as
vma_needs_copy() is invoked with the mmap and VMA write locks held,
excluding any race with madvise_guard_install().

Link: https://lkml.kernel.org/r/e9e9ce95b6ac17497de7f60fc110c7dd9e489e8d.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
ab04b530e7 mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one
Gather all the VMA flags whose presence implies that page tables must be
copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this
rather than specifying individual flags in vma_needs_copy().

We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies
that there may be metadata contained in the page tables (that is - guard
markers) which will not and cannot be propagated upon fork.

This was already being done manually previously in vma_needs_copy(), but
this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and
VM_UFFD_WP all of which imply the same.
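
A minimal sketch of the resulting shape, using the flags listed above (the
exact bitmap definition and the remainder of vma_needs_copy() are simplified
here):

    /* Flags whose presence means the page table contents cannot simply be
     * refaulted in the child and therefore must be copied on fork. */
    #define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD)

    static bool vma_needs_copy(struct vm_area_struct *dst_vma,
                               struct vm_area_struct *src_vma)
    {
            if (src_vma->vm_flags & VM_COPY_ON_FORK)
                    return true;
            if (src_vma->anon_vma)
                    return true;
            return false;
    }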

Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too
- because equally a flag being VM_STICKY indicates that the VMA contains
metadata that is not propagated by being faulted in - i.e.  that the VMA
metadata does not fully describe the VMA alone, and thus we must propagate
whatever metadata there is on a fork.

However, for maximum flexibility, we do not make this necessarily the case
here.

Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
64212ba02e mm: implement sticky VMA flags
It is useful to be able to designate that certain flags are 'sticky', that
is, if two VMAs are merged, one with a flag of this nature and one without,
the merged VMA sets this flag.

As a result we ignore these flags for the purposes of determining VMA flag
differences between VMAs being considered for merge.

This patch therefore updates the VMA merge logic to perform this action,
with flags possessing this property being described in the VM_STICKY
bitmap.

Those flags which ought to be ignored for the purposes of VMA merge are
described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also
updated to use.

As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it
already had this behaviour, alongside VM_STICKY as sticky flags by
implication must not disallow merge.

Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its
own right, but this change is out of scope for this series.

The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result
of this change, once the VMA flag is set upon guard region installation,
VMAs with guard ranges will now not have their merge behaviour impacted as
a result and can be freely merged with other VMAs without VM_MAYBE_GUARD
set.
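
Roughly, the effect on the merge logic can be sketched as follows (the
bitmaps follow the commit text, the helpers are illustrative):

    #define VM_STICKY        VM_MAYBE_GUARD          /* the only sticky flag so far */
    #define VM_IGNORE_MERGE  (VM_SOFTDIRTY | VM_STICKY)

    /* Differences in ignored bits no longer block a merge... */
    static bool vma_flags_mergeable(vm_flags_t a, vm_flags_t b)
    {
            return !((a ^ b) & ~VM_IGNORE_MERGE);
    }

    /* ...and any sticky bit on either side survives into the merged VMA. */
    static vm_flags_t vma_merged_flags(vm_flags_t target, vm_flags_t other)
    {
            return target | (other & VM_STICKY);
    }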

Also update the comments for vma_modify_flags() to directly reference
sticky flags now we have established the concept.

We also update the VMA userland tests to account for the changes.

Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
9119d6c209 mm: update vma_modify_flags() to handle residual flags, document
The vma_modify_*() family of functions each either perform splits, a merge
or no changes at all in preparation for the requested modification to
occur.

When doing so for a VMA flags change, we currently don't account for any
flags which may remain (for instance, VM_SOFTDIRTY) despite the requested
change in the case that a merge succeeded.

This is made more important by subsequent patches which will introduce the
concept of sticky VMA flags which rely on this behaviour.

This patch fixes this by passing the VMA flags parameter as a pointer and
updating it accordingly on merge, and updating callers to accommodate
this.
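
A rough sketch of the resulting calling convention (the argument list is
abridged and illustrative):

    vm_flags_t vm_flags = newflags;

    vma = vma_modify_flags(&vmi, prev, vma, start, end, &vm_flags);
    if (IS_ERR(vma))
            return PTR_ERR(vma);

    /* On a successful merge, vm_flags now also carries any residual bits
     * (e.g. VM_SOFTDIRTY or sticky flags) retained by the merged VMA, so
     * the caller applies this value rather than just newflags. */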

Additionally, while we are here, we add kdocs for each of the
vma_modify_*() functions, as the fact that the requested modification is
not performed is confusing so it is useful to make this abundantly clear.

We also update the VMA userland tests to account for this change.

Link: https://lkml.kernel.org/r/23b5b549b0eaefb2922625626e58c2a352f3e93c.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
5688225023 mm: add atomic VMA flags and set VM_MAYBE_GUARD as such
This patch adds the ability to atomically set VMA flags with only the mmap
read/VMA read lock held.

As this could be hugely problematic for VMA flags in general given that
all other accesses are non-atomic and serialised by the mmap/VMA locks, we
implement this with a strict allow-list - that is, only designated flags
are allowed to do this.

We make VM_MAYBE_GUARD one of these flags.
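
A minimal sketch of the allow-list idea, assuming helper and constant names
along these lines (not necessarily the final signatures):

    /* The only flags that may be set with just the mmap/VMA read lock. */
    #define VM_ATOMIC_SET_ALLOWED   VM_MAYBE_GUARD

    static void vma_flag_set_atomic(struct vm_area_struct *vma, int bit)
    {
            /* Catch misuse: every other flag still needs the write lock. */
            VM_WARN_ON_ONCE(!(BIT(bit) & VM_ATOMIC_SET_ALLOWED));

            /* Single-bit atomic RMW; non-atomic updates of other flags are
             * excluded because they hold the mmap/VMA write lock. */
            set_bit(bit, &vma->__vm_flags);
    }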

Link: https://lkml.kernel.org/r/97e57abed09f2663077ed7a36fb8206e243171a9.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Lorenzo Stoakes
5dba5cc2e0 mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps
Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4.

Currently, guard regions are not visible to users except through
/proc/$pid/pagemap, with no explicit visibility at the VMA level.

This makes the feature less useful, as it isn't entirely apparent which
VMAs may have these entries present, especially when performing actions
which walk through memory regions such as those performed by CRIU.

This series addresses this issue by introducing the VM_MAYBE_GUARD flag
which fulfils this role, updating the smaps logic to display an entry for
these.

The semantics of this flag are that a guard region MAY be present if set
(we cannot be sure, as we can't efficiently track whether an
MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if
not set the VMA definitely does NOT have any guard regions present.

It's problematic to establish this flag without further action, because
that means that VMAs with guard regions in them become non-mergeable with
adjacent VMAs for no especially good reason.

To work around this, this series also introduces the concept of 'sticky'
VMA flags - that is flags which:

a. if set in one VMA and not in another still permit those VMAs to be
   merged (if otherwise compatible).

b. When they are merged, the resultant VMA must have the flag set.

The VMA logic is updated to propagate these flags correctly.

Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve
an issue with file-backed guard regions - previously these established an
anon_vma object for file-backed mappings solely to have vma_needs_copy()
correctly propagate guard region mappings to child processes.

We introduce a new flag alias VM_COPY_ON_FORK (which currently only
specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly
for this flag and to copy page tables if it is present, which resolves
this issue.

Additionally, we add the ability for allow-listed VMA flags to be
atomically writable with only mmap/VMA read locks held.

The only flag we allow so far is VM_MAYBE_GUARD, and we carefully ensure
that allowing this does not introduce any races.

This allows us to maintain guard region installation as a read-locked
operation and not endure the overhead of obtaining a write lock here.

Finally we introduce extensive VMA userland tests to assert that the
sticky VMA logic behaves correctly as well as guard region self tests to
assert that smaps visibility is correctly implemented.


This patch (of 9):

Currently, if a user needs to determine if guard regions are present in a
range, they have to scan all VMAs (or have knowledge of which ones might
have guard regions).

Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to
pagemap") and the related commit a516403787e0 ("fs/proc: extend the
PAGEMAP_SCAN ioctl to report guard regions"), users can use either
/proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this
operation at a virtual address level.

This is not ideal, and it gives no visibility at a /proc/$pid/smaps level
that guard regions exist in ranges.

This patch remedies the situation by establishing a new VMA flag,
VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is
uncertain because we cannot reasonably determine whether a
MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and
additionally VMAs may change across merge/split).

We utilise 0x800 for this flag, which makes it available to 32-bit
architectures also; this bit was previously used by VM_DENYWRITE, which
was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't
been reused yet.

We also update the smaps logic and documentation to identify these VMAs.
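
In terms of the definition itself, the change amounts to something like this
(a sketch; the smaps mnemonic shown is purely illustrative):

    /* Bit 11 has been free since VM_DENYWRITE was removed, and exists on
     * 32-bit as well as 64-bit architectures. */
    #define VM_MAYBE_GUARD  0x00000800

    /* Entry in the show_smap_vma_flags() mnemonic table, so the bit shows
     * up in the VmFlags line of /proc/$pid/smaps. */
    [ilog2(VM_MAYBE_GUARD)] = "mg",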

Another major use of this functionality is that we can use it to identify
that we ought to copy page tables on fork.

We do not actually implement usage of this flag in mm/madvise.c yet as we
need to allow some VMA flags to be applied atomically under mmap/VMA read
lock in order to avoid the need to acquire a write lock for this purpose.

Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:58 -08:00
Hui Zhu
cdcb53e1de mm/hugetlb: extract sysctl into hugetlb_sysctl.c
Following the extraction of sysfs code, this patch moves the sysctl
interface implementation into a dedicated file to further improve code
organization and maintainability of the hugetlb subsystem.

The following components are moved to mm/hugetlb_sysctl.c:
- proc_hugetlb_doulongvec_minmax()
- hugetlb_sysctl_handler_common()
- hugetlb_sysctl_handler()
- hugetlb_mempolicy_sysctl_handler() (CONFIG_NUMA)
- hugetlb_overcommit_handler()
- hugetlb_table[] sysctl table definition
- hugetlb_sysctl_init()

The hugetlb_internal.h header file is updated to declare the sysctl
initialization function with proper #ifdef guards for configurations
without CONFIG_SYSCTL support.

The Makefile is updated to compile hugetlb_sysctl.o when CONFIG_HUGETLBFS
is enabled.  This refactoring reduces the size of hugetlb.c and logically
separates the sysctl interface from core hugetlb management code.

MAINTAINERS is updated to add new file hugetlb_sysctl.c.

No functional changes are introduced; all code is moved as-is from
hugetlb.c with consistent formatting.

Link: https://lkml.kernel.org/r/5bbee7ab5be71d0bb1aebec38642d7e83526bb7a.1762398359.git.zhuhui@kylinos.cn
Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
Cc: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Hui Zhu
ecd6703f64 mm/hugetlb: extract sysfs into hugetlb_sysfs.c
Patch series "mm/hugetlb: refactor sysfs/sysctl interfaces", v5.

hugetlb.c has grown significantly and become difficult to maintain.  This
patch series extracts the sysfs and sysctl interface code into separate
dedicated files to improve code organization.

The refactoring includes:
- Patch 1: Extract sysfs interface into mm/hugetlb_sysfs.c
- Patch 2: Extract sysctl interface into mm/hugetlb_sysctl.c

No functional changes are introduced in this series.  The code is moved
as-is, with only minor formatting adjustments for code style consistency. 
This should make future maintenance and enhancements to the hugetlb
subsystem easier.

Testing: The patch series has been compile-tested and maintains the same
functionality as the original code.


This patch (of 2):

Currently, hugetlb.c contains both core management logic and sysfs
interface implementations, making it difficult to maintain.  This patch
extracts the sysfs-related code into a dedicated file to improve code
organization.

The following components are moved to mm/hugetlb_sysfs.c:
- sysfs attribute definitions and handlers
- sysfs kobject management functions
- NUMA per-node hstate attribute registration

Several inline helper functions and macros are moved to
mm/hugetlb_internal.h:
- hstate_is_gigantic_no_runtime()
- next_node_allowed()
- get_valid_node_allowed()
- hstate_next_node_to_alloc()
- hstate_next_node_to_free()
- for_each_node_mask_to_alloc/to_free macros

To support code sharing, these functions are changed from static to
exported symbols:
- remove_hugetlb_folio()
- add_hugetlb_folio()
- init_new_hugetlb_folio()
- prep_and_add_allocated_folios()
- demote_pool_huge_page()
- __nr_hugepages_store_common()

The Makefile is updated to compile hugetlb_sysfs.o when CONFIG_HUGETLBFS
is enabled.  This maintains all existing functionality while improving
maintainability by separating concerns.

MAINTAINERS is updated to add new file hugetlb_sysfs.c.

Link: https://lkml.kernel.org/r/cover.1762398359.git.zhuhui@kylinos.cn
Link: https://lkml.kernel.org/r/656a03dff7e2bb20e24e841ede81fdca01d21410.1762398359.git.zhuhui@kylinos.cn
Signed-off-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Hui Zhu <zhuhui@kylinos.cn>
Cc: David Hildenbrand <david@redhat.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Matthew Wilcox (Oracle)
98be155451 mm: constify __dump_folio() arguments
These arguments aren't modified by the function; mark them as const to
help the compiler.

Link: https://lkml.kernel.org/r/20251106203526.2368275-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Matthew Wilcox (Oracle)
20605eb5bb memory_hotplug: optimise try_offline_memory_block()
Extract the zone number directly from the page instead of using the page's
zone number to look up the zone and asking the zone what its number is.

Link: https://lkml.kernel.org/r/20251106201452.2292631-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Matthew Wilcox (Oracle)
c537f0dd30 migrate: optimise alloc_migration_target()
Extract the zone number directly from the folio instead of using the
folio's zone number to look up the zone and asking the zone what its
number is.
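
Concretely, the pattern being replaced looks roughly like this (call site
abridged):

    enum zone_type zidx;

    /* Before: derive the zone pointer from the folio, then ask the zone
     * for its index - two lookups for a value the folio already encodes. */
    zidx = zone_idx(folio_zone(folio));

    /* After: read the zone index directly from the folio's flag bits. */
    zidx = folio_zonenum(folio);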

[ziy@nvidia.com: fix folio_zonenum() return type]
  Link: https://lkml.kernel.org/r/26E8FF35-503E-4F14-98F7-7B4FA25FBD37@nvidia.com
Link: https://lkml.kernel.org/r/20251106201452.2292631-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Matthew Wilcox (Oracle)
e24f66e87b hugetlb: optimise hugetlb_folio_init_tail_vmemmap()
Extract the zone number directly from the folio instead of using the
folio's zone number to look up the zone and asking the zone what its
number is.

Also, we should use &folio->page instead of casting from folio to page.

Link: https://lkml.kernel.org/r/20251106201452.2292631-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Zeng Chi
135e541ae8 lib/alloc_tag: use %pe format specifier
The %pe format specifier is designed to print error pointers.  It prints a
symbolic error name (e.g.  -EINVAL) and makes the code simpler by omitting
PTR_ERR().

This patch fixes this cocci report:
lib/alloc_tag.c:776:63-70: WARNING: Consider using %pe to print PTR_ERR()
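
The resulting change looks like the following (the message text is
illustrative, not the exact line in lib/alloc_tag.c):

    /* Before: print the raw errno value, going through PTR_ERR(). */
    pr_err("failed to create debugfs file: %ld\n", PTR_ERR(file));

    /* After: %pe prints the symbolic name (e.g. -EINVAL) straight from the
     * error pointer, so PTR_ERR() is no longer needed. */
    pr_err("failed to create debugfs file: %pe\n", file);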

Link: https://lkml.kernel.org/r/20251105023925.1447482-1-zeng_chi911@163.com
Signed-off-by: Zeng Chi <zengchi@kylinos.cn>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Kefeng Wang
340b59816b mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma()
Kill mm_wr_locked since commit f8e97613fed2 ("mm: convert VM_PFNMAP
tracking to pfnmap_track() + pfnmap_untrack()") removed the user.

Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Ankit Khushwaha
3b12a53b64 selftest/mm: fix pointer comparison in mremap_test
Pointer arithmetic with 'void *addr' and 'ulong dest_alignment' triggers
the following warning:

mremap_test.c:1035:31: warning: pointer comparison always evaluates to
false [-Wtautological-compare]
 1035 |                 if (addr + c.dest_alignment < addr) {
      |                                             ^

This warning is raised by clang version 20.1.8 (Fedora 20.1.8-4.fc42).

Use 'void *tmp_addr' to do the pointer arithmetic.
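
A sketch of the warning and the fix, following the commit text (surrounding
test code abridged):

    /* Before: clang assumes pointer arithmetic cannot wrap, so it reports
     * this overflow check as always false. */
    if (addr + c.dest_alignment < addr) { ... }

    /* After: do the arithmetic through a separate pointer first. */
    void *tmp_addr = addr + c.dest_alignment;

    if (tmp_addr < addr) { ... }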

Link: https://lkml.kernel.org/r/20251108161829.25105-1-ankitkhushwaha.linux@gmail.com
Signed-off-by: Ankit Khushwaha <ankitkhushwaha.linux@gmail.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:57 -08:00
Baolin Wang
4f8961b295 mm: vmscan: simplify the folio refcount check in pageout()
Since we no longer attempt to write back filesystem folios in pageout()
(they will be filtered out by the following check in pageout()), and only
tmpfs/shmem folios and anonymous swapcache folios can be written back, we
can remove the redundant folio_test_private() when checking the folio's
refcount, as tmpfs/shmem and swapcache folios do not use the PG_private
flag.

While we're at it, we can open-code the folio refcount check instead of
adding a simple helper that has only one user.

Link: https://lkml.kernel.org/r/4cbbec5bb92397aa4597105f1f499aabf7a1901c.1758166683.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:56 -08:00
Baolin Wang
d94d9293a1 mm: vmscan: remove folio_test_private() check in pageout()
Patch series "some cleanups for pageout()", v2.

Since we no longer attempt to write back filesystem folios in pageout(),
and only tmpfs/shmem folios and anonymous swapcache folios can be written
back, we can remove the redundant folio_test_private() related logic to
simplify the logic of pageout(), as tmpfs/shmem and swapcache folios do
not use the PG_private flag.


This patch (of 2):

The folio_test_private() check in pageout() was introduced by commit
ce91b575332b ("orphaned pagecache memleak fix") in 2005 (checked from a
history tree[1]).  As the commit message mentioned, it was to address the
issue where reiserfs pagecache may be truncated while still pinned.  To
further explain, the truncation removes the page->mapping, but the page is
still listed in the VM queues because it still has buffers.

In 2008, commit a2b345642f530 ("Fix dirty page accounting leak with ext3
data=journal") seems to be dealing with a similar issue, where the page
becomes dirty after truncation, and it provides a very useful call stack:

truncate_complete_page()
      cancel_dirty_page() // PG_dirty cleared, decr. dirty pages
      do_invalidatepage()
        ext3_invalidatepage()
          journal_invalidatepage()
            journal_unmap_buffer()
              __dispose_buffer()
                __journal_unfile_buffer()
                  __journal_temp_unlink_buffer()
                    mark_buffer_dirty(); // PG_dirty set, incr. dirty pages

Commit a2b345642f530 forcefully clears the page's dirty flag during
truncation (in truncate_complete_page()).

Now it seems this was just a peculiar usage specific to reiserfs.  Maybe
reiserfs had some extra refcount on these pages, which caused them to pass
the is_page_cache_freeable() check.

With the fix provided by commit a2b345642f530 and reiserfs being removed
in 2024 by commit fb6f20ecb121 ("reiserfs: The last commit"), such a case
is unlikely to occur again.  So let's remove the redundant
folio_test_private() checks and related buffer_head release logic, and
just leave a warning here to catch such a bug.

[akpm@linux-foundation.org: redo comment, per David]
  Link: https://lkml.kernel.org/r/17d1b293-e393-4989-a357-7eea74b3c805@redhat.com
[baolin.wang@linux.alibaba.com: remove comment and WARNing, per Hugh and others]
  Link: https://lkml.kernel.org/r/392a9ca3-31ac-4447-bd44-3c656d63e4ca@linux.alibaba.com
Link: https://lkml.kernel.org/r/cover.1758166683.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/9ef0e560dc83650bc538eb5dcd1594e112c1369f.1758166683.git.baolin.wang@linux.alibaba.com
Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git [1]
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 13:43:56 -08:00
Tomas Glozar
417bd0d502 tools/rtla: Fix --on-threshold always triggering
Commit 8d933d5c89e8 ("rtla/timerlat: Add continue action") moved the
code performing on-threshold actions (enabled through the --on-threshold
option) inside the RTLA main loop.

The condition in the loop does not check whether the threshold was
actually exceeded or if stop tracing was requested by the user through
SIGINT or duration. This leads to a bug where on-threshold actions are
always performed, even when the threshold was not hit.

(BPF mode is not affected, since it uses a different condition in the
while loop.)

Add a condition that checks for !stop_tracing before executing the
actions. Also, fix incorrect brackets in hist_main_loop to match the
semantics of top_main_loop.
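
In other words, the action block inside the loop gains a guard along these
lines (identifiers are paraphrased from the commit text rather than copied
from the rtla sources):

    /* Only run the --on-threshold actions when this iteration ended because
     * the tracer hit the latency threshold, not because the user stopped
     * the run via SIGINT or --duration. */
    if (!stop_tracing)
            actions_perform(&params->threshold_actions);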

Fixes: 8d933d5c89e8 ("rtla/timerlat: Add continue action")
Fixes: 2f3172f9dd58 ("tools/rtla: Consolidate code between osnoise/timerlat and hist/top")
Reviewed-by: Crystal Wood <crwood@redhat.com>
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20251007095341.186923-1-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-20 13:15:55 +01:00
Tomas Glozar
e4240db933 rtla/timerlat_bpf: Stop tracing on user latency
rtla-timerlat allows a *thread* latency threshold to be set via the
-T/--thread option. However, the timerlat tracer calls this *total*
latency (stop_tracing_total_us), and stops tracing also when the
return-to-user latency is over the threshold.

Change the behavior of the timerlat BPF program to reflect what the
timerlat tracer is doing, to avoid discrepancy between stopping
collecting data in the BPF program and stopping tracing in the timerlat
tracer.

Cc: stable@vger.kernel.org
Fixes: e34293ddcebd ("rtla/timerlat: Add BPF skeleton to collect samples")
Reviewed-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20251006143100.137255-1-tglozar@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-20 13:15:55 +01:00
Costa Shulyupin
b4275b2301 tools/rtla: Fix unassigned nr_cpus
In recently introduced timerlat_free(),
the variable 'nr_cpus' is not assigned.

Assign it with sysconf(_SC_NPROCESSORS_CONF) as done elsewhere.
Remove the culprit: -Wno-maybe-uninitialized. The rest of the
code is clean.
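
That is, timerlat_free() gets the same assignment the other code paths
already use (sketch):

    nr_cpus = sysconf(_SC_NPROCESSORS_CONF);    /* from <unistd.h> */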

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Fixes: 2f3172f9dd58 ("tools/rtla: Consolidate code between osnoise/timerlat and hist/top")
Link: https://lore.kernel.org/r/20251002170846.437888-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-20 13:15:54 +01:00
Costa Shulyupin
671314fce1 tools/rtla: Remove unused optional option_index
The longindex argument of getopt_long() is optional
and tied to the unused local variable option_index.

Remove it to shorten the four longest functions
and make the code neater.

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251002123553.389467-2-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-20 13:15:53 +01:00
Costa Shulyupin
04fa6bf373 tools/rtla: Add for_each_monitored_cpu() helper
The rtla tools have many instances of iterating over CPUs while
checking if they are monitored.

Add a for_each_monitored_cpu() helper macro to make the code
more readable and reduce code duplication.
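
A hypothetical shape for such a helper, assuming a cpu_set_t style mask in
the tool's parameters (the real macro in rtla may differ):

    /* Iterate over all CPUs, but run the body only for monitored ones. */
    #define for_each_monitored_cpu(cpu, params)                         \
            for ((cpu) = 0; (cpu) < nr_cpus; (cpu)++)                   \
                    if (CPU_ISSET((cpu), &(params)->monitored_cpus))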

Signed-off-by: Costa Shulyupin <costa.shul@redhat.com>
Reviewed-by: Tomas Glozar <tglozar@redhat.com>
Link: https://lore.kernel.org/r/20251002123553.389467-1-costa.shul@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
2025-11-20 13:15:53 +01:00
SeongJae Park
80d725f96c mm/damon/tests/core-kunit: remove unused ctx in damon_test_split_regions_of()
damon_test_split_regions_of() dynamically allocates a 'struct damon_ctx'
object, but it is not really being used in the code other than handling
the allocation failure and deallocating it at the end of the function. 
Remove the unnecessary allocation and deallocation of the object.

Link: https://lkml.kernel.org/r/20251101182021.74868-23-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:35 -08:00
SeongJae Park
40b11d1eb1 mm/damon/tests/core-kunit: remove unnecessary damon_ctx variable on damon_test_split_at()
damon_test_split_at() dynamically allocates a 'struct damon_ctx' object,
but it is not really being used in the code other than handling the
allocation failure and deallocating it at the end of the function.  Remove
the unnecessary allocation and deallocation of the object.

Link: https://lkml.kernel.org/r/20251101182021.74868-22-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:35 -08:00
SeongJae Park
7d808bf139 mm/damon/tests/sysfs-kunit: handle alloc failures on damon_sysfs_test_add_targets()
damon_sysfs_test_add_targets() is assuming all dynamic memory allocation
in it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.
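
The shape of these fixes is roughly the following - a sketch using the DAMON
core test helpers, while the exact objects allocated differ from test to
test:

    struct damon_target *t;
    struct damon_region *r;

    t = damon_new_target();
    if (!t)
            kunit_skip(test, "target allocation failed");

    r = damon_new_region(0, 100);
    if (!r) {
            damon_free_target(t);   /* release what was already allocated */
            kunit_skip(test, "region allocation failed");
    }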

Link: https://lkml.kernel.org/r/20251101182021.74868-21-sj@kernel.org
Fixes: b8ee5575f763 ("mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.7+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:35 -08:00
SeongJae Park
0a63a0e757 mm/damon/tests/vaddr-kunit: handle alloc failures on damon_test_split_evenly_succ()
damon_test_split_evenly_succ() is assuming all dynamic memory allocation
in it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-20-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:34 -08:00
SeongJae Park
7890e5b5bb mm/damon/tests/vaddr-kunit: handle alloc failures in damon_test_split_evenly_fail()
damon_test_split_evenly_fail() is assuming all dynamic memory allocation
in it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-19-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:34 -08:00
SeongJae Park
2b22d0fcc6 mm/damon/tests/vaddr-kunit: handle alloc failures on damon_do_test_apply_three_regions()
damon_do_test_apply_three_regions() is assuming all dynamic memory
allocation in it will succeed.  Those are indeed likely in the real use
cases since those allocations are too small to fail, but theoretically
those could fail.  In that case, inappropriate memory access can happen.
Fix it by appropriately cleaning up pre-allocated memory and skipping the
execution of the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-18-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:34 -08:00
SeongJae Park
84be856cc8 mm/damon/tests/core-kunit: handle alloc failures on damon_test_set_filters_default_reject()
damon_test_set_filters_default_reject() is assuming all dynamic memory
allocation in it will succeed.  Those are indeed likely in the real use
cases since those allocations are too small to fail, but theoretically
those could fail.  In that case, inappropriate memory access can happen.
Fix it by appropriately cleaning up pre-allocated memory and skipping the
execution of the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-17-sj@kernel.org
Fixes: 094fb14913c7 ("mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:34 -08:00
SeongJae Park
d14d5671e7 mm/damon/tests/core-kunit: handle alloc failures on damos_test_filter_out()
damos_test_filter_out() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-16-sj@kernel.org
Fixes: 26713c890875 ("mm/damon/core-test: add a unit test for __damos_filter_out()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:33 -08:00
SeongJae Park
3e5c4a1a17 mm/damon/tests/core-kunit: handle alloc failure on damos_test_commit_filter()
damos_test_commit_filter() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-15-sj@kernel.org
Fixes: f6a4a150f1ec ("mm/damon/tests/core-kunit: add damos_commit_filter test")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.18+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:33 -08:00
SeongJae Park
28ab2265e9 mm/damon/tests/core-kunit: handle alloc failures in damon_test_new_filter()
damon_test_new_filter() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-14-sj@kernel.org
Fixes: 2a158e956b98 ("mm/damon/core-test: add a test for damos_new_filter()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.6+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:33 -08:00
SeongJae Park
915a2453d8 mm/damon/tests/core-kunit: handle alloc failure on damon_test_set_attrs()
damon_test_set_attrs() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-13-sj@kernel.org
Fixes: aa13779be6b7 ("mm/damon/core-test: add a test for damon_set_attrs()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:33 -08:00
SeongJae Park
8cf298c01b mm/damon/tests/core-kunit: handle alloc failures in damon_test_update_monitoring_result()
damon_test_update_monitoring_result() is assuming all dynamic memory
allocation in it will succeed.  Those are indeed likely in the real use
cases since those allocations are too small to fail, but theoretically
those could fail.  In that case, inappropriate memory access can happen.
Fix it by appropriately cleaning up pre-allocated memory and skipping the
execution of the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-12-sj@kernel.org
Fixes: f4c978b6594b ("mm/damon/core-test: add a test for damon_update_monitoring_results()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.3+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:32 -08:00
SeongJae Park
74d5969995 mm/damon/tests/core-kunit: handle alloc failures in damon_test_set_regions()
damon_test_set_regions() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-11-sj@kernel.org
Fixes: 62f409560eb2 ("mm/damon/core-test: test damon_set_regions")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:32 -08:00
SeongJae Park
4f835f4e8c mm/damon/tests/core-kunit: handle alloc failures in damon_test_ops_registration()
damon_test_ops_registration() is assuming all dynamic memory allocation in
it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-10-sj@kernel.org
Fixes: 4f540f5ab4f2 ("mm/damon/core-test: add a kunit test case for ops registration")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.19+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:32 -08:00
SeongJae Park
eded254cb6 mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_regions_of()
damon_test_split_regions_of() is assuming all dynamic memory allocation in
it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-9-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:32 -08:00
SeongJae Park
0998d27572 mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_regions_of()
damon_test_merge_regions_of() is assuming all dynamic memory allocation in
it will succeed.  Those are indeed likely in the real use cases since
those allocations are too small to fail, but theoretically those could
fail.  In that case, inappropriate memory access can happen.  Fix it by
appropriately cleaning up pre-allocated memory and skipping the execution of
the remaining tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-8-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:31 -08:00
SeongJae Park
3d443dd29a mm/damon/tests/core-kunit: handle alloc failures on damon_test_merge_two()
damon_test_merge_two() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-7-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:31 -08:00
SeongJae Park
5e80d73f22 mm/damon/tests/core-kunit: handle alloc failures on damon_test_split_at()
damon_test_split_at() is assuming all dynamic memory allocation in it will
succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-6-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:31 -08:00
SeongJae Park
f79f2fc44e mm/damon/tests/core-kunit: handle memory alloc failure from damon_test_aggregate()
damon_test_aggregate() is assuming all dynamic memory allocation in it
will succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-5-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:31 -08:00
SeongJae Park
fafe953de2 mm/damon/tests/core-kunit: handle memory failure from damon_test_target()
damon_test_target() is assuming all dynamic memory allocation in it will
succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-4-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:30 -08:00
SeongJae Park
e16fdd4f75 mm/damon/tests/core-kunit: handle allocation failures in damon_test_regions()
damon_test_regions() is assuming all dynamic memory allocation in it will
succeed.  Those are indeed likely in the real use cases since those
allocations are too small to fail, but theoretically those could fail.  In
that case, inappropriate memory access can happen.  Fix it by appropriately
cleaning up pre-allocated memory and skipping the execution of the remaining
tests in the failure cases.

Link: https://lkml.kernel.org/r/20251101182021.74868-3-sj@kernel.org
Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[5.15+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:30 -08:00
SeongJae Park
b5ab490d85 mm/damon/tests/core-kunit: fix memory leak in damon_test_set_filters_default_reject()
Patch series "mm/damon/tests: fix memory bugs in kunit tests".

DAMON kunit tests were initially written assuming they would be run in
environments that are well controlled and therefore tolerant of transient
test failures and bugs in the test code itself.  The user-mode linux based
manual run of the tests is one example of such an environment.  And the
test code was written to add more test coverage as fast as possible, rather
than to make it safe and reliable.

As a result, the tests ended up with a number of bugs, including real
memory leaks, theoretically unhandled memory allocation failures, and
unused memory allocations.  The allocation failures that are not handled
well are unlikely in the real world, since those allocations are too small
to fail.  But in theory, they can happen and cause inappropriate memory
access.

It is arguable whether bugs in test code can really harm users.  But anyway,
bugs are bugs that need to be fixed.  Fix the bugs one by one.  Also Cc
stable@ for the fixes of memory leak and unhandled memory allocation
failures.  The unused memory allocations are only a matter of memory
efficiency, so not Cc-ing stable@.

The first patch fixes memory leaks in the test code for the DAMON core
layer.

The following fifteen, three, and one patches respectively fix unhandled
memory allocation failures in the test code for the DAMON core layer, the
virtual address space DAMON operation set, and the DAMON sysfs interface,
one test function at a time.

Final two patches remove memory allocations that are correctly deallocated
at the end, but not really being used by any code.


This patch (of 22):

The kunit test function for damos_set_filters_default_reject() allocates
two 'struct damos_filter' objects and does not deallocate them, so the
memory for the two objects is leaked every time the test runs.  Fix this by
deallocating those objects at the end of the test code.
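
A sketch of the fix, assuming the usual DAMOS filter helpers (the filter
types and arguments shown are illustrative):

    struct damos_filter *f1, *f2;

    f1 = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
    f2 = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, false);

    /* ... exercise damos_set_filters_default_reject() ... */

    /* The missing piece: free what the test allocated. */
    damos_destroy_filter(f1);
    damos_destroy_filter(f2);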

Link: https://lkml.kernel.org/r/20251101182021.74868-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251101182021.74868-2-sj@kernel.org
Fixes: 094fb14913c7 ("mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject()")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Higgins <brendan.higgins@linux.dev>
Cc: David Gow <davidgow@google.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <stable@vger.kernel.org>	[6.16+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:30 -08:00
Ankit Agrawal
ebb9aeb980 vfio/nvgrace-gpu: register device memory for poison handling
The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel. 
The device memory pages are not backed by struct page.  The previous patch
implements the mechanism to handle ECC/poison on memory pages without
struct page.  This new mechanism is used here.

The module registers its memory region and the address_space with the
kernel MM for ECC handling using the register_pfn_address_space()
registration API exposed by the kernel.
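
From the driver side the registration looks roughly like the following; the
structure layout and field names here are assumptions for illustration, with
only register_pfn_address_space() being named by this series:

    /* Hypothetical bookkeeping object: the device memory PFN interval plus
     * the address_space MM should consult to find mappings on poison. */
    struct pfn_address_space pfn_space = {
            .node.start = mem_spa >> PAGE_SHIFT,
            .node.last  = (mem_spa + mem_size - 1) >> PAGE_SHIFT,
            .mapping    = mapping,          /* the device file's mapping */
    };

    ret = register_pfn_address_space(&pfn_space);
    if (ret)
            return ret;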

Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1]
Link: https://lkml.kernel.org/r/20251102184434.2406-4-ankita@nvidia.com
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Acked-by: Alex Williamson <alex@shazbot.org>
Cc: Aniket Agashe <aniketa@nvidia.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: Smita Koralahalli Channabasappa <smita.koralahallichannabasappa@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tarun Gupta <targupta@nvidia.com>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:30 -08:00
Ankit Agrawal
2ec4196718 mm: handle poisoning of pfn without struct pages
Poison (or ECC) errors can be very common on a large size cluster.  The
kernel MM currently does not handle ECC errors / poison on a memory region
that is not backed by struct pages.  If a memory region mapped using
remap_pfn_range() for example, but not added to the kernel, MM will not
have associated struct pages.  Add a new mechanism to handle memory
failure on such memory.

Make kernel MM expose a function that allows modules managing device
memory to register the device memory SPA and the address space associated
with it.  MM maintains this information as an interval tree.  On poison,
MM can search for the range to which the poisoned PFN belongs and use the
address_space to determine the mapping VMA.

In this implementation, kernel MM follows the following sequence that is
largely similar to the memory_failure() handler for struct page backed
memory:

1. memory_failure() is triggered on reception of a poison error.  An
   absence of struct page is detected and consequently
   memory_failure_pfn() is executed.

2. memory_failure_pfn() collects the processes mapped to the PFN.

3. memory_failure_pfn() sends SIGBUS to all the processes mapping the
   faulty PFN using kill_procs().
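
A condensed, illustrative sketch of that sequence; apart from
memory_failure() itself, the helper names below are assumptions:

/* Sketch only, not the exact implementation. */
static int memory_failure_pfn(unsigned long pfn, int flags)
{
        struct address_space *mapping;
        LIST_HEAD(to_kill);

        /* 1. Find the registered range covering the PFN (interval tree). */
        mapping = pfn_space_find_mapping(pfn);          /* assumed helper */
        if (!mapping)
                return -ENXIO;

        /* 2. Collect the processes mapping the faulty PFN. */
        collect_procs_pfn(mapping, pfn, &to_kill);      /* assumed helper */

        /* 3. SIGBUS the owners; unmapping is deliberately skipped, see the
         *    note below. */
        kill_procs_pfn(&to_kill, pfn, flags);           /* assumed helper */
        return 0;
}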

Note that there is one primary difference versus the handling of poison
on struct pages: unmapping of the faulty PFN is skipped.  This is done to
handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP
vmas to map at PMD or PUD level.  A poison to a PFN mapped in such a way
would require breaking the PMD/PUD mapping into PTEs that will get
mirrored into the S2.  This can greatly increase the cost of table walks
and have a major performance impact.

Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]
Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Cc: Aniket Agashe <aniketa@nvidia.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: Smita Koralahalli Channabasappa <smita.koralahallichannabasappa@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tarun Gupta <targupta@nvidia.com>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:29 -08:00
Ankit Agrawal
30d0a12910 mm: change ghes code to allow poison of non-struct pfn
Poison (or ECC) errors can be very common on a large cluster.  The kernel
MM currently handles ECC errors / poison only on memory pages backed by
struct page.  The handling is currently missing for PFNMAP memory that
does not have struct pages.  The series adds such support.

Implement new ECC handling for memory without struct pages.  Kernel MM
exposes registration APIs to allow modules that are managing the device
to register its device memory region.  MM then tracks such regions using
an interval tree.

The mechanism is largely similar to that of ECC on pfns with struct
pages.  If there is an ECC error on a pfn, all the mappings to it are
identified and a SIGBUS is sent to the user space processes owning those
mappings.  Note that there is one primary difference versus the handling
of poison on struct pages: unmapping of the faulty PFN is skipped.  This
is done to handle the huge PFNMAP support added recently [1] that enables
VM_PFNMAP vmas to map at PMD or PUD level.  A poison to a PFN mapped in
such a way would require breaking the PMD/PUD mapping into PTEs that will
get mirrored into the S2.  This can greatly increase the cost of table
walks and have a major performance impact.

The nvgrace-gpu-vfio-pci module maps the device memory to user VA (Qemu)
using remap_pfn_range() without it being added to the kernel [2].  These
device memory PFNs are not backed by struct page.  So make the
nvgrace-gpu-vfio-pci module use this mechanism to get poison handling
support on the device memory.


This patch (of 3):

The GHES code allows calling memory_failure() on PFNs that pass the
pfn_valid() check.  This contract is broken for remapped PFNs, which fail
the check, so ghes_do_memory_failure() returns without triggering
memory_failure().

Update the code to allow the memory_failure() call on PFNs that fail
pfn_valid().
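
Roughly, ghes_do_memory_failure() stops treating a failing pfn_valid()
as fatal (sketch, not the literal diff):

pfn = PHYS_PFN(physical_addr);

/* The early bail-out for !pfn_valid(pfn) is dropped (sketch);
 * memory_failure() can now handle PFNs without struct page via the
 * registered PFN ranges. */
memory_failure_queue(pfn, flags);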

Link: https://lkml.kernel.org/r/20251102184434.2406-1-ankita@nvidia.com
Link: https://lkml.kernel.org/r/20251102184434.2406-2-ankita@nvidia.com
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: Aniket Agashe <aniketa@nvidia.com>
Cc: Ankit Agrawal <ankita@nvidia.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Smita Koralahalli Channabasappa <smita.koralahallichannabasappa@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tarun Gupta <targupta@nvidia.com>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:29 -08:00
Baokun Li
ee040cbd6e mm/page_alloc: don't warn about large allocations with __GFP_NOFAIL
Filesystems use __GFP_NOFAIL to allocate block-sized folios for metadata
reads at critical points, since they cannot afford to go read-only, shut
down, or enter an inconsistent state due to memory pressure.

Currently, attempting to allocate page units greater than order-1 with the
__GFP_NOFAIL flag triggers a WARN_ON() in __alloc_pages_slowpath(). 
However, filesystems supporting large block sizes (blocksize > PAGE_SIZE)
can easily require allocations larger than order-1.

As Matthew Wilcox noted in [1], if we have a filesystem with 64KiB
sectors, there will be many clean folios in the page cache that are 64KiB
or larger.  He also explained in [2] why kvmalloc isn't a valid approach
here.

With gfp flags and order already included in the OOM report, both
Vlastimil Babka and Michal Hocko suggested that we can take the risk of
removing this warning first and then observe whether a large number of
related OOM reports appear.

If that happens, we can consider adding special handling in other places.
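
In other words, the patch drops a check of roughly this shape from the
__GFP_NOFAIL path in __alloc_pages_slowpath() (sketch, not the literal
diff):

if (gfp_mask & __GFP_NOFAIL) {
        /*
         * Removed (sketch): filesystems with block size > PAGE_SIZE can
         * legitimately need order > 1 nofail allocations, and the OOM
         * report already includes the gfp flags and order.
         */
        WARN_ON(order > 1);
}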

Link: https://lkml.kernel.org/r/20251105085652.4081123-1-libaokun@huaweicloud.com
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Link: https://lore.kernel.org/all/aQPX1-XWQjKaMTZB@casper.infradead.org [1]
Link: https://lore.kernel.org/all/aQTHMI3t5mNXp0M1@casper.infradead.org [2]
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Link: https://lore.kernel.org/all/188a95ba-6384-4319-bb74-c0d9ec6c4079@suse.cz
Suggested-by: Michal Hocko <mhocko@suse.com>
Link: https://lore.kernel.org/all/aQotQBjnDDeL_wHx@tiehlicka
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: ErKun Yang <yangerkun@huawei.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: "zhangyi (F)" <yi.zhang@huawei.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:29 -08:00
Zhang Chujun
77a7cfd96c mm/debug: fix missing space in case statement
In setup_vm_debug(), the case statement for the 'p' option is written as
'case'p':' without a space between 'case' and the character constant.
While this is syntactically valid C, it violates the Linux kernel coding
style, which requires a space after 'case'.  This patch adds the missing
space to comply with coding standards.
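
Concretely, the change is a single added space in the case label:

        case'p':        /* before: valid C, but against kernel coding style */
        case 'p':       /* after: space after 'case' */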

Link: https://lkml.kernel.org/r/20251103065910.2196-1-zhangchujun@cmss.chinamobile.com
Signed-off-by: Zhang Chujun <zhangchujun@cmss.chinamobile.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:29 -08:00
Pedro Demarchi Gomes
05c3fa9c9f ksm: replace function unmerge_ksm_pages with break_ksm
Function unmerge_ksm_pages() is unnecessary since break_ksm() now walks an
address range, so replace it with break_ksm().

Link: https://lkml.kernel.org/r/20251105184912.186329-4-pedrodemargomes@gmail.com
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:28 -08:00
Pedro Demarchi Gomes
5d4939fc22 ksm: perform a range-walk in break_ksm
Make break_ksm() receive an address range and change break_ksm_pmd_entry()
to perform a range-walk and return the address of the first ksm page
found.

This change allows break_ksm() to skip unmapped regions instead of
iterating every page address.  When unmerging large sparse VMAs, this
significantly reduces runtime.

In a benchmark unmerging a 32 TiB sparse virtual address space where only
one page was populated, the runtime dropped from 9 minutes to less than 5
seconds.
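
A simplified sketch of the interface change (the exact signatures are
assumptions, not the literal patch):

/* Before: one call per page address. */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr,
                     bool lock_vma);

/* After: one call per range; the underlying walk skips unmapped holes
 * and stops at the first KSM page found. */
static int break_ksm(struct vm_area_struct *vma, unsigned long start,
                     unsigned long end, bool lock_vma);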

Link: https://lkml.kernel.org/r/20251105184912.186329-3-pedrodemargomes@gmail.com
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:28 -08:00
Pedro Demarchi Gomes
912aa82595 Revert "mm/ksm: convert break_ksm() from walk_page_range_vma() to folio_walk"
Patch series "ksm: perform a range-walk to jump over holes in break_ksm",
v4.

When unmerging an address range, the unmerge_ksm_pages() function walks
every page address in the specified range to locate ksm pages.  This
becomes highly inefficient when scanning large virtual memory areas that
contain mostly unmapped regions, causing the process to get blocked for
several minutes.

This patch makes break_ksm(), the function called by unmerge_ksm_pages()
for every page in an address range, perform a range walk, allowing it to
skip over entire unmapped holes in a VMA and avoid unnecessary lookups.

As pointed out by David Hildenbrand in [1], unmerge_ksm_pages() is called
from:

* ksm_madvise() through madvise(MADV_UNMERGEABLE).  There are not a lot
  of users of that function.

* __ksm_del_vma() through ksm_del_vmas().  Effectively called when
  disabling KSM for a process either through the sysctl or from s390x gmap
  code when enabling storage keys for a VM.

Consider the following test program which creates a 32 TiB mapping in the
virtual address space but only populates a single page:

#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>

/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

int main() {
        char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);

        if (area == MAP_FAILED) {
                perror("mmap() failed\n");
                return -1;
        }

        /* Populate a single page such that we get an anon_vma. */
        *area = 0;

        /* Enable KSM. */
        madvise(area, size, MADV_MERGEABLE);
        madvise(area, size, MADV_UNMERGEABLE);
        return 0;
}


Without this patch, this program takes 9 minutes to finish, while with
this patch it finishes in less than 5 seconds.


This patch (of 3):

This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
function break_ksm_pmd_entry() to use folios.

This reverts break_ksm() to use walk_page_range_vma() instead of
folio_walk_start().

Change break_ksm_pmd_entry() to call is_ksm_zero_pte() only if we know the
folio is present, and also rename variable ret to found.  This will make
it easier to later modify break_ksm() to perform a proper range walk.

Link: https://lkml.kernel.org/r/20251105184912.186329-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251105184912.186329-2-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/e0886fdf-d198-4130-bd9a-be276c59da37@redhat.com/ [1]
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:28 -08:00
Israel Batista
ed1f8855dd mm: change type of parameter for memory_notify
memory_notify() is responsible for sending events related to memory
hotplugging to a notification queue.  Since all the events must match one
of the values from the enum memory_block_state, it is appropriate to
change the function parameter type to make this condition explicit at
compile time.
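
In effect the prototype tightens from an unsigned long to the enum
(sketch based on the description above):

/* Before (sketch): any unsigned long value was accepted. */
int memory_notify(unsigned long val, void *v);

/* After (sketch): only memory_block_state values are accepted, so
 * misuse becomes visible at compile time. */
int memory_notify(enum memory_block_state state, void *v);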

Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:28 -08:00
Israel Batista
8bc7ba3d26 mm: change type of state in struct memory_block
The state of a memory block should be restricted to values specified in
the documentation of the memory hotplug API.  However, since the state
field in the memory_block struct was defined as an unsigned long, this
restriction was not enforced at compile time.

With the introduction of the enum memory_block_state, it is now possible
to incorporate the desired semantics in the field declaration and enforce
these restrictions at compile time.

[akpm@linux-foundation.org: fix whitespace, per Randy]
Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:27 -08:00
Israel Batista
1a4f70f685 mm: convert memory block states (MEM_*) macros to enum
Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2.

The MEM_* constants indicating the state of a memory block are currently
defined as macros, meaning their definitions will be omitted from the
debuginfo on most kernel builds.  This makes it harder for debuggers to
correctly map the block state at runtime, which can be quite useful when
analysing errors related to memory hot plugging and unplugging with tools
such as drgn.

Converting the constants to an enum ensures the correct information is
emitted by the compiler and available for the debugger, without needing to
hard-code them into the debugger and track their changes.

This patch series aims to replace the current macros with a newly created
enum named memory_block_state, while also taking advantage of the compile
time guarantees that we get when using enums.

The first patch does the conversion of the macros to an enum, while the
2nd and 3rd patches use this enum to clean up some type declarations and
make sure that only valid values are used.


This patch (of 3):

Converting the MEM_* constants from macros to an enum ensures that their
values will be correctly emitted in the debug symbols, making it easier to
trace the meaning of each value when debugging with tools such as drgn,
without the need to hard-code the values.

Since the values are mutually exclusive and they are not exposed directly
to userspace, I also dropped the misleading pattern (1<<X) that made it
look like they were combinable flags.
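
A sketch of the resulting definition; the member names follow the
existing MEM_* macros, and the ordering and values here are illustrative:

enum memory_block_state {
        MEM_ONLINE,
        MEM_GOING_OFFLINE,
        MEM_OFFLINE,
        MEM_GOING_ONLINE,
        MEM_CANCEL_ONLINE,
        MEM_CANCEL_OFFLINE,
};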

Link: https://lkml.kernel.org/r/20251029195617.2210700-1-linux@israelbatista.dev.br
Link: https://lkml.kernel.org/r/20251029195617.2210700-2-linux@israelbatista.dev.br
Signed-off-by: Israel Batista <linux@israelbatista.dev.br>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:27 -08:00
Baoquan He
52f37efc59 mm/swap: select swap device with default priority round robin
Swap devices are assumed to have similar access speed when swapped on if
no priority is specified.  It is unfair and doesn't make sense that, just
because one swap device is swapped on first, its priority is higher than
the one swapped on later.

Here, set all swap devices to have priority '-1' by default.  With this
change, swap devices with default priority will be selected round robin
when swapping out.  This can improve the swapping efficiency a lot among
multiple swap devices with default priority.

Below is the swapon output captured while the high pressure
vm-scalability test is running:

1) This is pre-commit a2468cc9bfdf, where swap devices are selected one by
   one by priority from high to low as each one is exhausted:
------------------------------------
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE   USED PRIO
/dev/zram0 partition  16G    16G   -1
/dev/zram1 partition  16G 966.2M   -2
/dev/zram2 partition  16G     0B   -3
/dev/zram3 partition  16G     0B   -4

2) This is the behaviour with commit a2468cc9bfdf: on each node, the swap
   device sharing the node's id is selected first until exhausted; on
   nodes with no swap device sharing the node id, the one with the highest
   priority is selected first until exhausted:
------------------------------------
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE  USED PRIO
/dev/zram0 partition  16G 15.7G   -2
/dev/zram1 partition  16G  3.4G   -3
/dev/zram2 partition  16G  3.4G   -4
/dev/zram3 partition  16G  2.6G   -5

3) After this patch is applied, swap devices with default priority are
   selected round robin:
------------------------------------
[root@hp-dl385g10-03 block]# swapon
NAME       TYPE      SIZE USED PRIO
/dev/zram0 partition  16G 6.6G   -1
/dev/zram1 partition  16G 6.6G   -1
/dev/zram2 partition  16G 6.6G   -1
/dev/zram3 partition  16G 6.6G   -1

With the change, there is about an 18% efficiency improvement relative to
the node-based way, as shown below.  (Surely, the pre-commit a2468cc9bfdf
way is the worst.)

vm-scalability test:
==================
Test with:
usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap)
                            one by one:      node based:      round robin:
System time:                1087.38 s        637.92 s         526.74 s     (lower is better)
Sum Throughput:             2036.55 MB/s     3546.56 MB/s     4207.56 MB/s (higher is better)
Single process Throughput:  65.69 MB/s       114.40 MB/s      135.72 MB/s  (higher is better)
free latency:               15769409.48 us   10138455.99 us   6810119.01 us (lower is better)

Link: https://lkml.kernel.org/r/20251028034308.929550-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Kairui Song <kasong@tencent.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:27 -08:00
Baoquan He
8e689f8ea4 mm/swap: do not choose swap device according to numa node
Patch series "mm/swapfile.c: select swap devices of default priority round
robin", v5.

Currently, on a system with multiple swap devices, swap allocation will
select one swap device according to priority.  The swap device with the
highest priority will be chosen for allocation first.

People can specify a priority from 0 to 32767 when swapping on a swap
device, or the system will set it from -2 downwards by default.  Meanwhile,
on a NUMA system, the swap device matching a node's id will be considered
first on that NUMA node.

In the current code, an array of plist, swap_avail_heads[nid], is used to
organize swap devices on each NUMA node.  For each NUMA node, there is a
plist organizing all swap devices.  The 'prio' value in the plist is the
negated value of the device's priority due to plist being sorted from low
to high.  The swap device owning one node_id will be promoted to the front
position on that NUMA node, then other swap devices are put in order of
their default priority.

E.g., I have a system with 8 NUMA nodes, and I set up 4 zram partitions
as swap devices.

Current behaviour:
their priorities will be (note that -1 is skipped):
NAME       TYPE      SIZE USED PRIO
/dev/zram0 partition  16G   0B   -2
/dev/zram1 partition  16G   0B   -3
/dev/zram2 partition  16G   0B   -4
/dev/zram3 partition  16G   0B   -5

And their positions in the 8 swap_avail_lists[nid] will be:
swap_avail_lists[0]: /* node 0's available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:1     prio:3     prio:4     prio:5
swap_avail_lists[1]: /* node 1's available swap device list */
zram1   -> zram0   -> zram2   -> zram3
prio:1     prio:2     prio:4     prio:5
swap_avail_lists[2]: /* node 2's available swap device list */
zram2   -> zram0   -> zram1   -> zram3
prio:1     prio:2     prio:3     prio:5
swap_avail_lists[3]: /* node 3's available swap device list */
zram3   -> zram0   -> zram1   -> zram2
prio:1     prio:2     prio:3     prio:4
swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:2     prio:3     prio:4     prio:5

The adjustment for swap devices with a node_id was intended to decrease
lock contention pressure on one swap device by using a different swap
device on each node.  The adjustment was introduced in commit
a2468cc9bfdf ("swap: choose swap device according to numa node").
However, the adjustment is a little coarse-grained.  On a node, the swap
device sharing the node's id will always be selected first by the node's
CPUs until exhausted, then the next one.  And on other nodes where no swap
device shares the node id, the swap device with priority '-2' will be
selected first until exhausted, then the next with priority '-3'.

This is the swapon output captured while the high pressure vm-scalability
test is running.  It clearly shows that zram0 is heavily exploited until
exhausted.

===================================
[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE  USED PRIO
/dev/zram0 partition  16G 15.7G   -2
/dev/zram1 partition  16G  3.4G   -3
/dev/zram2 partition  16G  3.4G   -4
/dev/zram3 partition  16G  2.6G   -5

The node-based strategy for selecting swap devices is much better than
the old way of selecting swap devices one by one.  However it is still
unreasonable, because swap devices are assumed to have similar access
speed if no priority is specified when swapping on.  It's unfair and
doesn't make sense that, just because one swap device is swapped on first,
its priority is higher than the one swapped on later.

So in this patchset, a change is made to select swap devices round robin
when they have the default priority.  In code, the plist array
swap_avail_heads[nid] is replaced with a plist swap_avail_head, which
reverts commit a2468cc9bfdf.  Meanwhile, on top of the revert, a further
change is made so that any device without a specified priority gets the
same default priority '-1'.  Surely, swap devices with a specified
priority are always put foremost; this is not impacted.  If you care
about their different access speeds, then use 'swapon -p xx' to set
priorities for your swap devices.

New behaviour:

swap_avail_list: /* one global available swap device list */
zram0   -> zram1   -> zram2   -> zram3
prio:1     prio:1     prio:1     prio:1

This is the swapon output captured while the high pressure vm-scalability
test is running; all devices are selected round robin:
=======================================
[root@hp-dl385g10-03 linux]# swapon
NAME       TYPE      SIZE  USED PRIO
/dev/zram0 partition  16G 12.6G   -1
/dev/zram1 partition  16G 12.6G   -1
/dev/zram2 partition  16G 12.6G   -1
/dev/zram3 partition  16G 12.6G   -1

With the change, we can see about an 18% efficiency improvement as below:

vm-scalability test:
==================
Test with:
usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap)
                           Before:          After:
System time:               637.92 s         526.74 s      (lower is better)
Sum Throughput:            3546.56 MB/s     4207.56 MB/s  (higher is better)
Single process Throughput: 114.40 MB/s      135.72 MB/s   (higher is better)
free latency:              10138455.99 us   6810119.01 us (lower is better)


This patch (of 2):

This reverts commit a2468cc9bfdf ("swap: choose swap device according to
numa node").

After this patch, the behaviour will change back to pre-commit
a2468cc9bfdf.  That means the priority will be set from -1 downwards by
default, and when swapping, swap devices will be exhausted one by one
according to priority from high to low.  This is preparation work for the
later change.

[root@hp-dl385g10-03 ~]# swapon
NAME       TYPE      SIZE   USED PRIO
/dev/zram0 partition  16G    16G   -1
/dev/zram1 partition  16G 966.2M   -2
/dev/zram2 partition  16G     0B   -3
/dev/zram3 partition  16G     0B   -4

Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com
Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Suggested-by: Chris Li <chrisl@kernel.org>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Kairui Song <kasong@tencent.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:27 -08:00
Jiayuan Chen
6af766c86b mm: vmstat: output reserved_highatomic and free_highatomic in zoneinfo
nr_free_highatomic is a key factor in calculating watermarks as it
affects the free pages count.  Adding this metric, along with
nr_reserved_highatomic, to /proc/zoneinfo facilitates easier diagnosis of
memory watermark calculations and memory pressure states.

Sample output:
cat /proc/zoneinfo
......
pagesets
cpu: 0
		  count:    52069
		  high:     52675
		  batch:    63
		  high_min: 13971
		  high_max: 62284
vm stats threshold: 10
node_unreclaimable:  0
start_pfn:           4096
reserved_highatomic: 5120
free_highatomic:     2081

Link: https://lkml.kernel.org/r/20251027141818.283587-1-jiayuan.chen@linux.dev
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:26 -08:00
Anshuman Khandual
272239dc8f mm: make INVALID_PHYS_ADDR a generic macro
INVALID_PHYS_ADDR has very similar definitions across the code base.
Hence just move it into the header <linux/mm.h> for more generic usage.
Also drop the now redundant ones which are no longer required.
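
For reference, one common shape of such a definition (the exact value
used by the generic version is an assumption here):

#define INVALID_PHYS_ADDR       (~(phys_addr_t)0)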

Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>	[s390]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:26 -08:00
Lorenzo Stoakes
b734b9d973 mm/vma: small VMA lock cleanups
We declare vma_start_read() as a static function in mm/mmap_lock.c, so
there is no need to provide a stub for !CONFIG_PER_VMA_LOCK.

__is_vma_write_locked() is declared in a header and should therefore be
static inline.

Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to
make precedence clear.

Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam Howlett <liam.howlett@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:26 -08:00
Harry Yoo
ad8b2e0961 treewide: include linux/pgalloc.h instead of asm/pgalloc.h
For now, including <asm/pgalloc.h> instead of <linux/pgalloc.h> is
technically fine unless the .c file calls p*d_populate_kernel() helper
functions.

But it is a better practice to always include <linux/pgalloc.h>.  Include
<linux/pgalloc.h> instead of <asm/pgalloc.h> outside arch/.
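
That is, outside arch/ the preferred form is simply:

#include <linux/pgalloc.h>      /* rather than <asm/pgalloc.h> */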

Link: https://lkml.kernel.org/r/20251024113047.119058-3-harry.yoo@oracle.com
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:25 -08:00
Harry Yoo
5e0fa7ed98 MAINTAINERS: add include/linux/pgalloc.h to MM CORE section
Patch series "mm: MISC follow-up patches for linux/pgalloc.h", v2.

This is a follow-up patch series for the patch series named: "[PATCH V5
mm-hotfixes 0/3] mm, x86: fix crash due to missing page table sync and
make it harder to miss".


This patch (of 2):

Since include/linux/pgtable.h is already listed in the MM CORE section,
add include/linux/pgalloc.h to the section as well to keep it maintained
by the appropriate maintainers.

Link: https://lkml.kernel.org/r/20251024113047.119058-1-harry.yoo@oracle.com
Link: https://lkml.kernel.org/r/20251024113047.119058-2-harry.yoo@oracle.com
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:25 -08:00
SeongJae Park
809ba69f9f selftests/damon/sysfs: add obsolete_target test
A new DAMON sysfs file for pin-point target removal, namely
obsolete_target, has been added.  Add a test for the functionality.  It
starts DAMON with three monitoring target processes, marks the one in the
middle as obsolete, commits it, and confirms the internal DAMON status is
updated to remove the target in the middle.

Link: https://lkml.kernel.org/r/20251023012535.69625-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:25 -08:00
SeongJae Park
65a9033db7 sysfs.py: extend assert_ctx_committed() for monitoring targets
assert_ctx_committed() does not assert monitoring targets commitment,
since all existing callers of the function assume no target changes.
Extend it for future usage.

Link: https://lkml.kernel.org/r/20251023012535.69625-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:25 -08:00
SeongJae Park
a00f18abef drgn_dump_damon_status: dump damon_target->obsolete
A new field of damon_target for pin-point target removal, namely obsolete,
has been added.  Extend drgn_dump_damon_status.py to dump it, for easily
writing future DAMON selftests for it.

Link: https://lkml.kernel.org/r/20251023012535.69625-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:24 -08:00
SeongJae Park
badfa4361c selftests/damon/_damon_sysfs: support obsolete_target file
A DAMON sysfs file, namely obsolete_target, has been newly introduced.
Add support for that file to _damon_sysfs.py so that DAMON selftests for
the file can be easily written.

Link: https://lkml.kernel.org/r/20251023012535.69625-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:24 -08:00
SeongJae Park
9abe8d0519 Docs/ABI/damon: document obsolete_target sysfs file
Update DAMON ABI document for the newly added obsolete_target DAMON sysfs
file.

Link: https://lkml.kernel.org/r/20251023012535.69625-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:24 -08:00
SeongJae Park
e06469cdf1 Docs/admin-guide/mm/damon/usage: document obsolete_target file
Document the newly added obsolete_target DAMON sysfs file.

Link: https://lkml.kernel.org/r/20251023012535.69625-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:24 -08:00
SeongJae Park
e35afdf228 mm/damon/sysfs: implement obsolete_target file
There is no good way to remove DAMON targets in the middle of the existing
targets list.  This restricts efficient and flexible DAMON use cases.
Improve the usability by implementing a new DAMON sysfs interface file,
namely obsolete_target, under each target directory.  It is connected to
the obsolete field of the parameters commit-source targets, so it allows
removing arbitrary targets in the middle of the existing targets list.
Note that the sysfs files are not automatically updated.  For example,
let's suppose there are three targets in the running context, and a user
removes the third target using this feature.  If the user writes 'commit'
to the kdamond 'state' file again, DAMON sysfs interface will again try to
remove the third target.  But because there is no matching target in the
running context, the commit will fail.  It is the user's responsibility to
understand the resulting change to DAMON's internal targets list, and to
construct the sysfs files (using nr_targets and other sysfs files) to
correctly represent it.

Also note that this is arguably an improvement rather than a fix of broken
things.

Link: https://lkml.kernel.org/r/20251023012535.69625-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reported-by: Bijan Tabatabai <bijan311@gmail.com>
Closes: https://github.com/damonitor/damo/issues/36
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:23 -08:00
SeongJae Park
60bd24f272 mm/damon/sysfs: test commit input against realistic destination
The DAMON sysfs interface tests whether a given online parameters update
request is valid by committing it, using the DAMON kernel API, to a
test-purpose destination context.  The test-purpose destination context is
constructed using damon_new_ctx(), so it has no targets and no schemes.

If a source target has the obsolete field set, the test-purpose commit
will fail because damon_commit_targets() fails when a source obsolete
target cannot find its matching destination target.  The DAMON sysfs
interface does not let users set the field for now, so there is no
problem.  However, the following commit will support that.  There could
also be similar future changes that make the commit fail based on the
current context structure.

Make the test purpose commit destination context similar to the current
running one, by committing the running one to the test purpose context,
before doing the real test-purpose commit.

Link: https://lkml.kernel.org/r/20251023012535.69625-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:23 -08:00
SeongJae Park
adf7d6cdd7 mm/damon/core: add damon_target->obsolete for pin-point removal
Patch series "mm/damon: support pin-point targets removal".

DAMON maintains the targets in a list, and allows committing only an
entire list of targets having the new parameters.  Targets having the
same index on the lists are treated as matching source and destination
targets.  If an existing target cannot find a matching one in the
sources list, the target is removed.  This means that there is no way to
remove only a specific monitoring target in the middle of the current
targets list.

Such pin-point target removal is really needed in some use cases,
though.  Monitoring access patterns on virtual address spaces of
processes that spawned from the same ancestor is one example.  If a
process of the group is terminated, the user may want to remove the
matching DAMON target as soon as possible, to save in-kernel memory
usage for the unnecessary target data.  The user may also want to do
that without turning DAMON off or removing unnecessary targets, to keep
the current monitoring results for other active processes.

Extend DAMON kernel API and sysfs ABI to support the pin-point removal
in the following way.  For API, add a new damon_target field, namely
'obsolete'.  If the field on parameters commit source target is set, it
means the matching destination target is obsolete.  Then the parameters
commit logic removes the destination target from the existing targets
list.  For sysfs ABI, add a new file under the target directory, namely
'obsolete_target'.  It is connected with the 'obsolete' field of the
commit source targets, so internally using the new API.

Also add a selftest for the new feature.  The related helper scripts for
manipulating the sysfs interface and dumping in-kernel DAMON status are
also extended for this.  Note that the selftest part was initially
posted as an individual RFC series [1], but now merged into this one.

Bijan Tabatabai has originally reported this issue, and participated in
this solution design on a GitHub issue [1] for DAMON user-space tool.


This patch (of 9):

DAMON's monitoring targets parameters update function,
damon_commit_targets(), does not provide a way to remove a target in the
middle of the existing targets list.  Extend the API by adding a field to
struct damon_target.  If the field of a damon_commit_targets() source
target is set, it indicates that the matching target on the existing
targets list is obsolete.  damon_commit_targets() understands that and
removes those from the list, while respecting the index-based matching
for other non-obsolete targets.
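
A sketch of the API-level idea; the field placement and the commit-side
handling shown here are assumptions for illustration:

/* Sketch only. */
struct damon_target {
        struct pid *pid;
        unsigned int nr_regions;
        struct list_head regions_list;
        struct list_head list;
        bool obsolete;  /* new: remove the index-matched destination */
};

/* In damon_commit_targets() (sketch): a source target with
 * obsolete == true causes its index-matched destination target to be
 * destroyed instead of being updated. */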

Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org
Link: https://github.com/damonitor/damo/issues/36 [1]
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Bijan Tabatabai <bijan311@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:23 -08:00
Dmitry Ilvokhin
e97d7c5165 mm: shmem/tmpfs hugepage defaults config choice
Allow overriding defaults for shmem and tmpfs at config time.  This is
consistent with how transparent hugepages can be configured.

The same results can be achieved with the existing
'transparent_hugepage_shmem' and 'transparent_hugepage_tmpfs' settings on
the kernel command line, but it is more convenient to define basic
settings at config time instead of changing the kernel command line later.

Defaults for shmem and tmpfs were not changed.  They remain the same as
before: 'never' for both cases.  Options 'deny' and 'force' are omitted
intentionally since these are special values supposed to be used for
emergencies or testing, and are not expected to be permanent ones.

Primary motivation for adding config option is to enable policy
enforcement at build time.  In large-scale production environments (Meta's
for example), the kernel configuration is often maintained centrally close
to the kernel code itself and owned by the kernel engineers, while boot
parameters are managed independently (e.g.  by provisioning systems).  In
such setups, the kernel build defines the supported and expected behavior
in a single place, but there is no reliable or uniform control over the
kernel command line options.

A build-time default allows kernel integrators to enforce a predictable
hugepage policy for shmem/tmpfs on a base layer, ensuring reproducible
behavior and avoiding configuration drift caused by possible boot-time
differences.

In short, primary benefit is mostly operational: it provides a way to
codify preferred policy in the kernel configuration, which is versioned,
reviewed, and tested as part of the kernel build process, rather than
depending on potentially variable boot parameters.

[d@ilvokhin.com: v2]
  Link: https://lkml.kernel.org/r/aQECPpjd-fU_TC79@shell.ilvokhin.com
Link: https://lkml.kernel.org/r/aPpv8sAa2sYgNu3L@shell.ilvokhin.com
Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Kiryl Shutsemau <kas@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:23 -08:00
SeongJae Park
f46dbea0d9 Docs/admin-guide/mm/damon/stat: document negative idle time
Commit a983a26d5298 ("mm/damon/stat: expose negative idle time")
introduced the negative idle time feature for DAMON_STAT.  But it is not
documented.  Document it on the usage document.

Link: https://lkml.kernel.org/r/20251026182216.118200-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:22 -08:00
SeongJae Park
da8644a476 Docs/admin-guide/mm/damon/stat: document aggr_interval_us parameter
Commit cc7ceb1d14b0 ("mm/damon/stat: expose the current tuned aggregation
interval") introduced the 'aggr_interval_us' parameter for DAMON_STAT.
But the new parameter is not yet documented.  Document it on the usage
document for the module.

Link: https://lkml.kernel.org/r/20251026182216.118200-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:22 -08:00
SeongJae Park
448666e418 Docs/admin-guide/mm/damon/lru_sort: document addr_unit parameter
Commit 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for
DAMON_LRU_SORT") introduced the 'addr_unit' parameter for DAMON_LRU_SORT. 
But the usage document is not updated for that.  Update the document.

Link: https://lkml.kernel.org/r/20251026182216.118200-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:22 -08:00
SeongJae Park
bb01656e00 Docs/admin-guide/mm/damon/reclaim: document addr_unit parameter
Commit 7db551fcfb2a ("mm/damon/reclaim: support addr_unit for
DAMON_RECLAIM") introduced the 'addr_unit' parameter for DAMON_RECLAIM. 
But the usage document is not updated for that.  Update the document.

Link: https://lkml.kernel.org/r/20251026182216.118200-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:22 -08:00
SeongJae Park
29221406f0 Docs/admin-guide/mm/damon/usage: document empty target regions commit behavior
Committing a monitoring target with empty target regions is for keeping
the current monitoring results.  This behavior was introduced by commit
973233600676 ("mm/damon/sysfs: update monitoring target regions for online
input commit").  The behavior is not documented, though.  Update the usage
document for clarifying this behavior.

Link: https://lkml.kernel.org/r/20251026182216.118200-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:21 -08:00
SeongJae Park
a01386c16d Docs/admin-guide/mm/damon/stat: fix a typo: s/sampling events/sampling interval/
It is a contextual typo.  Fix it.

Link: https://lkml.kernel.org/r/20251026182216.118200-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:21 -08:00
SeongJae Park
d7484f6edd Docs/mm/damon/design: fix wrong link to intervals goal section
Commit b243d666d107 ("Docs/admin-guide/mm/damon/usage: add intervals_goal
directory on the hierarchy") mistakenly added a wrong reference for
intervals goal usage documentation on the design document.  Fix it.

Link: https://lkml.kernel.org/r/20251026182216.118200-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:21 -08:00
SeongJae Park
50ca642364 mm/damon/core: fix wrong comment of damon_call() return timing
Patch series "mm/damon: misc documentation fixups".

First three patches fix up issues in the documents, including wrong
explanation of a behavior, wrong link, and a contextual typo.  Following
five patches update documents for not yet documented features and
behaviors.


This patch (of 8):

damon_call() works asynchronously and synchronously for repeat and
non-repeat mode requests, respectively.  The comment about the behavior is
wrong, though.  Fix it.

The wrong comment was introduced together with the repeat mode, by commit
43df7676e550 ("mm/damon/core: introduce repeat mode damon_call()").

Link: https://lkml.kernel.org/r/20251026182216.118200-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251026182216.118200-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:21 -08:00
Kairui Song
4fd58b51ef mm, swap: remove redundant argument for isolating a cluster
The order argument was introduced by an intermediate commit and was then
never used; just remove it.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-5-a709469052e7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:20 -08:00
Kairui Song
ab61de9b78 mm/migrate, swap: drop usage of folio_index
This helper was used when swap cache was mixed with page cache.  Now they
are completely separate from each other, access to the swap cache is all
wrapped by the swap_cache_* helpers, which expect the folio's swap entry
as a parameter.

This helper is no longer used, remove the last redundant user and drop it.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-4-a709469052e7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:20 -08:00
Kairui Song
a983471cfc mm, swap: cleanup swap entry allocation parameter
We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap:
use the swap table for the swap cache and switch API").  Before that
commit, the GFP parameter was already almost identical for all callers, so
nothing was changed by that commit.  The swap table just moved the GFP to
a lower layer and made it better defined, with changes depending on atomic
or sleeping allocation.

Now this parameter is no longer used, so just remove it.  No behavior
change.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:20 -08:00
Kairui Song
e4adea27b9 mm, swap: rename helper for setup bad slots
The name inc_cluster_info_page is very confusing, as this helper is only
used during swapon to mark bad slots.  Rename it properly and turn the
VM_BUG_ON in it into WARN_ON to expose more potential issues.  Swapon is a
cold path, so adding more checks should be a good idea.

No feature change except the new WARN_ON.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-2-a709469052e7@tencent.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:20 -08:00
Kairui Song
9fb749cd15 mm, swap: do not perform synchronous discard during allocation
Patch series "mm, swap: misc cleanup and bugfix", v2.

A few cleanups and a bugfix that are either suitable after the swap table
phase I or found during code review.

Patch 1 is a bugfix and needs to be included in the stable branch, the
rest have no behavioral change.


This patch (of 5):

Since commit 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation
fast path"), swap allocation is protected by a local lock, which means we
can't make any sleeping calls during allocation.

However, the discard routine was not handled as carefully.  When the swap
allocator fails to find any usable cluster, it looks at the pending
discard clusters and tries to issue blocking discards.  That may not
always sleep, but the cond_resched() in the bio layer shows it is wrong in
combination with a local lock, and the GFP flag used for the discard bio
is also wrong (not atomic).

It's arguable whether this synchronous discard is helpful at all.  In most
cases, the async discard is good enough, and since the recent changes the
swap allocator organizes clusters very differently, so it is very rare to
see discard clusters piling up.

So far, no issues have been observed or reported with typical SSD setups
under months of high pressure; this issue was found during code review.
However, after hacking the kernel a bit (adding an mdelay(500) in the
async discard path), it becomes observable on debug builds as WARNINGs
triggered by the wrong GFP and the cond_resched() in the bio layer.

So apply a hotfix for this issue: remove the synchronous discard from the
swap allocation path.  When an order-0 allocation fails with all cluster
lists drained on all swap devices, try a discard following the swap device
priority list instead, and if any discard releases a cluster, retry the
allocation.  This way, we can still avoid OOM due to swap failure if the
hardware is very slow and memory pressure is extremely high.
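
For illustration, the retry flow described above looks roughly like the
sketch below.  The walk uses the priority-ordered swap device list, but
the helper names swap_try_pending_discard() and cluster_alloc_swap_entry()
are placeholders, not the actual allocator symbols:

    /* Order-0 allocation failed with every cluster list drained. */
    plist_for_each_entry(si, &swap_active_head, list) {     /* priority order */
            if (!swap_try_pending_discard(si))      /* may sleep; local lock dropped */
                    continue;
            /* A discard released at least one cluster: retry the allocation. */
            entry = cluster_alloc_swap_entry(si, 0);
            if (entry.val)
                    break;
    }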

This may cause more fragmentation issues if the discarding hardware is
really slow.  Ideally, we want to discard pending clusters before
continuing to iterate the fragment cluster lists.  This can be implemented
in a cleaner way if we clean up the device list iteration part first.

Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-0-a709469052e7@tencent.com
Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-1-c5b0e1092927@tencent.com
Fixes: 1b7e90020eb7 ("mm, swap: use percpu cluster as allocation fast path")
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Acked-by: Chris Li <chrisl@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:19 -08:00
Wei Yang
f0b1602871 mm/huge_memory: optimize old_order derivation during folio splitting
Folio splitting requires both the folio's original order (@old_order) and
the new target order (@split_order).

In the current implementation, @old_order is repeatedly retrieved using
folio_order().

However, for every iteration after the first, the folio being split is the
result of the previous split, meaning its order is already known to be
equal to the previous iteration's @split_order.

This commit optimizes the logic:

  * Instead of calling folio_order(), we now set @old_order directly to
    the value of @split_order from the previous iteration.

This change avoids unnecessary function calls and simplifies the loop
setup.

Also it removes a check for non-existent case, since for uniform splitting
we only split when @split_order == @new_order.
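
Schematically, the loop goes from re-reading the order every round to
carrying it forward; this is only a sketch of the shape of the change,
with placeholder loop condition and variables, not the exact kernel code:

    unsigned int old_order = folio_order(folio);    /* read once, up front */

    while (more_splits_needed) {                    /* placeholder condition */
            __split_folio_to_order(folio, old_order, split_order);

            /* The folio handled in the next round is a result of this
             * split, so its order is already known without calling
             * folio_order() again: */
            old_order = split_order;
            folio = next_folio_to_split;            /* the one containing @split_at */
    }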

Link: https://lkml.kernel.org/r/20251021212142.25766-5-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:19 -08:00
Wei Yang
fc4f15ee0b mm/huge_memory: optimize and simplify folio stat update after split
The loop executed after a successful folio split currently has two
combined responsibilities:

  * updating statistics for the new folios
  * determining the folio for the next split iteration.

This commit refactors the logic to directly calculate and update folio
statistics, eliminating the need for the iteration step.

We can do this because all necessary information is already available:

  * All resulting new folios have the same order, which is @split_order.
  * The exact number of new folios can be calculated directly using
    @old_order and @split_order.
  * The folio for the subsequent split is simply the one containing
    @split_at.

By leveraging this knowledge, we can achieve the stat update more cleanly
and efficiently without the looping logic.
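
For illustration, once both orders are known the bookkeeping is plain
arithmetic (a sketch, not the exact code):

    /* One split step turns a single order-@old_order folio into
     * 2^(old_order - split_order) folios of order @split_order ... */
    long nr_new_folios = 1L << (old_order - split_order);

    /* ... e.g. splitting an order-9 folio down to order 7 yields
     * 1 << 2 = 4 folios, and the folio for the next split step is
     * simply the one containing @split_at. */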

Link: https://lkml.kernel.org/r/20251021212142.25766-4-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:19 -08:00
Wei Yang
092ef38997 mm/huge_memory: update folio stat after successful split
The current implementation of the post-split folio stat update is
unnecessarily convoluted:

  * It iterates over the resulting new folios.
  * It uses a flag (@stop_split) to conditionally skip updating the stat
    for the folio at @split_at during the loop.
  * It then attempts to update the skipped stat on a subsequent failure
    path.

This logic is unnecessarily hard to follow.

This commit refactors the code to update the folio statistics only after a
successful split.  This makes the logic much cleaner and sets the stage
for further simplification of the stat-handling code.

Link: https://lkml.kernel.org/r/20251021212142.25766-3-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:19 -08:00
Wei Yang
ef0258857d mm/huge_memory: avoid reinvoking folio_test_anon()
Patch series "mm/huge_memory: cleanup __split_unmapped_folio()", v3.

This patch series cleans up and optimizes the internal logic of the
__split_unmapped_folio() function.

The goal is to improve clarity and efficiency by eliminating redundant
checks, caching stable attribute values, and simplifying the iteration
logic used for updating folio statistics.

These changes make the code easier to follow and maintain.

The split_huge_page_test selftest passes.


This patch (of 4):

During the execution of __split_unmapped_folio(), the folio's anon/!anon
attribute is invariant (not expected to change).

Therefore, it is safe and more efficient to retrieve this attribute once
at the start and reuse it throughout the function.
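
The idea is simply to latch the attribute once, along the lines of this
sketch:

    /* anon vs. !anon cannot change while the folio is being split,
     * so read it a single time ... */
    const bool is_anon = folio_test_anon(folio);

    /* ... and use 'is_anon' everywhere the function previously called
     * folio_test_anon(folio) again. */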

Link: https://lkml.kernel.org/r/20251021212142.25766-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251016004613.514-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251016004613.514-2-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20251021212142.25766-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: wang lian <lianux.mm@gmail.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Nico Pache <npache@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:18 -08:00
Anshuman Khandual
eca1fba233 mm/debug_vm_pgtable: add [pte|pmd]_mkwrite_novma() tests
Add some [pte|pmd]_mkwrite_novma() relevant tests.

[anshuman.khandual@arm.com: add a new test combination per Huang Ying]
  Link: https://lkml.kernel.org/r/20251024013137.136926-1-anshuman.khandual@arm.com
Link: https://lkml.kernel.org/r/20251022032951.3498553-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:18 -08:00
Lu Baolu
e37d5a2d60 iommu/sva: invalidate stale IOTLB entries for kernel address space
Introduce a new IOMMU interface to flush IOTLB paging cache entries for
the CPU kernel address space.  This interface is invoked from the x86
architecture code that manages combined user and kernel page tables,
specifically before any kernel page table page is freed and reused.

This addresses the main issue with vfree(), which is a common occurrence
and can be triggered by unprivileged users.  While this resolves the
primary problem, it doesn't address an extremely rare case related to
unplugging memory that was present as reserved memory at boot, which
cannot be triggered by unprivileged users.  The discussion can be found at
the link below.

Enable SVA on x86 architecture since the IOMMU can now receive
notification to flush the paging cache before freeing the CPU kernel page
table pages.

Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Suggested-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:18 -08:00
Dave Hansen
5ba2f0a155 mm: introduce deferred freeing for kernel page tables
This introduces a conditional asynchronous mechanism, enabled by
CONFIG_ASYNC_KERNEL_PGTABLE_FREE.  When enabled, this mechanism defers the
freeing of pages that are used as page tables for kernel address mappings.
These pages are now queued to a work struct instead of being freed
immediately.

This deferred freeing allows for batch-freeing of page tables, providing a
safe context for performing a single expensive operation (TLB flush) for a
batch of kernel page tables instead of performing that expensive operation
for each page table.
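
Conceptually this is a lock-free list drained from a workqueue, roughly as
in the sketch below; the function and field names, and the batched flush
call, are assumptions rather than the actual symbols:

    static LLIST_HEAD(kernel_pgtable_free_list);

    static void kernel_pgtable_free_work_fn(struct work_struct *work)
    {
            struct llist_node *batch = llist_del_all(&kernel_pgtable_free_list);

            /* One expensive flush covers the whole batch ... */
            flush_tlb_and_iommu_for_kernel_pgtables();      /* hypothetical */

            /* ... then every queued ptdesc goes back to the allocator. */
            free_ptdesc_batch(batch);                       /* hypothetical */
    }
    static DECLARE_WORK(kernel_pgtable_free_work, kernel_pgtable_free_work_fn);

    void pagetable_free_kernel_deferred(struct ptdesc *pt)  /* hypothetical */
    {
            llist_add(&pt->pt_free_llist, &kernel_pgtable_free_list);  /* field assumed */
            schedule_work(&kernel_pgtable_free_work);
    }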

Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:18 -08:00
Lu Baolu
bf9e4e30f3 x86/mm: use pagetable_free()
The kernel's memory management subsystem provides a dedicated interface,
pagetable_free(), for freeing page table pages.  Update two call sites to
use pagetable_free() instead of the lower-level __free_page() or
free_pages().  This improves code consistency and clarity, and ensures the
correct freeing mechanism is used.

Link: https://lkml.kernel.org/r/20251022082635.2462433-7-baolu.lu@linux.intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:17 -08:00
Dave Hansen
0189429567 mm: introduce pure page table freeing function
The pages used for ptdescs are currently freed back to the allocator in a
single location.  They will shortly be freed from a second location.

Create a simple helper that just frees them back to the allocator.
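
The helper is essentially of this shape (a sketch; the real name and
details may differ):

    static void __pagetable_free(struct ptdesc *pt)
    {
            struct page *page = ptdesc_page(pt);

            __free_pages(page, compound_order(page));
    }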

Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:17 -08:00
Dave Hansen
412d000346 x86/mm: use 'ptdesc' when freeing PMD pages
There are a billion ways to refer to a physical memory address.  One of
the x86 PMD freeing code locations chooses to use a 'pte_t *' to point to
a PMD page and then calls a PTE-specific freeing function for it.  That's
a bit wonky.

Just use a 'struct ptdesc *' instead.  Its entire purpose is to refer to
page table pages.  It also means being able to remove an explicit cast.

Right now, pte_free_kernel() is a one-liner that calls
pagetable_dtor_free().  Effectively, all this patch does is remove one
superfluous __pa(__va(paddr)) conversion and then call
pagetable_dtor_free() directly instead of through a helper.
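
The shape of the change is roughly the following; this is illustrative
only, not the exact hunk:

    -       pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
    -
    -       pte_free_kernel(&init_mm, pte);
    +       struct ptdesc *ptdesc = page_ptdesc(pmd_page(*pmd));
    +
    +       pagetable_dtor_free(ptdesc);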

Link: https://lkml.kernel.org/r/20251022082635.2462433-5-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:17 -08:00
Dave Hansen
977870522a mm: actually mark kernel page table pages
Now that the API is in place, mark kernel page table pages just after they
are allocated.  Unmark them just before they are freed.

Note: Unconditionally clearing the 'kernel' marking (via
ptdesc_clear_kernel()) would be functionally identical to what is here. 
But having the if() makes it logically clear that this function can be
used for kernel and non-kernel page tables.

Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:17 -08:00
Dave Hansen
27bfafac65 mm: add a ptdesc flag to mark kernel page tables
The page tables used to map the kernel and userspace often have very
different handling rules.  There are frequently *_kernel() variants of
functions just for kernel page tables.  That's not great and has lead to
code duplication.

Instead of having completely separate call paths, allow a 'ptdesc' to be
marked as being for kernel mappings.  Introduce helpers to set and clear
this status.

Note: this uses the PG_referenced bit.  Page flags are a great fit for
this since it is truly a single bit of information.  Use PG_referenced
itself because it's a fairly benign flag (as opposed to things like
PG_lock).  It's also (according to Willy) unlikely to go away any time
soon.

PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE.  It does not need to be
cleared before freeing the page, and pages coming out of the allocator
should have it cleared.  Regardless, introduce an API to clear it anyway. 
Having symmetry in the API makes it easier to change the underlying
implementation later, like if there was a need to move to a
PAGE_FLAGS_CHECK_AT_FREE bit.
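
For illustration, the helpers boil down to setting, clearing and testing
PG_referenced on the ptdesc's page; apart from ptdesc_clear_kernel(),
mentioned elsewhere in the series, the names below are assumed:

    static inline void ptdesc_set_kernel(struct ptdesc *pt)
    {
            __set_bit(PG_referenced, &ptdesc_page(pt)->flags);
    }

    static inline void ptdesc_clear_kernel(struct ptdesc *pt)
    {
            __clear_bit(PG_referenced, &ptdesc_page(pt)->flags);
    }

    static inline bool ptdesc_test_kernel(struct ptdesc *pt)
    {
            return test_bit(PG_referenced, &ptdesc_page(pt)->flags);
    }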

Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:16 -08:00
Lu Baolu
72f98ef9a4 iommu: disable SVA when CONFIG_X86 is set
Patch series "Fix stale IOTLB entries for kernel address space", v7.

This proposes a fix for a security vulnerability related to IOMMU Shared
Virtual Addressing (SVA).  In an SVA context, an IOMMU can cache kernel
page table entries.  When a kernel page table page is freed and
reallocated for another purpose, the IOMMU might still hold stale,
incorrect entries.  This can be exploited to cause a use-after-free or
write-after-free condition, potentially leading to privilege escalation or
data corruption.

This solution introduces a deferred freeing mechanism for kernel page
table pages, which provides a safe window to notify the IOMMU to
invalidate its caches before the page is reused.


This patch (of 8):

In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware
shares and walks the CPU's page tables.  The x86 architecture maps the
kernel's virtual address space into the upper portion of every process's
page table.  Consequently, in an SVA context, the IOMMU hardware can walk
and cache kernel page table entries.

The Linux kernel currently lacks a notification mechanism for kernel page
table changes, specifically when page table pages are freed and reused. 
The IOMMU driver is only notified of changes to user virtual address
mappings.  This can cause the IOMMU's internal caches to retain stale
entries for kernel VA.

Use-After-Free (UAF) and Write-After-Free (WAF) conditions arise when
kernel page table pages are freed and later reallocated.  The IOMMU could
misinterpret the new data as valid page table entries.  The IOMMU might
then walk into attacker-controlled memory, leading to arbitrary physical
memory DMA access or privilege escalation.  This is also a
Write-After-Free issue, as the IOMMU will potentially continue to write
Accessed and Dirty bits to the freed memory while attempting to walk the
stale page tables.

Currently, SVA contexts are unprivileged and cannot access kernel
mappings.  However, the IOMMU will still walk kernel-only page tables all
the way down to the leaf entries, where it realizes the mapping is for the
kernel and errors out.  This means the IOMMU still caches these
intermediate page table entries, making the described vulnerability a real
concern.

Disable SVA on x86 architecture until the IOMMU can receive notification
to flush the paging cache before freeing the CPU kernel page table pages.

Link: https://lkml.kernel.org/r/20251022082635.2462433-1-baolu.lu@linux.intel.com
Link: https://lkml.kernel.org/r/20251022082635.2462433-2-baolu.lu@linux.intel.com
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:16 -08:00
Shakeel Butt
5ff592bec7 memcg: manually uninline __memcg_memory_event
__memcg_memory_event() has been unnecessarily marked inline even when it
is not really performance critical.  It is usually called to track extreme
conditions.  Over the time, it has evolved to include more functionality
and inlining it is causing more harm.

Before the patch:
$ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o
   text    data     bss     dec     hex filename
  35645   10574    4192   50411    c4eb mm/memcontrol.o
  54738    1658       0   56396    dc4c net/ipv4/tcp_input.o
  34644    1065       0   35709    8b7d net/ipv4/tcp_output.o

After the patch:
$ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o
   text    data     bss     dec     hex filename
  35137   10446    4192   49775    c26f mm/memcontrol.o
  54322    1562       0   55884    da4c net/ipv4/tcp_input.o
  34492    1017       0   35509    8ab5 net/ipv4/tcp_output.o

[akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph]
Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:16 -08:00
Vishal Moola (Oracle)
a061578043 mm/vmalloc: request large order pages from buddy allocator
Sometimes, vm_area_alloc_pages() will want many pages from the buddy
allocator.  Rather than making requests to the buddy allocator for at most
100 pages at a time, we can eagerly request large order pages a smaller
number of times.

We still split the large order pages down to order-0 as the rest of the
vmalloc code (and some callers) depend on it.  We still defer to the bulk
allocator and fallback path in case of order-0 pages or failure.
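
Conceptually the allocation loop now looks something like this sketch
(simplified; not the exact vm_area_alloc_pages() code):

    while (nr_allocated < nr_pages) {
            unsigned int order = min_t(unsigned int, max_order,
                                       ilog2(nr_pages - nr_allocated));
            struct page *page;

            if (order) {
                    page = alloc_pages_node(nid, gfp | __GFP_NOWARN, order);
                    if (page) {
                            /* vmalloc and some callers still expect order-0 pages. */
                            split_page(page, order);
                            for (i = 0; i < (1U << order); i++)
                                    pages[nr_allocated++] = page + i;
                            continue;
                    }
            }
            /* Otherwise fall back to the bulk allocator / order-0 path. */
            break;
    }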

Running 1000 iterations of allocations on a small 4GB system finds:

1000 2MB allocations:
	[Baseline]			[This patch]
	real    46.310s			real    34.582s
	user    0.001s			user    0.006s
	sys     46.058s			sys     34.365s

10000 200KB allocations:
	[Baseline]			[This patch]
	real    56.104s			real    43.696s
	user    0.001s			user    0.003s
	sys     55.375s			sys     42.995s

Link: https://lkml.kernel.org/r/20251021194455.33351-2-vishal.moola@gmail.com
Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:15 -08:00
Jing Su
645a3c4243 mm/vmstat: fix indentation in fold_diff function
Adjust misaligned braces in fold_diff() to improve code readability and
maintain consistent coding style.

[akpm@linux-foundation.org: add braces, per Vlastimil & Liam]
Link: https://lkml.kernel.org/r/aPc4I/8zXCGyiapN@pilot-ThinkCentre-M930t-N000
Signed-off-by: Jing Su <jingsusu@didiglobal.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:15 -08:00
William Kucharski
fe62415c9b mm: remove reference to destructor in comment in calculate_sizes()
The commit that removed support for destructors from kmem_cache_create()
never removed the comment regarding destructors in the explanation of the
possible relocation of the free pointer in calculate_sizes().

Link: https://lkml.kernel.org/r/20251021110004.2209008-1-william.kucharski@oracle.com
Fixes: 20c2df83d25c ("mm: Remove slab destructors from kmem_cache_create().")
Signed-off-by: William Kucharski <william.kucharski@oracle.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Christoph Lameter (Ampere) <cl@gentwo.org>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:15 -08:00
Leon Hwang
2da6fe91c2 mm/khugepaged: factor out common logic in [scan,alloc]_sleep_millisecs_store()
Both scan_sleep_millisecs_store() and alloc_sleep_millisecs_store()
perform the same operations: parse the input value, update their
respective sleep interval, reset khugepaged_sleep_expire, and wake up the
khugepaged thread.

Factor out this duplicated logic into a helper function
__sleep_millisecs_store(), and simplify both store functions.

No functional change intended.
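
A sketch of the factored-out helper, based on the description above (the
exact signature may differ):

    static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
                                           unsigned int *millisecs)
    {
            unsigned int msecs;
            int err;

            err = kstrtouint(buf, 10, &msecs);
            if (err)
                    return -EINVAL;

            *millisecs = msecs;
            khugepaged_sleep_expire = 0;
            wake_up_interruptible(&khugepaged_wait);

            return count;
    }

Each store handler then becomes a one-line wrapper passing its own
interval, e.g. &khugepaged_scan_sleep_millisecs or
&khugepaged_alloc_sleep_millisecs.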

Link: https://lkml.kernel.org/r/20251021134431.26488-1-leon.hwang@linux.dev
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Nico Pache <npache@redhat.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:15 -08:00
Swaraj Gaikwad
fae4e86024 mm/damon/sysfs: remove misleading todo comment in nid_show()
The TODO comment in nid_show() suggested returning an error if the goal
was not using nid.  However, this comment was found to be inaccurate and
misleading.  This patch removes the TODO comment without changing any
existing behavior.

This change follows feedback from SJ, who pointed out [1] that wiring-order
independence is expected and the function should simply show the last set
value, and [2] that checkpatch.pl complains about the number of characters
per line.

No functional code changes were made.

Tested with KUnit:
- Built kernel with KUnit and DAMON sysfs tests enabled.
- Executed KUnit tests:
  ./tools/testing/kunit/kunit.py run --kunitconfig ./mm/damon/tests/
- All 25 tests passed, including damon_sysfs_test_add_targets.

Link: https://lkml.kernel.org/r/20251021215323.29734-2-swarajgaikwad1925@gmail.com
Link: https://lore.kernel.org/lkml/20251020151315.66260-1-sj@kernel.org/ [1]
Link: https://lore.kernel.org/lkml/20251021010847.68473-1-sj@kernel.org/ [2]
Signed-off-by: Swaraj Gaikwad <swarajgaikwad1925@gmail.com>
Suggested-by: SeongJae Park <sj@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: David Hunter <david.hunter.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:14 -08:00
Mehdi Ben Hadj Khelifa
f0c74b6cb9 mm/vmalloc: use kmalloc_array() instead of kmalloc()
The number of NUMA nodes (nr_node_ids) is bounded, so overflow is not a
practical concern here.  However, using kmalloc_array() better reflects
the intent to allocate an array of unsigned ints, and improves consistency
with other NUMA-related allocations.

No functional change intended.
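
In essence the change is of the following form (illustrative; the actual
variable name differs):

    -   ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
    +   ptr = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);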

Link: https://lkml.kernel.org/r/20251018201207.27441-1-mehdi.benhadjkhelifa@gmail.com
Signed-off-by: Mehdi Ben Hadj Khelifa <mehdi.benhadjkhelifa@gmail.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Khalid Aziz <khalid@kernel.org>
Cc: David Hunter <david.hunter.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:14 -08:00
Bagas Sanjaya
184c753342 vmalloc: separate gfp_mask adjunctive parentheses in __vmalloc_node_noprof() kernel-doc comment
Sphinx reports htmldocs warning on __vmalloc_node() comment:

Documentation/core-api/mm-api:52: ./mm/vmalloc.c:4036: WARNING: Inline strong start-string without end-string. [docutils]

Fix it by separating adjunctive parentheses from preceding gfp_mask
formatting markup.

Link: https://lkml.kernel.org/r/20251020044933.15222-1-bagasdotme@gmail.com
Fixes: 32904ba6f5ef ("vmalloc: update __vmalloc_node_noprof() documentation")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251020134902.3a11107e@canb.auug.org.au/
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:14 -08:00
Lorenzo Stoakes
8247e2600e mm: update resctl to use mmap_prepare
Make use of the ability to specify a remap action within mmap_prepare to
update the resctl pseudo-lock to use mmap_prepare in favour of the
deprecated mmap hook.

Link: https://lkml.kernel.org/r/95b28b066f37ca25f56fa9460a9367f1a866f88b.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Reinette Chatre <reinette.chatre@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:14 -08:00
Lorenzo Stoakes
ab04945f91 mm: update mem char driver to use mmap_prepare
Update the mem char driver (backing /dev/mem and /dev/zero) to use
f_op->mmap_prepare hook rather than the deprecated f_op->mmap.

The /dev/zero implementation has a very unique and rather concerning
characteristic in that it converts MAP_PRIVATE mmap() mappings anonymous
when they are, in fact, not.

The new f_op->mmap_prepare() can support this, but rather than introducing
a helper function to perform this hack (and risk introducing other users),
utilise the success hook to do so.

We utilise the newly introduced shmem_zero_setup_desc() to allow for the
shared mapping case via an f_op->mmap_prepare() hook.

We also use the desc->action_error_hook to filter the remap error to
-EAGAIN to keep behaviour consistent.

Link: https://lkml.kernel.org/r/48f60764d7a6901819d1af778fa33b775d2e8c77.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:13 -08:00
Lorenzo Stoakes
89646d9c74 mm: add shmem_zero_setup_desc()
Add the ability to set up a shared anonymous mapping based on a VMA
descriptor rather than a VMA.

This is a prerequisite for converting to the char mm driver to use the
mmap_prepare hook.

Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:13 -08:00
Lorenzo Stoakes
ea52cb24cd mm/hugetlbfs: update hugetlbfs to use mmap_prepare
Since we can now perform actions after the VMA is established via
mmap_prepare, use desc->action_success_hook to set up the hugetlb lock
once the VMA is setup.

We also make changes throughout hugetlbfs to make this possible.

Note that we must hide newly established hugetlb VMAs from the rmap until
the operation is entirely complete as we establish a hugetlb lock during
VMA setup that can be raced by rmap users.

Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:13 -08:00
Lorenzo Stoakes
da003453dc doc: update porting, vfs documentation for mmap_prepare actions
Now we have introduced the ability to specify that actions should be taken
after a VMA is established via the vm_area_desc->action field as specified
in mmap_prepare, update both the VFS documentation and the porting guide
to describe this.

Link: https://lkml.kernel.org/r/472ce3da7662ed1065cc299d14bffb70b1a845e7.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:13 -08:00
Lorenzo Stoakes
ac0a3fc9c0 mm: add ability to take further action in vm_area_desc
Some drivers/filesystems need to perform additional tasks after the VMA is
set up.  This is typically in the form of pre-population.

The forms of pre-population most likely to be performed are a PFN remap
or the insertion of normal folios and PFNs into a mixed map.

We start by implementing the PFN remap functionality, ensuring that we
perform the appropriate actions at the appropriate time: setting flags at
the point of .mmap_prepare, and performing the actual remap once the VMA
is fully established.

This prevents the driver from doing anything too crazy with a VMA at any
stage, and we retain complete control over how the mm functionality is
applied.

Unfortunately, callers often still require some kind of custom action, so
we add optional success/error hooks to allow the caller to do something
after the action has succeeded or failed.

This is done at the point when the VMA has already been established, so
the harm that can be done is limited.

The error hook can be used to filter errors if necessary.

There may be cases in which the caller absolutely must hold the file rmap
lock until the operation is entirely complete. It is an edge case, but
certainly the hugetlbfs mmap hook requires it.

To accommodate this, we add the hide_from_rmap_until_complete flag to the
mmap_action type. In this case, if a new VMA is allocated, we will hold the
file rmap lock until the operation is entirely completed (including any
success/error hooks).

Note that we do not need to update __compat_vma_mmap() to accommodate this
flag, as this function will be invoked from an .mmap handler whose VMA is
not yet visible, so we implicitly hide it from the rmap.

If any error arises on these final actions, we simply unmap the VMA
altogether.

Also update the stacked filesystem compatibility layer to utilise the
action behaviour, and update the VMA tests accordingly.

While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap()
as we are now performing actions invoked by the mmap_prepare in addition to
just the mmap_prepare hook.

Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:12 -08:00
Lorenzo Stoakes
db91b78329 mm: introduce io_remap_pfn_range_[prepare, complete]()
We introduce the io_remap*() equivalents of remap_pfn_range_prepare() and
remap_pfn_range_complete() to allow for I/O remapping via mmap_prepare.

Make these internal to mm, as they should only be used by internal helpers.

Link: https://lkml.kernel.org/r/4065134f13a24a3e14691b7443bcee7490b18a5c.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:12 -08:00
Lorenzo Stoakes
c707a68f94 mm: abstract io_remap_pfn_range() based on PFN
The only instances in which we customise this function are ones in which we
customise the PFN used.

Instances where architectures were not passing the pgprot value through
pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so
we can simply always pass pgprot through this function.

Use this fact to simplify the use of io_remap_pfn_range(), by abstracting
the PFN via io_remap_pfn_range_pfn() and using this instead of providing a
general io_remap_pfn_range() function per-architecture.
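
As an illustration, a minimal sketch of the shape this takes; the
io_remap_pfn_range_pfn() fallback and exact signatures are assumptions
inferred from the description above, not the actual header:

    /* Architectures only override how the PFN is derived. */
    #ifndef io_remap_pfn_range_pfn
    static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
                                                       unsigned long size)
    {
            return pfn;
    }
    #endif

    static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                    unsigned long addr, unsigned long pfn,
                    unsigned long size, pgprot_t prot)
    {
            /* pgprot_decrypted() is always applied, per the text above. */
            return remap_pfn_range(vma, addr,
                                   io_remap_pfn_range_pfn(pfn, size),
                                   size, pgprot_decrypted(prot));
    }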

Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:12 -08:00
Lorenzo Stoakes
51e38e7d40 mm: add remap_pfn_range_prepare(), remap_pfn_range_complete()
We need the ability to split PFN remap between updating the VMA and
performing the actual remap, in order to do away with the legacy f_op->mmap
hook.

To do so, update the PFN remap code to provide shared logic, and also make
remap_pfn_range_notrack() static, as its one user, io_mapping_map_user()
was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c").

Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor
and PFN parameters, and remap_pfn_range_complete() which accepts the same
parameters as remap_pfn_range().

remap_pfn_range_prepare() will set the CoW vma->vm_pgoff if necessary, so
it must be supplied with a correct PFN to do so.

While we're here, also clean up the duplicated #ifdef
__HAVE_PFNMAP_TRACKING check and put it into a single #ifdef/#else block.

We keep these internal to mm as they should only be used by internal
helpers.
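
For reference, a hedged sketch of the resulting split; the prototypes are
inferred from the description above and may differ from the mm-internal
declarations:

    /* Stage 1: runs at mmap_prepare time, may set the CoW vm_pgoff. */
    void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn);

    /* Stage 2: the actual remap; same parameters as remap_pfn_range(). */
    int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
                                 unsigned long pfn, unsigned long size,
                                 pgprot_t pgprot);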

Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:12 -08:00
Lorenzo Stoakes
2bcd9207de mm/vma: rename __mmap_prepare() function to avoid confusion
Now that we have the f_op->mmap_prepare() hook, having a static function called
__mmap_prepare() that has nothing to do with it is confusing, so rename
the function to __mmap_setup().

Link: https://lkml.kernel.org/r/d25a22c60ca0f04091697ef9cda0d72ce0cf8af3.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:11 -08:00
Lorenzo Stoakes
651fdda840 relay: update relay to use mmap_prepare
It is relatively trivial to update this code to use the f_op->mmap_prepare
hook in favour of the deprecated f_op->mmap hook, so do so.

Link: https://lkml.kernel.org/r/7c9e82cdddf8b573ea3edb8cdb697363e3ccb5d7.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:11 -08:00
Lorenzo Stoakes
54c58a2f5f mm: add vma_desc_size(), vma_desc_pages() helpers
It's useful to be able to determine the size of a VMA descriptor range
used on f_op->mmap_prepare, expressed both in bytes and pages, so add
helpers for both and update code that could make use of them to do so.
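
A minimal sketch of what such helpers look like, assuming the vm_area_desc
start/end fields describe the mapped range in bytes (taken on trust from
the mmap_prepare work, not verified here):

    static inline unsigned long vma_desc_size(const struct vm_area_desc *desc)
    {
            return desc->end - desc->start;
    }

    static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc)
    {
            return vma_desc_size(desc) >> PAGE_SHIFT;
    }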

Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:11 -08:00
Lorenzo Stoakes
cf1d98f44d mm/vma: remove unused function, make internal functions static
unlink_file_vma() is not used by anything, so remove it.

vma_link() and vma_link_file() are only used within mm/vma.c, so make them
static.

Link: https://lkml.kernel.org/r/f2ab9ea051225a02e6d1d45a7608f4e149220117.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pedro Falcato <pfalcato@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:10 -08:00
Lorenzo Stoakes
8e18a7f435 device/dax: update devdax to use mmap_prepare
The devdax driver does nothing special in its f_op->mmap hook, so
straightforwardly update it to use the mmap_prepare hook instead.

Link: https://lkml.kernel.org/r/1e8665d052ac8cf2f7ff92b6c7862614f7fd306c.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Acked-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:10 -08:00
Lorenzo Stoakes
ab3c8e7b86 mm/shmem: update shmem to use mmap_prepare
Patch series "expand mmap_prepare functionality, port more users", v5.

Since commit c84bf6dd2b83 ("mm: introduce new .mmap_prepare() file
callback"), The f_op->mmap hook has been deprecated in favour of
f_op->mmap_prepare.

This was introduced in order to make it possible for us to eventually
eliminate the f_op->mmap hook which is highly problematic as it allows
drivers and filesystems raw access to a VMA which is not yet correctly
initialised.

This hook also introduced complexity for the memory mapping operation, as
we must correctly unwind what we do should an error arise.

Overall, this interface being so open has caused significant problems for
us, including security issues, so it is important that we simply eliminate
it as a source of problems.

Therefore this series continues what was established by extending the
functionality further to permit more drivers and filesystems to use
mmap_prepare.

We start by updating some existing users who can use the mmap_prepare
functionality as-is.

We then introduce the concept of an mmap 'action', which a user, on
mmap_prepare, can request to be performed upon the VMA:

* Nothing - default, we're done
* Remap PFN - perform PFN remap with specified parameters
* I/O remap PFN - perform I/O PFN remap with specified parameters

By setting the action in mmap_prepare, this allows us to dynamically
decide what to do next, so if a driver/filesystem needs to determine
whether to e.g.  remap or use a mixed map, it can do so and then change which
is done.

This significantly expands the capabilities of the mmap_prepare hook,
while maintaining as much control as possible in the mm logic.

We split the [io_]remap_pfn_range*() functions, which perform PFN remap (a
typical mapping prepopulation operation), into prepare/complete steps, and
add io_remap_pfn_range_prepare()/complete() for a similar purpose.

From there we update various mm-adjacent logic to use this functionality
as a first set of changes.

We also add success and error hooks for post-action processing, e.g.
outputting a debug log on success and filtering error codes.


This patch (of 15):

This hook simply assigns the vm_ops, so it is easily updated - do so.
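
A hedged sketch of this kind of trivial conversion, using placeholder names
(my_file_vm_ops and the my_* hooks are illustrative, not the shmem symbols):

    /* Before: the deprecated hook receives a live VMA. */
    static int my_mmap(struct file *file, struct vm_area_struct *vma)
    {
            vma->vm_ops = &my_file_vm_ops;
            return 0;
    }

    /* After: the hook only fills in the descriptor, no VMA access. */
    static int my_mmap_prepare(struct vm_area_desc *desc)
    {
            desc->vm_ops = &my_file_vm_ops;
            return 0;
    }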

Link: https://lkml.kernel.org/r/cover.1760959441.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/7b93b1e89028e39507dac5ca01991e1374d5bbe8.1760959442.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chatre, Reinette <reinette.chatre@intel.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Dave Martin <dave.martin@arm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guo Ren <guoren@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Morse <james.morse@arm.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:10 -08:00
Quanmin Yan
dfc02531f4 mm/damon/reclaim: use min_sz_region for core address alignment when setting regions
When setting regions in DAMON_RECLAIM, DAMON_MIN_REGION will be applied as
the core address alignment, and the monitoring target address ranges would
be aligned on DAMON_MIN_REGION * addr_unit.  When users 1) set addr_unit
to a value larger than 1, and 2) set the monitoring target address range
as not aligned on DAMON_MIN_REGION * addr_unit, it will cause
DAMON_RECLAIM to operate on unexpectedly large physical address ranges.

For example, if the user sets the monitoring target address range to [4,
8) and addr_unit as 1024, the aimed monitoring target address range is [4
KiB, 8 KiB).  Assuming DAMON_MIN_REGION is 4096, the resulting target
address range will be [0, 4096) in the DAMON core layer address system,
and [0, 4 MiB) in the physical address space, which is an unexpected
range.

To fix the issue, use min_sz_region for core address alignment when
setting regions.

Link: https://lkml.kernel.org/r/20251020130125.2875164-3-yanquanmin1@huawei.com
Fixes: 7db551fcfb2a ("mm/damon/reclaim: support addr_unit for DAMON_RECLAIM")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:10 -08:00
Quanmin Yan
e859a224fa mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default()
Patch series "mm/damon: fixes for address alignment issues in
DAMON_LRU_SORT and DAMON_RECLAIM", v2.

In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply
DAMON_MIN_REGION as the core address alignment, and the monitoring target
address ranges would be aligned on DAMON_MIN_REGION * addr_unit.  When
users 1) set addr_unit to a value larger than 1, and 2) set the monitoring
target address range as not aligned on DAMON_MIN_REGION * addr_unit, it
will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly
large physical address ranges.

For example, if the user sets the monitoring target address range to [4,
8) and addr_unit as 1024, the aimed monitoring target address range is [4
KiB, 8 KiB).  Assuming DAMON_MIN_REGION is 4096, the resulting target
address range will be [0, 4096) in the DAMON core layer address system,
and [0, 4 MiB) in the physical address space, which is an unexpected
range.

To fix the issue, add a min_sz_region parameter to
damon_set_region_biggest_system_ram_default() and use it when calling
damon_set_regions(), replacing the direct use of DAMON_MIN_REGION.


This patch (of 2):

In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the
core address alignment, and the monitoring target address ranges would be
aligned on DAMON_MIN_REGION * addr_unit.  When users 1) set addr_unit to a
value larger than 1, and 2) set the monitoring target address range as not
aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to
operate on unexpectedly large physical address ranges.

For example, if the user sets the monitoring target address range to [4,
8) and addr_unit as 1024, the aimed monitoring target address range is [4
KiB, 8 KiB).  Assuming DAMON_MIN_REGION is 4096, the resulting target
address range will be [0, 4096) in the DAMON core layer address system,
and [0, 4 MiB) in the physical address space, which is an unexpected
range.

To fix the issue, add a min_sz_region parameter to
damon_set_region_biggest_system_ram_default() and use it when calling
damon_set_regions(), replacing the direct use of DAMON_MIN_REGION.
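
For reference, a hedged sketch of the updated helper prototype implied by
the description above (the exact parameter list is not verified here):

    int damon_set_region_biggest_system_ram_default(struct damon_target *t,
                                                    unsigned long *start,
                                                    unsigned long *end,
                                                    unsigned long min_sz_region);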

Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com
Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com
Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:09 -08:00
Lance Yang
074f027d15 mm/khugepaged: guard is_zero_pfn() calls with pte_present()
A non-present entry, like a swap PTE, contains completely different data
(swap type and offset).  pte_pfn() doesn't know this, so if we feed it a
non-present entry, it will spit out a junk PFN.

What if that junk PFN happens to match the zeropage's PFN by sheer chance?
While really unlikely, this would be really bad if it did.

So, let's fix this potential bug by ensuring all calls to is_zero_pfn() in
khugepaged.c are properly guarded by a pte_present() check.
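
The pattern being enforced, as a hedged sketch (not the exact khugepaged
hunk):

    pte_t pteval = ptep_get(pte);

    /* Only derive a PFN once we know the entry is present. */
    if (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval))) {
            /* handle the shared zeropage case */
    }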

Link: https://lkml.kernel.org/r/20251020151111.53561-1-lance.yang@linux.dev
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Suggested-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Nico Pache <npache@redhat.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:09 -08:00
SeongJae Park
40d923acfa Docs/ABI/damon: document DAMOS quota goal path file
A DAMON sysfs interface file for DAMOS quota goal's optional path argument
has been added.  Document it on the ABI doc.

Link: https://lkml.kernel.org/r/20251017212706.183502-11-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:09 -08:00
SeongJae Park
87b8351580 Docs/admin-guide/mm/damon/usage: document DAMOS quota goal path file
A new DAMON sysfs interface file, namely 'path' has been added under DAMOS
quota goal directory, for specifying the cgroup for
DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP metrics.  Document it on the usage
document.

Link: https://lkml.kernel.org/r/20251017212706.183502-10-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:09 -08:00
SeongJae Park
4cc00d41c6 Docs/mm/damon/design: document DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP
Update design doc for the newly added two DAMOS quota auto-tuning target
goal metrics, DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP.

Link: https://lkml.kernel.org/r/20251017212706.183502-9-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:08 -08:00
SeongJae Park
c2fbf2da4c mm/damon/sysfs-schemes: support DAMOS_QUOTA_NODE_MEMCG_FREE_BP
Extend DAMON sysfs to support DAMOS_QUOTA_NODE_MEMCG_FREE_BP.

Link: https://lkml.kernel.org/r/20251017212706.183502-8-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:08 -08:00
SeongJae Park
98fdce76fb mm/damon/core: add DAMOS quota goal metric for per-memcg per-numa free memory
Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory
portion.  The value of the metric is implemented as the entire memory of
the given NUMA node minus the given cgroup's usage.  So from one
perspective, "unused" could be a better term than "free".  But arguably it
is not very clear which is better, so use the term "free".

Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:08 -08:00
SeongJae Park
a1d1df78ac mm/damon/sysfs-schemes: support DAMOS_QUOTA_NODE_MEMCG_USED_BP
Add support of DAMOS_QUOTA_NODE_MEMCG_USED_BP.  For this, extend quota
goal metric inputs for the new metric, and update DAMOS core layer request
construction logic to set the target cgroup, which is specified by the
user, via the 'path' file.

Link: https://lkml.kernel.org/r/20251017212706.183502-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:08 -08:00
SeongJae Park
c41e253a41 mm/damon/sysfs-schemes: implement path file under quota goal directory
Add a DAMOS sysfs file for specifying the cgroup of interest for
DAMOS_QUOTA_NODE_MEMCG_USED_BP.

Link: https://lkml.kernel.org/r/20251017212706.183502-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:07 -08:00
SeongJae Park
b74a120bcf mm/damon/core: implement DAMOS_QUOTA_NODE_MEMCG_USED_BP
Implement the handling of the new DAMOS quota goal metric for per-memcg
per-node memory usage, namely DAMOS_QUOTA_NODE_MEMCG_USED_BP.  The metric
value is calculated as the sum of active/inactive anon/file pages of the
given cgroup for a given NUMA node.
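
A simplified sketch of the described calculation (illustrative only; the
helper name is hypothetical and the actual DAMON core code may differ):

    static unsigned long memcg_node_used_pages(struct mem_cgroup *memcg, int nid)
    {
            struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));

            /* Sum of active/inactive anon/file pages on the given node. */
            return lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
                   lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
                   lruvec_page_state(lruvec, NR_ACTIVE_FILE) +
                   lruvec_page_state(lruvec, NR_INACTIVE_FILE);
    }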

Link: https://lkml.kernel.org/r/20251017212706.183502-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:07 -08:00
SeongJae Park
6a18bbe483 mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage
Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node
memory usage.  For specifying the cgroup of interest, add a field,
namely memcg_id, to the damos_quota_goal struct.

Note that this commit is only implementing the interface.  The handling of
the interface (the metric value calculation) will be implemented in the
following commit.

Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:07 -08:00
SeongJae Park
d3946c5f4c mm/damon: document damos_quota_goal->nid use case
Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node
memory usage".

Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup
per-NUMA node memory utilization.  Expected use cases are cgroup level
access-aware NUMA memory management, such as memory tiering or proactive
reclamation on cgroup-based multi-tenant NUMA systems.

Background
==========

The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly
recommended way for modern DAMOS use cases.  Using it, users can specify
what system status they want to achieve with what access-aware system
operations.  For example, reclaim cold memory aiming for 0.5 percent of
memory pressure (proactive reclaim), or migrate hot and cold memory
between NUMA nodes having different speed (memory tiering).  Then DAMOS
automatically adjusts the aggressiveness of the system operation (e.g.,
increase/decrease reclaim target coldness threshold) based on current
status of the system.

The use case is limited by the supported system status metrics for
specifying the target system status.  Two new system metrics for per-node
memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were
recently added to extend the use cases for access-aware NUMA nodes
management, such as memory tiering.  Those are expected to be useful for
not only memory tiering but also general access-aware inter-NUMA node page
migration, though.

Limitation
----------

The per-node memory usage based auto-tuning can be applied only
system-wide.  For cgroups-based multi-tenant systems, it could arguably
harm the fairness.  For example, a cgroup may use faster NUMA node memory
more than another cgroup, depending on their access patterns.  If the users
of each cgroup are promised to get the same quality and amount of the system
resource, this can arguably be an unfair situation.

DAMOS supports cgroup level system operations via DAMOS filter.  But the
quota auto-tuning system is not aware of cgroups.

New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage
===================================================================

To overcome the limitation, introduce two new DAMOS quota auto-tuning goal
metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP.  Those can be
thought of as variants of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that are
extended for cgroups.

The two metrics specify the per-cgroup, per-node amount of used and unused
memory in ratio to the total memory of the node.  For example, let's
assume a system has two NUMA nodes of size 100 GiB and 50 GiB.  And two
cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node
1, respectively, as illustrated by the below table.

                     node-0    node-1
    Total memory     100 GiB   50 GiB
    Cgroup A usage   40 GiB    20 GiB
    Cgroup B usage   60 GiB    10 GiB

Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node
are, 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000
bp (60 percent), respectively.  Those for the second node are, 20 GiB / 50
GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent),
respectively.

DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are, 60 GiB /100 GiB =
6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB
/ 50 GiB = 8000 bp, respectively.

    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp

    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp
    DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp

Using these, users can specify how much [un]used memory per cgroup and
per node DAMOS should aim for as a result of the auto-tuning.

Example Usecase: Cgroup Level Memory Tiering
============================================

Let's suppose a typical and simple tiered memory system.  The system
equips two NUMA nodes.  The first node (node 0) is CPU-attached and fast. 
The second node (node 1) is CPU-unattached and slow.  It runs two cgroups
that desire to use about 30 percent and 70 percent of the faster node as
much as possible for their hot data, respectively.  Then, the user can
implement DAMOS-based memory tiering for the system using the DAMON
user-space tool (damo), like below.

    # ./damo start \
    	`# kdamond for node 1 (slow)` \
        --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \
	    `# promotion scheme for cgroup a` \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/a \
            --damos_filter allow young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \
	    \
	    `# promotion scheme for cgroup b` \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/b \
            --damos_filter allow young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_used_bp 69.7% 0 /workloads/b \
	    \
    	`# kdamond for node 0 (fast)` \
        --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \
            `# demotion scheme for cgroup a` \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/a \
            --damos_filter reject young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_free_bp 70.5% 0 \
	    \
            `# demotion scheme for cgroup b` \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
            --damos_apply_interval 1s \
	    --damos_filter allow memcg /workloads/b \
            --damos_filter reject young \
            --damos_quota_interval 1s --damos_quota_space 200MB \
            --damos_quota_goal node_memcg_free_bp 30.5% 0 \
	    \
            --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 1 1 1 1 \
        --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1

With the command, the user-space tool will ask DAMON to spawn two kernel
threads, each for monitoring accesses to node 1 (slow) and node 0 (fast),
respectively.  It installs two DAMOS schemes on each thread.  Let's call
them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup
a/b" in the order.  The promotion schemes are installed on the DAMON
thread for node 1 (slow), and demotion schemes are installed on the DAMON
thread for node 0 (fast).

Cgroup Level Hot Pages Migration (Promotion)
--------------------------------------------

Promotion schemes will find memory regions on node 1 (slow) in which some
access was detected.  The schemes will then migrate the found memory to
node 0 (fast), hottest pages first.

For accurate and effective migration, these schemes use two page level
filters.  First, the migration will be filtered for only cgroup A and
cgroup B.  That is, "promotion scheme for cgroup B" will not do the
migration if the page is for cgroup A.  Secondly, the schemes will ignore
pages that having their page table's Accessed bits unset.  The per-page
Accessed bit check logic will also unset the bit if it was set, for the
next check.

For controlled amounts of system resource consumption and aiming at the
target memory usage, the schemes use a quota setup.  The migration is
limited to be done only up to 200 MiB per second, to limit the peak system
resource usage.  And DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for
29.7% and 69.7% of node 0 (fast), respectively.  The target value is lower
than the high level goal (30% and 70% system memory), to give headroom on
node 0 (fast).  DAMOS will adjust the speed of the pages migration based
on the target and current per-cgroup node 0 memory usage.  For example, if
cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more
of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second.  If
cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages
migration from node 1 to node 0 will be slowed and eventually stopped.

Cgroup Level Cold Pages Migration (Demotion)
--------------------------------------------

Demotion schemes are similar to promotion schemes, but differ in filtering
setup and quota tuning setup.  Those filter out pages having their page
table Accessed bits set.  And set 70.5% and 30.5% of node 0 memory free
rate for the cgroup A and B, respectively.  Hence, if promotion schemes or
something made cgroup A and/or B uses more than 29.5% and 69.5% of node 0,
demotion schemes will start migrating cold pages of appropriate cgroups in
node 0 to node 1, under the 200 MiB per second speed cap, while adjusting
the speed based on how much more memory than wanted is being used.

The quota target values are set to overlap with promotion targets, to keep
a minimum level of page exchanges between the nodes.  This is to avoid a
case that the target memory utilization is met, and then access pattern
changes (pages in node 1 become hotter than pages in node 0) while the
memory utilization is unchanged.  Without the overlap, neither promotion
of hotter pages in node 1, nor demotion of colder pages in node 0 will
happen since both goals are met.  As a result, the faster and slower node
will unexpectedly serve cold and hot data.

Test: Per-cgroup Memory Tiering
===============================

I ran a simplified cgroup level memory tiering using the feature, and
confirmed it works as intended.

Setup
-----

I configured a QEMU virtual machine representing a simplified version of
the system that is described in the above cgroup level memory tiering example
use case.  The system has 40 CPU cores and two NUMA nodes, each having
30 GiB physical memory.  The first node (node 0) represents the faster
NUMA node, and the second node (node 1) represents the slower NUMA node. 
Specifically, the below qemu command line options are used.

    [...]
    -object memory-backend-ram,size=30G,id=m0 \
    -object memory-backend-ram,size=30G,id=m1 \
    -numa node,cpus=0-39,memdev=m0 \
    -numa node,memdev=m1 \
    [...]

I booted the virtual machine with a kernel that this patch series is
applied.  On the virtual machine, I created two cgroups, namely workload_a
and workload_b.  And ran a test program in each cgroup, resulting in one
process per cgroup.  The test program allocates 10 GiB of memory and evenly
splits it into 10 regions.  After the allocation, it repeatedly accesses the
first region for one minute, then the second one for one minute, and so
on.  After the one minute of repeated access to the 10-th region is done, it
repeats the access from the first region.  So the process has 10 GiB of
data in total, but only 1 GiB of it is hot at a given moment, and the hot
data is gradually changed.

While the processes are running, run DAMON for a simple access-aware
memory tiering using the below script.  It migrates hot and cold data of the
cgroups into node 0 and node 1, aiming for the first and the second cgroups
(workload_a and workload_b, respectively) to utilize about 9.7 percent and
19.7 percent of node 0, respectively.

Note that this setup is a simplified version of the above example use
case, for ease of testing.  Also note that we assigned 30 GiB of physical
memory to node 0, but DAMON in this setup works for only 27 GiB of the
memory.  This is due to an internal implementation detail of the DAMON
user-space tool that is not really important for this test.

    #!/bin/bash
    damo start \
        --numa_node 1 \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_a \
                --damos_filter allow young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \
            --damos_action migrate_hot 0 --damos_access_rate 5% max \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_b \
                --damos_filter allow young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \
        --numa_node 0 \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_a \
                --damos_filter reject young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \
            --damos_action migrate_cold 1 --damos_access_rate 0% 0% \
                --damos_apply_interval 1s \
                --damos_filter allow memcg /workload_b \
                --damos_filter reject young \
                --damos_quota_interval 1s \
                --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \
                --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \
        --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1

After starting DAMON, the pages are continuously migrated across nodes.  A
few minutes later, the memory usage of the cgroups converges to the
aimed amounts, and keeps that level, as expected.  To confirm the status is
kept at the target level as expected, I collected the memory usage stats of
the cgroups using the memory.numa_stat file, after the stats converged.  I
repeated the stat collection 42 times with a 5 second delay between each of
the collections.  The results are as below:

    node0_memory_usage  average  stdev
    workload_a          2.79GiB  522.06MiB
    workload_b          5.15GiB  739.10MiB

The average values are quite close to the targeted values: 27 GiB * 9.7% =
2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB.  A level of
variance is expected, given the overlap of the promotion/demotion
targets, and the dynamic data access pattern of the workloads.  Given that, the
measured variances are at a reasonable level.

Patches Sequence
================

The first patch (patch 1) updates the kernel-doc comment of
damos_quota_goal struct to clarify usage of optional fields of the struct,
since later patches will add such optional fields.

The following four patches (patches 2-5) implement a new DAMOS quota goal
metric for per-cgroup per-node memory usage.  Those extend the core layer
interface for the new metric (patch 2), implement the metric value
calculation on the core layer (patch 3), add DAMON sysfs interface file
for the target cgroup specification (patch 4), and implement support of
the new metric on DAMON sysfs interface (patch 5).

The next two patches implement the second new DAMOS quota goal metric for
per-cgroup per-node free (or, unused) memory.  Those implement it in the
core layer (patch 6) and DAMON sysfs interface (patch 7), extending the
existing implementation for memory usage metric.

The final three patches update the design (patch 8), the usage (patch 9), and
the ABI (patch 10) documents for the changes that are introduced by this
patch series.


This patch (of 10):

The damos_quota_goal kerneldoc comment does not explain when @metric is used.
Update the comment for that.

Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:07 -08:00
Baolin Wang
2f05435df9 mm: vmscan: simplify the logic for activating dirty file folios
After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer
attempt to write back filesystem folios through reclaim.

However, in the shrink_folio_list() function, there still remains some
logic related to writeback control of dirty file folios.  The original
logic was that, for direct reclaim, or when folio_test_reclaim() is false,
or the PGDAT_DIRTY flag is not set, the dirty file folios would be
directly activated to avoid being scanned again; otherwise, it will try to
writeback the dirty file folios.  However, since we can no longer perform
writeback on dirty folios, the dirty file folios will still be activated.

Additionally, under the original logic, if we continue to try writeback
dirty file folios, we will also check the references flag,
sc->may_writepage, and may_enter_fs(), which may result in dirty file
folios being left in the inactive list.  This is unreasonable.  Even if
these dirty folios are scanned again, we still cannot clean them.

Therefore, the checks on these dirty file folios appear to be redundant
and can be removed.  Dirty file folios should be directly moved to the
active list to avoid being scanned again.  Since we set the PG_reclaim
flag for the dirty folios, once the writeback is completed, they will be
moved back to the tail of the inactive list to be retried for quick
reclaim.

Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:06 -08:00
Baolin Wang
b34619af9c mm: vmscan: filter out the dirty file folios for node_reclaim()
Patch series "optimize the logic for handling dirty file folios during
reclaim", v2.

Since we no longer attempt to write back filesystem folios during reclaim,
some logic for handling dirty file folios in the reclaim process also
needs to be updated.  Please check the details in each patch.  


This patch (of 2):

After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer
attempt to write back filesystem folios in pageout(), and only tmpfs/shmem
folios and anonymous swapcache folios can be written back.  Therefore, we
should also filter out the dirty filesystem folios for node_reclaim() to
avoid unnecessary LRU scans.

Link: https://lkml.kernel.org/r/cover.1760687075.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/c91f5ecc5152b647904c7503618a01885d913928.1760687075.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:06 -08:00
Ye Liu
5bf65d4a8d tools/mm/page_owner_sort: add help option support
Add -h/--help option to display usage information and improve code style.

Link: https://lkml.kernel.org/r/20251016054927.138510-1-ye.liu@linux.dev
Signed-off-by: Ye Liu <liuye@kylinos.cn>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:06 -08:00
Shakeel Butt
d929525c2e memcg: net: track network throttling due to memcg memory pressure
The kernel can throttle network sockets if the memory cgroup associated
with the corresponding socket is under memory pressure.  The throttling
actions include clamping the transmit window, failing to expand receive or
send buffers, aggressively prune out-of-order receive queue, FIN deferred
to a retransmitted packet and more.  Let's add memcg metric to track such
throttling actions.

At the moment memcg memory pressure is defined through vmpressure and in
future it may be defined using PSI, or we may add a more flexible way for the
users to define memory pressure, maybe through ebpf.  However the
potential throttling actions will remain the same, so this newly
introduced metric will continue to track throttling actions irrespective
of how memcg memory pressure is defined.

Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Daniel Sedlak <daniel.sedlak@cdn77.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:06 -08:00
wang lian
a059ad48b4 mm/khugepaged: fix comment for default scan sleep duration
The comment for khugepaged_scan_sleep_millisecs incorrectly states the
default scan period is 30 seconds.  The actual default value in the code
is 10000ms (10 seconds).

This patch corrects the comment to match the code, preventing potential
confusion.  The incorrect comment has existed since the feature was first
introduced.  While at it, replace the magic value 512 by HPAGE_PMD_NR and
use 'ptes'.

Link: https://lkml.kernel.org/r/20251015092957.37432-1-lianux.mm@gmail.com
Signed-off-by: wang lian <lianux.mm@gmail.com>
Suggested-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Acked-by: Nico Pache <npache@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:05 -08:00
Ye Liu
2f79ddb64b tools/mm: use <stdbool.h> in page_owner_sort.c
Use standard <stdbool.h> instead of manually defining bool, true and false.

Link: https://lkml.kernel.org/r/20251015093851.109663-1-ye.liu@linux.dev
Signed-off-by: Ye Liu <liuye@kylinos.cn>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:05 -08:00
Vlastimil Babka
0f21b91101 mm/page_alloc: simplify and cleanup pcp locking
The pcp locking relies on pcp_spin_trylock() which has to be used together
with pcp_trylock_prepare()/pcp_trylock_finish() to work properly on !SMP
!RT configs.  This is tedious and error-prone.

We can remove pcp_spin_lock() and underlying pcpu_spin_lock() because we
don't use it.  Afterwards pcp_spin_unlock() is only used together with
pcp_spin_trylock().  Therefore we can add the UP_flags parameter to them
both and handle pcp_trylock_prepare()/finish() within.

Additionally, for the configs where pcp_trylock_prepare()/finish() are
no-ops (SMP || RT), make them pass &UP_flags to a no-op inline function.
This ensures typechecking and makes the local variable "used" so we can
remove the __maybe_unused attributes.
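
The trick, as a minimal generic sketch (names and exact macro bodies are
illustrative; the real definitions live in mm/page_alloc.c and may differ):

    #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
    /* Taking the pointer type-checks UP_flags and marks it used. */
    static inline void pcp_trylock_noop(unsigned long *flags) { }
    #define pcp_trylock_prepare(flags)  pcp_trylock_noop(&(flags))
    #define pcp_trylock_finish(flags)   pcp_trylock_noop(&(flags))
    #else
    /* On !SMP !RT the trylock needs IRQs disabled to be safe. */
    #define pcp_trylock_prepare(flags)  local_irq_save(flags)
    #define pcp_trylock_finish(flags)   local_irq_restore(flags)
    #endif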

In my compile testing, bloat-o-meter reported no change on SMP config, so
the compiler is capable of optimizing away the no-ops same as before, and
we have simplified the code using pcp_spin_trylock().

Link: https://lkml.kernel.org/r/20251015-b4-pcp-lock-cleanup-v2-1-740d999595d5@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:05 -08:00
Joshua Hahn
91e6912966 mm/page_alloc: batch page freeing in free_frozen_page_commit
Before returning, free_frozen_page_commit calls free_pcppages_bulk using
nr_pcp_free to determine how many pages can appropriately be freed, based
on the tunable parameters stored in pcp.  While this number is an accurate
representation of how many pages should be freed in total, it is not an
appropriate number of pages to free at once using free_pcppages_bulk,
since we have seen the value consistently go above 2000 in the Meta fleet
on larger machines.

As such, perform the page freeing in batches of pcp->batch when calling
free_pcppages_bulk.  In order to ensure that other processes are not starved
of the zone lock, release both the zone lock and the pcp lock between batches
to yield to other threads.

Note that because free_frozen_page_commit now does a spin trylock inside
the function (and it can fail), the function may now return with the pcp
unlocked.  To handle this, return true if the pcp is locked on exit and
false otherwise.

In addition, since free_frozen_page_commit must now be aware of what UP
flags were stored at the time of the spin lock, and because we must be
able to report new UP flags to the callers, add a new unsigned long*
parameter UP_flags to keep track of this.
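
A heavily simplified sketch of that batching loop, with the lock helpers and
signatures assumed from the descriptions in this series rather than taken
from the patch:

	/* Sketch: free 'count' pages in pcp->batch sized chunks, dropping the
	 * pcp lock (and hence the zone lock) between chunks so other threads
	 * can make progress.  Re-locking can fail, hence the bool return.
	 */
	static bool example_batched_free(struct zone *zone, struct per_cpu_pages *pcp,
					 int count, int pindex, unsigned long *UP_flags)
	{
		while (count > 0) {
			int todo = min(count, READ_ONCE(pcp->batch));

			free_pcppages_bulk(zone, todo, pcp, pindex);
			count -= todo;

			if (count) {
				pcp_spin_unlock(pcp, *UP_flags);
				pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
				if (!pcp)
					return false;	/* pcp not locked on return */
			}
		}
		return true;		/* pcp still locked */
	}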

The following are a few synthetic benchmarks, made on three machines.  The
first is a large machine with 754GiB memory and 316 processors.  The
second is a relatively smaller machine with 251GiB memory and 176
processors.  The third and final is the smallest of the three, which has
62GiB memory and 36 processors.

On all machines, I kick off a kernel build with -j$(nproc).  Negative
delta is better (faster compilation)

Large machine (754GiB memory, 316 processors)
make -j$(nproc)
+------------+---------------+-----------+
| Metric (s) | Variation (%) | Delta(%)  |
+------------+---------------+-----------+
| real       |        0.8070 |  - 1.4865 |
| user       |        0.2823 |  + 0.4081 |
| sys        |        5.0267 |  -11.8737 |
+------------+---------------+-----------+

Medium machine (251GiB memory, 176 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.2806 |  +0.0351 |
| user       |        0.0994 |  +0.3170 |
| sys        |        0.6229 |  -0.6277 |
+------------+---------------+----------+

Small machine (62GiB memory, 36 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.1503 |  -2.6585 |
| user       |        0.0431 |  -2.2984 |
| sys        |        0.1870 |  -3.2013 |
+------------+---------------+----------+

Here, variation is the coefficient of variation, i.e.  standard deviation
/ mean.

[joshua.hahnjy@gmail.com: simplify checks]
  Link: https://lkml.kernel.org/r/20251014192827.851389-1-joshua.hahnjy@gmail.com
Link: https://lkml.kernel.org/r/20251014145011.3427205-4-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Suggested-by: Chris Mason <clm@fb.com>
Co-developed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:05 -08:00
Joshua Hahn
fc4b909c36 mm/page_alloc: batch page freeing in decay_pcp_high
It is possible for pcp->count - pcp->high to exceed pcp->batch by a lot. 
When this happens, we should perform batching to ensure that
free_pcppages_bulk isn't called with too many pages to free at once,
starving out other threads that need the pcp or zone lock.

Since we are still only freeing the difference between the initial
pcp->count and pcp->high values, there should be no change to how many
pages are freed.

Link: https://lkml.kernel.org/r/20251014145011.3427205-3-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Suggested-by: Chris Mason <clm@fb.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Co-developed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:04 -08:00
Joshua Hahn
0acc67c403 mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection
Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5.

Motivation & Approach
=====================

While testing workloads with high sustained memory pressure on large
machines in the Meta fleet (1TB memory, 316 CPUs), we saw an unexpectedly
high number of softlockups.  Further investigation showed that the zone
lock in free_pcppages_bulk was being held for a long time, and was called
to free 2k+ pages over 100 times just during boot.

This causes starvation in other processes for the zone lock, which can
lead to the system stalling as multiple threads cannot make progress
without the locks.  We can see these issues manifesting as warnings:

[ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU
[ 4512.604370] rcu:     20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426
[ 4512.626401] rcu:              hardirqs   softirqs   csw/system
[ 4512.638793] rcu:      number:        0        145            0
[ 4512.651177] rcu:     cputime:       30      10410          174   ==> 10558(ms)
[ 4512.666657] rcu:     (t=21077 jiffies g=783665 q=1242213 ncpus=316)

While these warnings don't indicate a crash or a kernel panic, they do
point to the underlying issue of lock contention.  To prevent starvation
on both locks, batch the freeing of pages using pcp->batch.

Because free_pcppages_bulk is called with the pcp lock and acquires the
zone lock, relinquishing and reacquiring the locks are only effective when
both of them are broken together (unless the system was built with queued
spinlocks).  Thus, instead of modifying free_pcppages_bulk to break both
locks, batch the freeing from its callers instead.

A similar fix has been implemented in the Meta fleet, and we have seen
significantly less softlockups.

Testing
=======
The following are a few synthetic benchmarks, made on three machines. The
first is a large machine with 754GiB memory and 316 processors.
The second is a relatively smaller machine with 251GiB memory and 176
processors. The third and final is the smallest of the three, which has 62GiB
memory and 36 processors.

On all machines, I kick off a kernel build with -j$(nproc).
Negative delta is better (faster compilation).

Large machine (754GiB memory, 316 processors)
make -j$(nproc)
+------------+---------------+-----------+
| Metric (s) | Variation (%) | Delta(%)  |
+------------+---------------+-----------+
| real       |        0.8070 |  - 1.4865 |
| user       |        0.2823 |  + 0.4081 |
| sys        |        5.0267 |  -11.8737 |
+------------+---------------+-----------+

Medium machine (251GiB memory, 176 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.2806 |  +0.0351 |
| user       |        0.0994 |  +0.3170 |
| sys        |        0.6229 |  -0.6277 |
+------------+---------------+----------+

Small machine (62GiB memory, 36 processors)
make -j$(nproc)
+------------+---------------+----------+
| Metric (s) | Variation (%) | Delta(%) |
+------------+---------------+----------+
| real       |        0.1503 |  -2.6585 |
| user       |        0.0431 |  -2.2984 |
| sys        |        0.1870 |  -3.2013 |
+------------+---------------+----------+

Here, variation is the coefficient of variation, i.e.  standard deviation
/ mean.

Based on these results, it seems that the degree to which this reduces lock
contention varies.  For the largest and smallest machines that I ran the
tests on, there is quite a significant reduction.  There are also some
performance increases visible from userspace.

Interestingly, the performance gains don't scale with the size of the
machine, but rather there seems to be a dip in the gain there is for the
medium-sized machine.  One possible theory is that because the high
watermark depends on both memory and the number of local CPUs, what
impacts zone contention the most is not these individual values, but
rather the ratio of mem:processors.


This patch (of 5):

Currently, refresh_cpu_vm_stats returns an int, indicating how many
changes were made during its updates.  Using this information, callers
like vmstat_update can heuristically determine if more work will be done
in the future.

However, all of refresh_cpu_vm_stats's callers either (a) ignore the
result, only caring about performing the updates, or (b) only care about
whether changes were made, but not *how many* changes were made.

Simplify the code by returning a bool instead to indicate if updates
were made.

In addition, simplify fold_diff and decay_pcp_high to return a bool
for the same reason.

Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com
Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Chris Mason <clm@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:04 -08:00
Kefeng Wang
f66e2727dd mm: huge_memory: use folio_can_map_prot_numa() for pmd folio
folio_can_map_prot_numa() checks whether a folio may be mapped prot numa,
skipping unsuitable folios, i.e.  zone device folios, shared folios (KSM,
CoW), non-movable DMA-pinned folios, dirty file folios and folios that
already have the expected node affinity.  Although the KSM check only
applies to small folios (so it is an extra test for large folios), the other
policies should also be applied to pmd folios, which helps to avoid
unnecessary pmd changes and folio migration attempts.

Link: https://lkml.kernel.org/r/20251023113737.3572790-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:04 -08:00
Kefeng Wang
ca43034cdb mm: mprotect: convert to folio_can_map_prot_numa()
The prot_numa_skip() naming is not ideal since the function also updates the
folio access time besides checking whether to skip prot NUMA, so rename it to
folio_can_map_prot_numa() and clean it up a bit: drop the 'ret' variable by
returning values directly instead of using the goto style.

Add a new helper vma_is_single_threaded_private() to check whether a VMA is
a single-threaded private VMA, and make folio_can_map_prot_numa() a
non-static function so that both can be reused in change_huge_pmd().  Since
folio_can_map_prot_numa() will be shared by different paths, move it near
change_prot_numa() in mempolicy.c.
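
A minimal sketch of the new helper, assuming it simply wraps the existing
single-threaded private mapping check from the prot numa path:

	/* Hypothetical sketch of the helper named above. */
	static bool vma_is_single_threaded_private(struct vm_area_struct *vma)
	{
		return !(vma->vm_flags & VM_SHARED) &&
		       atomic_read(&vma->vm_mm->mm_users) == 1;
	}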

Link: https://lkml.kernel.org/r/20251023113737.3572790-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:03 -08:00
Kefeng Wang
6e97624dac mm: mprotect: avoid unnecessary struct page accessing if pte_protnone()
If pte_protnone() is true, we can avoid unnecessary struct page
accesses and reduce the cache footprint when scanning page tables for prot
numa.  A similar change was made before, see commit a818f5363a0e
("autonuma: reduce cache footprint when scanning page tables").

Link: https://lkml.kernel.org/r/20251023113737.3572790-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:03 -08:00
Kefeng Wang
03aa8e4f27 mm: mprotect: always skip dma pinned folio in prot_numa_skip()
Patch series "mm: some optimizations for prot numa", v5.


This patch (of 4):

If the folio (even a non-CoW folio) is DMA pinned, it can't be migrated due
to the elevated reference count.  So always skip a pinned folio to avoid
wasting cycles on migration attempts.
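
In essence, the check becomes unconditional (a sketch, not the literal hunk):

	/* Skip any DMA-pinned folio up front, CoW mapping or not. */
	if (folio_maybe_dma_pinned(folio))
		return true;	/* migration would fail on the elevated refcount anyway */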

Link: https://lkml.kernel.org/r/20251023113737.3572790-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20251023113737.3572790-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:03 -08:00
Donet Tom
d945667dcb drivers/base/node: fold unregister_node() into unregister_one_node()
unregister_node() is only called from unregister_one_node().  This patch
folds unregister_node() into its only caller and renames
unregister_one_node() to unregister_node().

This reduces unnecessary indirection and simplifies the code structure. 
No functional changes are introduced.

[donettom@linux.ibm.com: remove extra spaces before @nid and "All"]
  Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com
Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Aboorva Devarajan <aboorvad@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:03 -08:00
Donet Tom
eb8762dc22 drivers/base/node: fold register_node() into register_one_node()
Patch series "drivers/base/node: fold node register and unregister
functions", v2.

The first patch merges register_one_node() and register_node(), leaving a
single register_node() function.

The second patch merges unregister_one_node() and unregister_node(),
leaving a single unregister_node() function.

There are no functional changes in these patches.


This patch (of 2):

register_node() is only called from register_one_node().  This patch folds
register_node() into its only caller and renames register_one_node() to
register_node().

This reduces unnecessary indirection and simplifies the code structure. 
No functional changes are introduced.

[akpm@linux-foundation.org: fix kerneldoc, per David]
Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Aboorva Devarajan <aboorvad@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:02 -08:00
Huacai Chen
900fcf00e1 mm: remove the BOUNCE config option
Commit eeadd68e2a5f ("block: remove bounce buffering support") removed
block/bounce.c but left the BOUNCE config option.  Now this option has no
users, so remove it.

Link: https://lkml.kernel.org/r/20251013095620.1111061-1-chenhuacai@loongson.cn
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Cc: John Garry <john.g.garry@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:02 -08:00
Uladzislau Rezki (Sony)
8cb290dd4b vmalloc: update __vmalloc_node_noprof() documentation
The kernel-doc for __vmalloc_node_noprof() incorrectly states that
__GFP_NOFAIL reclaim modifier is not supported.  In fact it has been
supported since commit 9376130c390a ("mm/vmalloc: add support for
__GFP_NOFAIL").

To avoid duplication and future drift, point this helper's doc to
__vmalloc_node_range_noprof() for details and the full description.

Link: https://lkml.kernel.org/r/20251013174222.90123-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:02 -08:00
Yadong Qi
a739e6b557 mm: vmalloc: WARN_ON if mapping size is not PAGE_SIZE aligned
In mm/vmalloc.c, the function vmap_pte_range() assumes that the mapping
size is aligned to PAGE_SIZE.  If this assumption is violated, the loop
will become infinite because the termination condition (`addr != end`)
will never be met.  This can lead to overwriting other VA ranges and/or
random pages that physically follow the page table.

It's the caller's responsibility to ensure that the mapping size is
aligned to PAGE_SIZE.  However, the memory corruption is hard to root
cause.  To make it easier to identify the programming error in the caller, check
whether the mapping size is PAGE_SIZE aligned with WARN_ON_ONCE().
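
A sketch of the kind of guard this adds (placement and return value assumed):

	/* Reject unaligned mapping sizes early instead of looping forever. */
	if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
		return -EINVAL;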

[yadong.qi@linux.alibaba.com: fix uninitialized value issue]
  Closes: https://lore.kernel.org/r/202510110050.VG9YKMRK-lkp@intel.com/
Link: https://lkml.kernel.org/r/20251010014311.1689-1-yadong.qi@linux.alibaba.com
Signed-off-by: Yadong Qi <yadong.qi@linux.alibaba.com>
Reviewed-by: Huang Ying <ying.huang@linux.alibaba.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:02 -08:00
Song Hu
ca30ac479e mm/page_owner: simplify zone iteration logic in init_early_allocated_pages()
The current implementation uses nested loops: first iterating over all
online nodes, then over zones within each node.  This can be simplified by
using the for_each_populated_zone() macro which directly iterates through
all populated zones.

This change:
1. Removes the intermediate init_zones_in_node() function
2. Simplifies init_early_allocated_pages() to use direct zone iteration
3. Updates init_pages_in_zone() to take only a zone parameter and access
   the node id via zone->zone_pgdat

The functionality remains identical, but the code is cleaner and more
maintainable.
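
The resulting shape of the init path, as a sketch (assuming init_pages_in_zone()
now takes only the zone):

	static void init_early_allocated_pages(void)
	{
		struct zone *zone;

		for_each_populated_zone(zone)
			init_pages_in_zone(zone);	/* node reachable via zone->zone_pgdat */
	}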

Link: https://lkml.kernel.org/r/20250930092153.843109-2-husong@kylinos.cn
Signed-off-by: Song Hu <husong@kylinos.cn>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Ye Liu <liuye@kylinos.cn>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:01 -08:00
Song Hu
9686080d62 mm/page_owner: rename proc-prefixed variables for clarity
`proc_page_owner_operations` and related variables were renamed to
`page_owner_fops` to better reflect their association with `debugfs`
rather than `/proc`.  This improves code clarity and aligns with kernel
naming conventions.

Link: https://lkml.kernel.org/r/20250930092153.843109-1-husong@kylinos.cn
Signed-off-by: Song Hu <husong@kylinos.cn>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Ye Liu <liuye@kylinos.cn>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:01 -08:00
Sabyrzhan Tasbolatov
ada5cbe33a kasan: cleanup of kasan_enabled() checks
Deduplication of kasan_enabled() checks which are already used by callers.

* Altered functions:

check_page_allocation
	Delete the check because callers have it already in __wrappers in
	include/linux/kasan.h:
		__kasan_kfree_large
		__kasan_mempool_poison_pages
		__kasan_mempool_poison_object

kasan_populate_vmalloc, kasan_release_vmalloc
	Add __wrappers in include/linux/kasan.h.
	They are called externally in mm/vmalloc.c.

__kasan_unpoison_vmalloc, __kasan_poison_vmalloc
	Delete checks because there are already kasan_enabled() checks
	in respective __wrappers in include/linux/kasan.h.

release_free_meta -- Delete the check because the higher caller path
	has it already. See the stack trace:

	__kasan_slab_free -- has the check already
	__kasan_mempool_poison_object -- has the check already
		poison_slab_object
			kasan_save_free_info
				release_free_meta
					kasan_enabled() -- Delete here
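
A sketch of the __wrapper pattern added for the vmalloc hooks (the signature
shown for kasan_populate_vmalloc() is assumed/simplified):

	/* include/linux/kasan.h style wrapper (sketch) */
	static __always_inline int kasan_populate_vmalloc(unsigned long addr,
							  unsigned long size)
	{
		if (kasan_enabled())
			return __kasan_populate_vmalloc(addr, size);
		return 0;
	}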

Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:01 -08:00
Sabyrzhan Tasbolatov
27109f5703 kasan: remove __kasan_save_free_info wrapper
Patch series "kasan: cleanups for kasan_enabled() checks".

This patch series is the continuation of [1] the previous discussion
related to the KASAN internal refactoring.

Here we remove kasan_enabled() checks which are duplicated by higher
callers.  This check deduplication is also related to the separate
patch series [2].


This patch (of 2):

We don't need a kasan_enabled() check in kasan_save_free_info() at all. 
Both the higher level paths (kasan_slab_free and
kasan_mempool_poison_object) already contain this check.  Therefore,
remove the __wrapper.

Link: https://lkml.kernel.org/r/20251009155403.1379150-1-snovitoll@gmail.com
Link: https://lkml.kernel.org/r/20251009155403.1379150-2-snovitoll@gmail.com
Link: https://lore.kernel.org/all/CA+fCnZce3AR+pUesbDkKMtMJ+iR8eDrcjFTbVpAcwjBoZ=gJnQ@mail.gmail.com/ [1]
Link: https://lore.kernel.org/all/aNTfPjS2buXMI46D@MiWiFi-R3L-srv/ [2]
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:01 -08:00
Lokesh Gidra
cc22b99785 mm/userfaultfd: don't lock anon_vma when performing UFFDIO_MOVE
Now that rmap_walk() is guaranteed to be called with the folio lock held,
we can stop serializing on the src VMA anon_vma lock when moving an
exclusive folio from a src VMA to a dst VMA in UFFDIO_MOVE ioctl.

When moving a folio, we modify folio->mapping through
folio_move_anon_rmap() and adjust folio->index accordingly.  Doing that
while we could have concurrent RMAP walks would be dangerous.  Therefore,
to avoid that, we had to acquire anon_vma of src VMA in write-mode.  That
meant that when multiple threads called UFFDIO_MOVE concurrently on
distinct pages of the same src VMA, they would serialize on it, hurting
scalability.

In addition to avoiding the scalability bottleneck, this patch also
simplifies the complicated lock dance that UFFDIO_MOVE has to go through
between RCU, folio-lock, ptl, and anon_vma.

folio_move_anon_rmap() already enforces that the folio is locked.  So when
we have the folio locked we can no longer race with concurrent rmap_walk()
as used by folio_referenced() and others who call it on unlocked non-KSM
anon folios, and therefore the anon_vma lock is no longer required.

Note that this handling is now the same as for other
folio_move_anon_rmap() users that also do not hold the anon_vma lock --
namely COW reuse handling (do_wp_page()->wp_can_reuse_anon_folio(),
do_huge_pmd_wp_page(), and hugetlb_wp()).  These users never required the
anon_vma lock as they only move the folio's anon_vma closer to the anon_vma
leaf of the VMA, for example, from an anon_vma root to a leaf of that
root.  rmap walks were always able to tolerate that scenario.

Link: https://lkml.kernel.org/r/20250923071019.775806-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:00 -08:00
Lokesh Gidra
95b34d6648 mm: always call rmap_walk() on locked folios
Patch series "Improve UFFDIO_MOVE scalability by removing anon_vma lock", v2.

Userfaultfd has a scalability issue in its UFFDIO_MOVE ioctl, which is
heavily used in Android as its java garbage collector uses it for
concurrent heap compaction.

The issue arises because UFFDIO_MOVE updates folio->mapping to an anon_vma
with a different root, in order to move the folio from a src VMA to dst
VMA.  It performs the operation with the folio locked, but this is
insufficient, because rmap_walk() can be performed on non-KSM anonymous
folios without folio lock.

This means that UFFDIO_MOVE has to acquire the anon_vma write lock of the
root anon_vma belonging to the folio it wishes to move.

This causes a scalability bottleneck when multiple threads perform
UFFDIO_MOVE simultaneously on distinct pages of the same src VMA.  In field
traces of arm64 android devices, we have observed janky user interactions
due to long (sometimes over ~50ms) uninterruptible sleeps on main UI
thread caused by anon_vma lock contention in UFFDIO_MOVE.  This is
particularly severe during the beginning of GC's compaction phase when it
is likely to have multiple threads involved.

This patch resolves the issue by removing the exception in rmap_walk() for
non-KSM anon folios by ensuring that all folios are locked during rmap
walk.  This is less problematic than it might seem, as the only major
caller which utilises this mode is shrink_active_list(), which is covered
in detail in the first patch of this series.

As a result of changing our approach to locking, we can remove all the
code that took steps to acquire an anon_vma write lock instead of a folio
lock.  This results in a significant simplification and scalability
improvement of the code (currently only in UFFDIO_MOVE).  Furthermore, as
a side-effect, folio_lock_anon_vma_read() gets simpler as we don't need to
worry that folio->mapping may have changed under us.


This patch (of 2):

Guarantee that rmap_walk() is called on locked folios so that threads
changing folio->mapping and folio->index for non-KSM anon folios can
serialize on fine-grained folio lock rather than anon_vma lock.  Other
folio types are already always locked before rmap_walk().  With this, we
are going from 'not necessarily' locking the non-KSM anon folio to
'definitely' locking it during rmap walks.

This patch is in preparation for removing anon_vma write-lock from
UFFDIO_MOVE.

With this patch, three functions are now expected to be called with a
locked folio.  To be careful of not missing any case, here is the
exhaustive list of all their callers.

1) rmap_walk() is called from:

a) folio_referenced()
b) damon_folio_mkold()
c) damon_folio_young()
d) page_idle_clear_pte_refs()
e) try_to_unmap()
f) try_to_migrate()
g) folio_mkclean()
h) remove_migration_ptes()

In the above list, first 4 are changed in this patch to try-lock non-KSM
anon folios, similar to other types of folios.  The remaining functions in
the list already hold folio lock when calling rmap_walk().

2) folio_lock_anon_vma_read() is called from following functions:

a) collect_procs_anon()
b) page_idle_clear_pte_refs()
c) damon_folio_mkold()
d) damon_folio_young()
e) folio_referenced()
f) try_to_unmap()
g) try_to_migrate()

All the functions in the above list, except collect_procs_anon(), are
covered by the rmap_walk() list above.  For collect_procs_anon(),
kill_procs_now() is changed to take the folio lock in this patch, which
ensures that all callers of folio_lock_anon_vma_read() now hold the lock.

3) folio_get_anon_vma() is called from following functions, all of which
   already hold the folio lock:

a) move_pages_huge_pmd()
b) __folio_split()
c) move_pages_ptes()
d) migrate_folio_unmap()
e) unmap_and_move_huge_page()

Functionally, this patch doesn't break the logic because rmap walkers
generally do some other check to see if what is expected to be mapped
actually is, or otherwise treat things as best-effort.

Among the 4 functions changed in this patch, folio_referenced() is the
only core-mm function, and is also frequently accessed.  To assess the
impact of locking non-KSM anon folios in
shrink_active_list()->folio_referenced() path, we performed an app cycle
test on an arm64 android device.  During the whole duration of the test
there were over 140k invocations of shrink_active_list(), out of which
over 29k had at least one non-KSM anon folio on which folio_referenced()
was called.  In none of these invocations folio_trylock() failed.

Of course, we now take a lock where we wouldn't previously have.  In the
past it would have had a major impact in causing a CoW write fault to copy
a page in do_wp_page(), as commit 09854ba94c6a ("mm: do_wp_page()
simplification") caused a failure to obtain folio lock to result in a page
copy even if one wasn't necessary.

However, since commit 6c287605fd56 ("mm: remember exclusively mapped
anonymous pages with PG_anon_exclusive"), and the introduction of the
folio anon exclusive flag, this issue is significantly mitigated.

The only case remaining that we might worry about from this perspective is
that of read-only folios immediately after fork where the anon exclusive
bit will not have been set yet.

We note however in the case of read-only just-forked folios that
wp_can_reuse_anon_folio() will notice the raised reference count
established by shrink_active_list() via isolate_lru_folios() and refuse to
reuse in any case, so this will in fact have no impact - the folio lock is
ultimately immaterial here.

All-in-all it appears that there is little opportunity for meaningful
negative impact from this change.

Link: https://lkml.kernel.org/r/20250923071019.775806-1-lokeshgidra@google.com
Link: https://lkml.kernel.org/r/20250923071019.775806-2-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Barry Song <baohua@kernel.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:00 -08:00
Usama Arif
eb02f14c4a mm/hugetlb: allow overcommitting gigantic hugepages
Currently, gigantic hugepages cannot use the overcommit mechanism
(nr_overcommit_hugepages), forcing users to permanently reserve memory via
nr_hugepages even when pages might not be actively used.

The restriction was added in 2011 [1], which was before there was support
for reserving 1G hugepages at runtime.  Remove this blanket restriction on
gigantic hugepage overcommit.  This will bring the same benefits to
gigantic pages as hugepages:

- Memory is only taken out of regular use when actually needed
- Unused surplus pages can be returned to the system
- Better memory utilization, especially with CMA backing which can
  significantly increase the chances of hugepage allocation

Without this patch:
echo 3 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_overcommit_hugepages
bash: echo: write error: Invalid argument

With this patch:
echo 3 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_overcommit_hugepages
./mmap_hugetlb_test
Successfully allocated huge pages at address: 0x7f9d40000000

cat mmap_hugetlb_test.c
...
    unsigned long ALLOC_SIZE = 3 * (unsigned long) HUGE_PAGE_SIZE;
    addr = mmap(NULL,
                ALLOC_SIZE, // 3GB
                PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_1GB,
                -1,
                0);

    if (addr == MAP_FAILED) {
        fprintf(stderr, "mmap failed: %s\n", strerror(errno));
        return 1;
    }
    printf("Successfully allocated huge pages at address: %p\n", addr);
...

Link: https://lkml.kernel.org/r/20251009172433.4158118-2-usamaarif642@gmail.com
Link: https://git.zx2c4.com/linux-rng/commit/mm/hugetlb.c?id=adbe8726dc2a3805630d517270db17e3af86e526 [1]
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:00 -08:00
Usama Arif
a743e0af50 mm/hugetlb: create hstate_is_gigantic_no_runtime helper
This is a common condition used to skip operations that cannot be
performed on gigantic pages when runtime support is disabled.  This helper
is introduced as the condition will appear in even more places when allowing
"overcommit" of gigantic hugepages.  No functional change intended with
this patch.
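
A minimal sketch of the helper, assuming it just combines the two existing
checks it replaces:

	/* Hypothetical sketch of the helper named above. */
	static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
	{
		return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
	}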

Link: https://lkml.kernel.org/r/20251009172433.4158118-1-usamaarif642@gmail.com
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: SeongJae Park <sj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:28:00 -08:00
Joshua Hahn
2783088ef2 mm/page_alloc: prevent reporting pcp->batch = 0
zone_batchsize returns the appropriate value that should be used for
pcp->batch. If it finds a zone with less than 4096 pages or PAGE_SIZE >
1M, however, it leads to some incorrect math.

In the above case, we will get an intermediary value of 1, which is then
rounded down to the nearest power of two, and 1 is subtracted from it.
Since 1 is already a power of two, we will get batch = 1-1 = 0:

	batch = rounddown_pow_of_two(batch + batch/2) - 1;

A pcp->batch value of 0 is nonsensical. If this were actually set, then
functions like drain_zone_pages would become no-ops, since they could
only free 0 pages at a time.

Of the two callers of zone_batchsize, the one that is actually used to
set pcp->batch works around this by setting pcp->batch to the maximum
of 1 and zone_batchsize. However, the other caller, zone_pcp_init,
incorrectly reports the batch size of the zone as 0.

This is probably rare in a typical zone, but the DMA zone can often have
less than 4096 pages, which means it will print out "LIFO batch:0".

Before: [    0.001216]   DMA zone: 3998 pages, LIFO batch:0
After:  [    0.001210]   DMA zone: 3998 pages, LIFO batch:1

Instead of dealing with the error handling and the mismatch between the
reported and actual zone batchsize, just return 1 if the zone_batchsize
is 1 page or less before the rounding.
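
The arithmetic can be reproduced in userspace; a small stand-alone check for
the 3998-page DMA zone from the log above (4 KiB pages assumed):

	#include <stdio.h>

	/* userspace stand-in for the kernel helper of the same name */
	static unsigned long rounddown_pow_of_two(unsigned long n)
	{
		while (n & (n - 1))
			n &= n - 1;		/* clear low bits until a power of two remains */
		return n;
	}

	int main(void)
	{
		unsigned long managed = 3998;		/* DMA zone pages from the log above */
		unsigned long page_size = 4096, sz_1m = 1UL << 20;
		unsigned long a = managed >> 10;	/* 3 */
		unsigned long b = sz_1m / page_size;	/* 256 */
		unsigned long batch = a < b ? a : b;	/* 3 */

		batch /= 4;				/* 0 */
		if (batch < 1)
			batch = 1;			/* clamped to 1 */
		batch = rounddown_pow_of_two(batch + batch / 2) - 1;

		printf("batch = %lu\n", batch);		/* prints 0, i.e. "LIFO batch:0" */
		return 0;
	}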

Link: https://lkml.kernel.org/r/20251009192933.3756712-3-joshua.hahnjy@gmail.com
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:59 -08:00
Joshua Hahn
4dcf65bf5b mm/page_alloc: clarify batch tuning in zone_batchsize
Patch series "mm/page_alloc: pcp->batch cleanups", v2.

Two small cleanups for mm/page_alloc.

Patch 1 cleans up a misleading comment about how pcp->batch is calculated,
and folds in the calculation to increase clarity. No functional change
intended.

Patch 2 corrects zones from reporting that their pcp->batch is 0 when it
is actually 1. Namely, corrects ZONE_DMA from reporting that its batch
size is 0.


This patch (of 2):

Recently while working on another patch about batching free_pcppages_bulk
[1], I was curious why pcp->batch was always 63 on my machine.  This led
me to zone_batchsize(), where I found this set of lines to determine what
the batch size should be for the host:

	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

All of this is good, except the comment above which says "We effectively
*= 4 below".  Nowhere else in the function zone_batchsize(), is there a
corresponding multipliation by 4.  Looking into the history of this, it
seems like Dave Hansen had also noticed this back in 2013 [1].  Turns out
there *used* to be a corresponding *= 4, which was turned into a *= 6
later on to be used in pageset_setup_from_batch_size(), which no longer
exists.

Despite this mismatch not being corrected in the comments, it seems that
getting rid of the /= 4 leads to a performance regression on machines with
less than 250G memory and 176 processors.  As such, let us preserve the
functionality but clean up the comments.

Fold the /= 4 into the calculation above: shift by 10+2=12 and, instead of
dividing 1MB by PAGE_SIZE, divide 256KB by PAGE_SIZE; adjust the comments
accordingly.  No functional change intended.
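
Based purely on the description above, the fold looks roughly like this
(SZ_256K from <linux/sizes.h>; the exact resulting hunk is assumed):

	/* before */
	batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
	batch /= 4;		/* We effectively *= 4 below */

	/* after: the /4 folded into the shift and the constant */
	batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);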

Link: https://lkml.kernel.org/r/20251009192933.3756712-1-joshua.hahnjy@gmail.com
Link: https://lkml.kernel.org/r/20251009192933.3756712-2-joshua.hahnjy@gmail.com
Link: https://lore.kernel.org/all/20251002204636.4016712-1-joshua.hahnjy@gmail.com/ [1]
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:59 -08:00
Mauricio Faria de Oliveira
0de9a442ee mm/page_owner: update Documentation with 'show_handles' and 'show_stacks_handles'
Describe and provide examples for 'show_handles' and 'show_stacks_handles'.

Link: https://lkml.kernel.org/r/20251001175611.575861-6-mfo@igalia.com
Signed-off-by: Mauricio Faria de Oliveira <mfo@igalia.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:59 -08:00
Mauricio Faria de Oliveira
5513cfbcf4 mm/page_owner: add debugfs file 'show_stacks_handles'
Add the file 'show_stacks_handles' to show just stack traces and their
handles, in order to resolve stack traces and handles (i.e., to identify
the stack traces for handles in previous reads from 'show_handles').

All stacks/handles must show up, regardless of their number of pages, which
might have become zero or might no longer meet 'count_threshold', but did
make it into previous reads from 'show_handles' -- and so still need to be
resolved later.

P.S.: now, print the extra newline independently of the number of pages.

Link: https://lkml.kernel.org/r/20251001175611.575861-5-mfo@igalia.com
Signed-off-by: Mauricio Faria de Oliveira <mfo@igalia.com>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:59 -08:00
Mauricio Faria de Oliveira
3b52b9e31a mm/page_owner: add debugfs file 'show_handles'
Add the flag STACK_PRINT_FLAG_HANDLE to print a stack's handle number from
stackdepot, and add the file 'show_handles' to show just handles and their
number of pages.

This is similar to 'show_stacks', with handles instead of stack traces.

Link: https://lkml.kernel.org/r/20251001175611.575861-4-mfo@igalia.com
Signed-off-by: Mauricio Faria de Oliveira <mfo@igalia.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:58 -08:00
Mauricio Faria de Oliveira
5c8ca473d5 mm/page_owner: add struct stack_print_ctx.flags
Add the flags field to stack_print_ctx, and define two flags for current
behavior (printing stack traces and their number of base pages).

The plumbing of flags is debugfs_create_file(data) -> inode.i_private ->
page_owner_stack_open() -> stack_print_ctx.flags -> stack_print().

No behavior change intended.
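
A sketch of that plumbing; the two flag names for the existing behavior are
hypothetical, only STACK_PRINT_FLAG_HANDLE is named elsewhere in this series:

	#define STACK_PRINT_FLAG_STACK		BIT(0)	/* print the stack trace (hypothetical name) */
	#define STACK_PRINT_FLAG_NR_BASE_PAGES	BIT(1)	/* print nr_base_pages (hypothetical name) */
	#define STACK_PRINT_FLAG_HANDLE		BIT(2)	/* print the stackdepot handle */

	struct stack_print_ctx {
		struct stack *stack;	/* iterator formerly kept in seq_file.private */
		unsigned long flags;	/* debugfs_create_file(data) -> i_private -> here */
	};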

Link: https://lkml.kernel.org/r/20251001175611.575861-3-mfo@igalia.com
Signed-off-by: Mauricio Faria de Oliveira <mfo@igalia.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:58 -08:00
Mauricio Faria de Oliveira
156c0c5d14 mm/page_owner: introduce struct stack_print_ctx
Patch series "mm/page_owner: add debugfs files 'show_handles' and
'show_stacks_handles'", v2.

Context:

The page_owner debug feature can help understand a particular situation at
a point in time (e.g., identify biggest memory consumers; verify memory
counters that do not add up).

Another useful usecase is to collect data repeatedly over time, and use it
for profiling, monitoring, and even comparing different kernel versions,
at the stack trace level (e.g., watch for trends, leaks, correlations, and
regressions).

For this usecase, userspace periodically collects the data from page_owner
and organizes it in data structures appropriate for per-stack-trace
access.

Problem:

The usecase of tracking memory usage per stack trace (or tracking it for a
particular stack trace) requires uniquely identifying each stack trace
(i.e., keys to store their memory usage over periodic data collections).

This has to be done for every stack trace in every sample/data collection,
even if tracking only one stack trace (to identify it among all others).

Therefore, an approach like hashing the stack traces in userspace to
create unique keys/identifiers for them during post-processing can quickly
become expensive, considering the repetition and a growing number of stack
traces.

Solution:

Fortunately, the kernel can provide a unique identifier for stack traces
in page_owner, which is the handle number in stackdepot.  This eliminates
the need for creating keys (hashing) in userspace during post-processing.

Additionally, with that information, the stack traces themselves are not
needed until the memory usage should be resolved from a handle to a stack
trace (say, to look at the stack traces of a few top consumers).  This can
reduce the amount of text emitted/copied by the kernel to userspace, and
save userspace from matching and discarding stack traces when not needed.

Changes:

This patchset adds 2 files to provide information, like 'show_stacks':
 - show_handles: print handle number and number of pages (no stack traces)
 - show_stacks_handles: print handle numbers and stack traces (no pages)

Now, it's possible to periodically collect data with handle numbers (keys)
and without stack traces (lower overhead) from 'show_handles', and later
do a final collection with handles and stack traces from
'show_stacks_handles' to resolve the handles to their stack traces.

The output format follows the existing 'show_stacks' file, for simplicity,
but it can certainly be changed if a different format is more convenient.

Example:

The number of base pages collected can be stored per handle number over
the periodic data collections, and the handles can finally be resolved to
stack traces with a final collection.

Later, one can, for example, identify the biggest consumers and watch
their trends or correlate increases/decreases with other events in the
system, or watch a particular stack trace(s) of interest during
development.

Testing:

Tested on next-20250929.

 - show_stacks:
	
	 register_dummy_stack+0x32/0x70
	 init_page_owner+0x29/0x2f0
	 page_ext_init+0x27c/0x2b0
	 mm_core_init+0xdc/0x110
	nr_base_pages: 47
		
 - show_handles:

	handle: 1
	nr_base_pages: 47

 - show_stacks_handles:

	 register_dummy_stack+0x32/0x70
	 init_page_owner+0x29/0x2f0
	 page_ext_init+0x27c/0x2b0
	 mm_core_init+0xdc/0x110
	handle: 1
	
 - count_threshold:

	# echo 100 >/sys/kernel/debug/page_owner_stacks/count_threshold
	# grep register_dummy_stack show_stacks		# not present
	# grep -B4 '^handle: 1$' show_handles  		# not present 
	# grep -B4 '^handle: 1$' show_stacks_handles	# present
	 register_dummy_stack+0x32/0x70
	 init_page_owner+0x29/0x2f0
	 page_ext_init+0x27c/0x2b0
	 mm_core_init+0xdc/0x110
	handle: 1


This patch (of 5):

Currently, struct seq_file.private is used as an iterator in stack_list by
stack_start|next(), for stack_print().

Create a context struct for this, in order to add another field next.

No behavior change intended.

P.S.: page_owner_stack_open() is expanded with separate statements for
variable definition and return just in preparation for the next patch.

Link: https://lkml.kernel.org/r/20251001175611.575861-1-mfo@igalia.com
Link: https://lkml.kernel.org/r/20251001175611.575861-2-mfo@igalia.com
Signed-off-by: Mauricio Faria de Oliveira <mfo@igalia.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Brendan Jackman <jackmanb@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:58 -08:00
Anshuman Khandual
b4e53984f2 mm/dirty: replace READ_ONCE() with pudp_get()
Replace READ_ONCE() with the standard page table accessor, i.e. pudp_get(),
which in any case defaults to READ_ONCE() where the platform does not
override it.
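
The substitution, in miniature (a sketch):

	pud_t pud;

	pud = READ_ONCE(*pudp);	/* before: open-coded access */
	pud = pudp_get(pudp);	/* after: standard accessor, defaults to READ_ONCE() */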

Link: https://lkml.kernel.org/r/20251006055214.1845342-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lance Yang <lance.yang@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:58 -08:00
Ryan Roberts
9ac09bb9fe mm: consistently use current->mm in mm_get_unmapped_area()
mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() /
arch_get_unmapped_area_topdown(), both of which search current->mm for
some free space.  Neither take an mm_struct - they implicitly operate on
current->mm.

But the wrapper takes an mm_struct and uses it to decide whether to search
bottom up or top down.  All callers pass in current->mm for this, so
everything is working consistently.  But it feels like an accident waiting
to happen; eventually someone will call that function with a different mm,
expecting to find free space in it, but what gets returned is free space
in the current mm.

So let's simplify by removing the parameter and have the wrapper use
current->mm to decide which end to start at.  Now everything is consistent
and self-documenting.

Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:57 -08:00
SeongJae Park
0fdaa13ee9 Docs/admin-guide/mm/zswap: s/red-black tree/xarray/
The change from commit 796c2c23e14e ("zswap: replace RB tree with xarray")
is not reflected in the document.  Update the document.

Link: https://lkml.kernel.org/r/20251003203851.43128-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:57 -08:00
SeongJae Park
1f52f3de4b mm/zswap: s/red-black tree/xarray/
Changes made by commit 796c2c23e14e ("zswap: replace RB tree with xarray")
are not reflected in a comment.  Update the comment.

Link: https://lkml.kernel.org/r/20251003203851.43128-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:57 -08:00
SeongJae Park
f7ed6bf237 mm/zswap: fix typos: s/zwap/zswap/
As the subject says.

Link: https://lkml.kernel.org/r/20251003203851.43128-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:57 -08:00
SeongJae Park
138336d674 mm/zswap: remove unnecessary dlen writes for incompressible pages
Patch series "mm/zswap: misc cleanup of code and documentations".

Clean up an unnecessary local variable write in incompressible pages
handling, typos (s/zwap/zswap/) and outdated comments/documentation about
zswap's red-black tree, which was replaced by an xarray.


This patch (of 4):

The incompressible pages handling logic in zswap_compress() sets 'dlen'
to PAGE_SIZE twice: once before deciding whether to save the content as
is, and once again after it is decided to save it as is.  But the value of
'dlen' is used only if it is decided to save the content as is, so the
first write is unnecessary.  It does not cause real user issues, but it
makes the code confusing to read.  Remove the unnecessary write operation.

Link: https://lkml.kernel.org/r/20251003203851.43128-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20251003203851.43128-2-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Acked-by: Nhat Pham <nphamcs@gmail.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Chris Li <chrisl@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:56 -08:00
Fushuai Wang
bd63d0fde2 mm/vmscan: remove redundant __GFP_NOWARN
The __GFP_NOWARN flag was included in GFP_NOWAIT since commit 16f5dfbc851b
("gfp: include __GFP_NOWARN in GFP_NOWAIT").  So remove the redundant
__GFP_NOWARN flag.

Link: https://lkml.kernel.org/r/20251006014948.44695-1-wangfushuai@baidu.com
Signed-off-by: Fushuai Wang <wangfushuai@baidu.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yuanchu Xie <yuanchu@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:56 -08:00
Roman Gushchin
9f1edf1aed mm: readahead: make thp readahead conditional to mmap_miss logic
Commit 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
introduced a special handling for VM_HUGEPAGE mappings: even if the
readahead is disabled, 1 or 2 HPAGE_PMD_ORDER pages are allocated.

This change causes a significant regression for containers with a tight
memory.max limit, if VM_HUGEPAGE is widely used.  Prior to this commit,
mmap_miss logic would eventually lead to the readahead disablement,
effectively reducing the memory pressure in the cgroup.  With this change
the kernel is trying to allocate 1-2 huge pages for each fault, no matter
if these pages are used or not before being evicted, increasing the memory
pressure multi-fold.

To fix the regression, let's make the new VM_HUGEPAGE handling conditional
on the mmap_miss check, but keep it independent from ra->ra_pages.  This
way the main intention of commit 4687fdbb805a ("mm/filemap: Support
VM_HUGEPAGE for file mappings") stays intact, but the regression is
resolved.

The logic behind this change is simple: even if a user explicitly requests
huge pages to back the file mapping (using the VM_HUGEPAGE flag), under
very strong memory pressure it is better to fall back to ordinary pages.

Link: https://lkml.kernel.org/r/20251006175106.377411-1-roman.gushchin@linux.dev
Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:56 -08:00
Anshuman Khandual
c0efdb373c mm: replace READ_ONCE() with standard page table accessors
Replace all READ_ONCE() with the standard page table accessors, i.e.
pxdp_get(), which default to READ_ONCE() in cases where the platform does
not override them.
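
As a rough illustration of the accessor style (a sketch, not the patched
code itself):

  #include <linux/pgtable.h>

  /* Read a PMD entry through the generic accessor; pmdp_get() defaults
   * to READ_ONCE(*pmdp) unless the architecture overrides it. */
  static pmd_t read_pmd_entry(pmd_t *pmdp)
  {
  	return pmdp_get(pmdp);	/* was: READ_ONCE(*pmdp) */
  }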

Link: https://lkml.kernel.org/r/20251007063100.2396936-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:56 -08:00
xu xin
bda7bf0684 selftests: update ksm inheritance tests for prctl fork/exec
To reproduce the issue mentioned in [1], this adds a setting of
pages_to_scan and sleep_millisecs at the start of test_prctl_fork_exec().
The main change is just to raise the scanning frequency of ksmd.

[1] https://lore.kernel.org/all/202510012256278259zrhgATlLA2C510DMD3qI@zte.com.cn/

Link: https://lkml.kernel.org/r/20251007182935207jm31wCIgLpZg5XbXQY64S@zte.com.cn
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jinjiang Tu <tujinjiang@huawei.com>
Cc: Stefan Roesch <shr@devkernel.io>
Cc: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:55 -08:00
xu xin
590c03ca6a mm/ksm: fix exec/fork inheritance support for prctl
Patch series "ksm: fix exec/fork inheritance", v2.

This series fixes exec/fork inheritance.  See the detailed description of
the issue below.


This patch (of 2):

Background
==========

commit d7597f59d1d33 ("mm: add new api to enable ksm per process")
introduced MMF_VM_MERGE_ANY for mm->flags, and allowed users to set it via
prctl() so that the process's VMAs are forcibly scanned by ksmd.

Subsequently, commit 3c6f33b7273a ("mm/ksm: support fork/exec for prctl")
added support for inheriting the MMF_VM_MERGE_ANY flag when a task calls
execve().

Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl")
fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY
by adding the mm_slot to ksm_mm_head in __bprm_mm_init().

Problem
=======

In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY
during exec/fork can fail.  For example, when the scanning frequency of
ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may
still fail to pass it to the newly exec'd process.  This happens because
ksm_execve() is executed too early in the do_execve flow (prematurely
adding the new mm_struct to the ksm_mm_slot list).

As a result, before do_execve completes, ksmd may have already performed a
scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus
clearing its MMF_VM_MERGE_ANY flag.  Consequently, when the new program
executes, the MMF_VM_MERGE_ANY flag inheritance is missed.

Root reason
===========

commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clears
the MMF_VM_MERGE_ANY flag when ksmd finds no VM_MERGEABLE VMAs.

Solution
========

Firstly, don't clear MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE
VMAs, because the mm_struct may have just been added to the ksm_mm_slot
list while its process has not yet officially started running, or has not
yet performed mmap/brk to allocate anonymous VMAs.

Secondly, if a process carries MMF_VM_MERGE_ANY, recheck MMF_VM_MERGEABLE,
and create a mm_slot and add it to the ksm_scan_list again.

Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn
Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn
Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl")
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process")
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Cc: Stefan Roesch <shr@devkernel.io>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jinjiang Tu <tujinjiang@huawei.com>
Cc: Wang Yaxin <wang.yaxin@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:55 -08:00
Uladzislau Rezki (Sony)
c6307674ed mm: kvmalloc: add non-blocking support for vmalloc
Extend __kvmalloc_node_noprof() to handle non-blocking GFP flags
(GFP_NOWAIT and GFP_ATOMIC).  Previously such flags were rejected,
returning NULL.  With this change:

- kvmalloc() can fall back to vmalloc() in non-blocking contexts;
- for non-blocking allocations the VM_ALLOW_HUGE_VMAP option is
  disabled, since the huge mapping path still contains might_sleep();
- the documentation is updated to reflect that GFP_NOWAIT and
  GFP_ATOMIC are now supported.
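
A minimal usage sketch of what this enables (hypothetical caller, not from
the patch):

  #include <linux/slab.h>

  /* A large allocation from a context that must not sleep; kvmalloc()
   * may now fall back to vmalloc() instead of returning NULL outright
   * for non-blocking GFP flags.  Failure is still possible. */
  static void *alloc_big_table_nowait(size_t nr_entries)
  {
  	return kvmalloc_array(nr_entries, sizeof(u64), GFP_NOWAIT);
  }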

Link: https://lkml.kernel.org/r/20251007122035.56347-11-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:55 -08:00
Uladzislau Rezki (Sony)
0667b209e9 mm/vmalloc: update __vmalloc_node_range() documentation
__vmalloc() now supports non-blocking flags such as GFP_ATOMIC and
GFP_NOWAIT.  Update the documentation accordingly.

Link: https://lkml.kernel.org/r/20251007122035.56347-10-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:55 -08:00
Uladzislau Rezki (Sony)
7241bb2ea3 mm: skip might_alloc() warnings when PF_MEMALLOC is set
might_alloc() catches invalid blocking allocations in contexts where
sleeping is not allowed.

However when PF_MEMALLOC is set, the page allocator already skips reclaim
and other blocking paths.  In such cases, a blocking gfp_mask does not
actually lead to blocking, so triggering might_alloc() splats is
misleading.

Adjust might_alloc() to skip warnings when the current task has
PF_MEMALLOC set, matching the allocator's actual blocking behaviour.
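
A sketch of the adjusted check, assuming the current shape of might_alloc()
in <linux/sched/mm.h> (not the verbatim patch):

  static inline void might_alloc(gfp_t gfp_mask)
  {
  	fs_reclaim_acquire(gfp_mask);
  	fs_reclaim_release(gfp_mask);

  	/* PF_MEMALLOC tasks never enter reclaim, so a blocking mask
  	 * cannot actually block here; skip the sleep annotation. */
  	if (!(current->flags & PF_MEMALLOC))
  		might_sleep_if(gfpflags_allow_blocking(gfp_mask));
  }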

Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:54 -08:00
Uladzislau Rezki (Sony)
b186a94227 kmsan: remove hard-coded GFP_KERNEL flags
kmsan_vmap_pages_range_noflush() allocates its temporary s_pages/o_pages
arrays with GFP_KERNEL, which may sleep.  This is inconsistent with
vmalloc(), which will support non-blocking requests later.

Plumb gfp_mask through kmsan_vmap_pages_range_noflush() so it can use it
internally for its own allocations.

Please note, the subsequent __vmap_pages_range_noflush() still uses
GFP_KERNEL and can sleep.  If a caller runs under reclaim constraints
where sleeping is forbidden, it must establish the appropriate memalloc
scope.

Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:54 -08:00
Uladzislau Rezki (Sony)
ad435e79f8 mm/kasan: support non-blocking GFP in kasan_populate_vmalloc()
A "gfp_mask" is already passed to kasan_populate_vmalloc() as an argument
to respect GFPs from callers and KASAN uses it for its internal
allocations.

But the apply_to_page_range() function ignores GFP flags due to a
hard-coded mask.

Wrap the call with memalloc_apply_gfp_scope()/memalloc_restore_scope() so
that non-blocking GFP flags (GFP_ATOMIC, GFP_NOWAIT) are respected.

Link: https://lkml.kernel.org/r/20251007122035.56347-7-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:54 -08:00
Uladzislau Rezki (Sony)
8da89ba18e mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node()
Make __vmalloc_area_node() respect non-blocking GFP masks such as
GFP_ATOMIC and GFP_NOWAIT.

- Add memalloc_apply_gfp_scope()/memalloc_restore_scope()
  helpers to apply a proper scope.
- Apply memalloc_apply_gfp_scope()/memalloc_restore_scope()
  around vmap_pages_range() for page table setup.
- Set "nofail" to false if a non-blocking mask is used, as
  they are mutually exclusive.

This is particularly important for page table allocations that internally
use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are
applied.  For example:

<snip>
__pte_alloc_kernel()
  pte_alloc_one_kernel(&init_mm);
    pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0);
<snip>

Note: in most cases, PTE entries are established only up to the level
required by current vmap space usage, meaning the page tables are
typically fully populated during the mapping process.
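
The general shape of such a scope, shown here with the long-standing
memalloc_noreclaim_save()/memalloc_noreclaim_restore() pair rather than the
series' new memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers,
whose exact signatures are not reproduced here:

  #include <linux/sched/mm.h>

  extern int do_map_pages(void);	/* hypothetical page-table setup step */

  /* Everything called inside the scope inherits the constraint, even if
   * it internally allocates with GFP_KERNEL-style flags such as
   * GFP_PGTABLE_KERNEL. */
  static int setup_mapping_nonblocking(void)
  {
  	unsigned int noreclaim_flags = memalloc_noreclaim_save();
  	int ret = do_map_pages();

  	memalloc_noreclaim_restore(noreclaim_flags);
  	return ret;
  }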

Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:54 -08:00
Uladzislau Rezki (Sony)
9c47753167 mm/vmalloc: defer freeing partly initialized vm_struct
__vmalloc_area_node() may call free_vmap_area() or vfree() on error paths,
both of which can sleep.  This becomes problematic if the function is
invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is
passed via gfp_mask.

To fix this, unify error paths and defer the cleanup of partly initialized
vm_struct objects to a workqueue.  This ensures that freeing happens in a
process context and avoids invalid sleeps in atomic regions.
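
The generic shape of the technique looks roughly like this (illustrative
names, not the vmalloc internals):

  #include <linux/workqueue.h>
  #include <linux/slab.h>

  struct deferred_free {
  	struct work_struct work;
  	void *object;			/* partly initialized object */
  };

  static void deferred_free_fn(struct work_struct *work)
  {
  	struct deferred_free *df = container_of(work, struct deferred_free, work);

  	kfree(df->object);		/* sleeping is fine here */
  	kfree(df);
  }

  static void defer_free(struct deferred_free *df)
  {
  	INIT_WORK(&df->work, deferred_free_fn);
  	schedule_work(&df->work);	/* safe from atomic context */
  }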

Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:53 -08:00
Uladzislau Rezki (Sony)
86e968d8ca mm/vmalloc: support non-blocking GFP flags in alloc_vmap_area()
alloc_vmap_area() currently assumes that sleeping is allowed during
allocation.  This is not true for callers which pass non-blocking GFP
flags, such as GFP_ATOMIC or GFP_NOWAIT.

This patch adds logic to detect whether the given gfp_mask permits
blocking.  It avoids invoking might_sleep() or falling back to the reclaim
path if blocking is not allowed.

This makes alloc_vmap_area() safer for use in non-sleeping contexts, where
previously it could hit unexpected sleeps or trigger warnings.

It is a preparation and adjustment step to later allow both GFP_ATOMIC and
GFP_NOWAIT allocations in this series.
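
A rough sketch of the gating, using the existing gfpflags_allow_blocking()
helper (illustrative, not the exact patch):

  #include <linux/gfp.h>
  #include <linux/kernel.h>

  static void vmap_alloc_checks(gfp_t gfp_mask)
  {
  	/* Only assert sleepability when the caller's mask permits it;
  	 * likewise, a reclaim/retry fallback would only be attempted
  	 * when blocking is allowed. */
  	if (gfpflags_allow_blocking(gfp_mask))
  		might_sleep();
  }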

Link: https://lkml.kernel.org/r/20251007122035.56347-4-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:53 -08:00
Uladzislau Rezki (Sony)
e781c1c0a9 lib/test_vmalloc: remove xfail condition check
A test marked with "xfail = true" is expected to fail, but that does not
mean it is predetermined to fail.  Remove the "xfail" condition check for
tests which pass successfully.

Link: https://lkml.kernel.org/r/20251007122035.56347-3-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:53 -08:00
Uladzislau Rezki (Sony)
9ff86ca1cc lib/test_vmalloc: add no_block_alloc_test case
Patch series "__vmalloc()/kvmalloc() and no-block support", v4.


This patch (of 10):

Introduce a new test case "no_block_alloc_test" that verifies non-blocking
allocations using __vmalloc() with GFP_ATOMIC and GFP_NOWAIT flags.

It is recommended to build the kernel with CONFIG_DEBUG_ATOMIC_SLEEP
enabled to help catch "sleeping while atomic" issues.  This test ensures
that memory allocation logic under atomic constraints does not
inadvertently sleep.
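
A rough sketch of what such a test exercises (not the test_vmalloc.c code
itself):

  #include <linux/vmalloc.h>
  #include <linux/gfp.h>
  #include <linux/preempt.h>

  static int no_block_alloc_sketch(void)
  {
  	void *p;

  	preempt_disable();			/* enter an atomic section */
  	p = __vmalloc(PAGE_SIZE, GFP_ATOMIC);	/* must not sleep here */
  	preempt_enable();

  	if (!p)
  		return -ENOMEM;		/* failing is fine, sleeping is not */

  	vfree(p);			/* freed outside the atomic section */
  	return 0;
  }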

Link: https://lkml.kernel.org/r/20251007122035.56347-2-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Marco Elver <elver@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:52 -08:00
Anshuman Khandual
11119b19f6 mm/ptdump: replace READ_ONCE() with standard page table accessors
Replace READ_ONCE() with the standard page table accessors, i.e.
pxdp_get(), which default to READ_ONCE() in cases where the platform does
not override them.  Also convert ptep_get_lockless() into ptep_get().

Link: https://lkml.kernel.org/r/20251001042502.1400726-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Acked-by: Lance Yang <lance.yang@linux.dev>
Acked-by: SeongJae Park <sj@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:52 -08:00
jianyun.gao
b6c46600bf mm: fix some typos in mm module
Below are some typos in the code comments:

  intevals ==> intervals
  addesses ==> addresses
  unavaliable ==> unavailable
  facor ==> factor
  droping ==> dropping
  exlusive ==> exclusive
  decription ==> description
  confict ==> conflict
  desriptions ==> descriptions
  otherwize ==> otherwise
  vlaue ==> value
  cheching ==> checking
  exisitng ==> existing
  modifed ==> modified
  differenciate ==> differentiate
  refernece ==> reference
  permissons ==> permissions
  indepdenent ==> independent
  spliting ==> splitting

Just fix it.

Link: https://lkml.kernel.org/r/20250929002608.1633825-1-jianyungao89@gmail.com
Signed-off-by: jianyun.gao <jianyungao89@gmail.com>
Reviewed-by: SeongJae Park <sj@kernel.org>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Acked-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:52 -08:00
Anshuman Khandual
37d1792548 mm/thp: drop follow_devmap_pmd() default stub
follow_devmap_pmd() has already been dropped by commit fd2825b0760a
("mm/gup: remove pXX_devmap usage from get_user_pages()").  The fallback
stub in the header, which is now redundant, can be dropped as well.

Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-16 17:27:52 -08:00
Andrew Morton
3407caccb8 Merge branch 'mm-hotfixes-stable' into mm-stable to be able to merge "mm:
introduce deferred freeing for kernel page tables" into mm-stable.
2025-11-16 17:24:35 -08:00
Steven Rostedt
bc089c4725 tracing: Convert function graph set_flags() to use a switch() statement
Currently the set_flags() of the function graph tracer has a bunch of:

  if (bit == FLAG1) {
	[..]
  }

  if (bit == FLAG2) {
	[..]
  }

To clean it up a bit, convert it over to a switch statement.
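
In other words, the shape becomes (same illustrative flag names as above):

  switch (bit) {
  case FLAG1:
  	/* handle FLAG1 */
  	break;
  case FLAG2:
  	/* handle FLAG2 */
  	break;
  }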

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251114192319.117123664@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-14 14:30:55 -05:00
Steven Rostedt
5abb6ccb58 tracing: Have function graph tracer option sleep-time be per instance
Currently the option to have the function graph tracer ignore time spent
while a task is sleeping is global, even though the interface is
per-instance. Changing the value in one instance will affect the results
of another instance that is also running the function graph tracer. This
can lead to confusing results.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251114192318.950255167@kernel.org
Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-14 14:30:55 -05:00
Steven Rostedt
4132886e1b tracing: Move graph-time out of function graph options
The option "graph-time" affects the function profiler when it is using the
function graph infrastructure. It has nothing to do with the function
graph tracer itself. The option only affects the global function profiler
and does nothing to the function graph tracer.

Move it out of the function graph tracer options and make it a global
option that is only available at the top level instance.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251114192318.781711154@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-14 14:30:55 -05:00
Steven Rostedt
6479325eca tracing: Have function graph tracer option funcgraph-irqs be per instance
Currently the option to trace interrupts in the function graph tracer is
global, even though the interface is per-instance. Changing the value in
one instance will affect the results of another instance that is also
running the function graph tracer. This can lead to confusing results.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251114192318.613867934@kernel.org
Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-14 14:30:54 -05:00
Steven Rostedt
0d5077c73a MAINTAINERS: Add Tomas Glozar as a maintainer to RTLA tool
Tomas will start taking over managing the changes to the Real-time Linux
Analysis (RTLA) tool. Make him officially one of the maintainers.

Also update the RTLA entry to include the linux-kernel mailing list as
well as list the patchwork and git repository that the patches will go
through.

Cc: Tomas Glozar <tglozar@redhat.com>
Cc: Gabriele Monaco <gmonaco@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Crystal Wood <crwood@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/20251112113556.47ec9d12@gandalf.local.home
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-14 12:24:41 -05:00
Yongliang Gao
97e047f44d trace/pid_list: optimize pid_list->lock contention
When the system has many cores and task switching is frequent,
setting set_ftrace_pid can cause frequent pid_list->lock contention
and high system (sys) CPU usage.

For example, in a 288-core VM environment, we observed 267 CPUs
experiencing contention on pid_list->lock, with stack traces showing:

 #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
 #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
 #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161

Replace the existing spinlock with a seqlock to allow concurrent readers
while maintaining write exclusivity.
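
The reader/writer pattern looks roughly like this (illustrative lock and
bitmap, not the actual pid_list code; the real code would use the irq-safe
write variants since it runs around scheduling paths):

  #include <linux/seqlock.h>
  #include <linux/bitops.h>

  static DEFINE_SEQLOCK(pid_list_seqlock);

  /* Readers run concurrently and retry only if a writer interleaved. */
  static bool pid_is_set(unsigned long *bitmap, unsigned int pid)
  {
  	unsigned int seq;
  	bool ret;

  	do {
  		seq = read_seqbegin(&pid_list_seqlock);
  		ret = test_bit(pid, bitmap);
  	} while (read_seqretry(&pid_list_seqlock, seq));

  	return ret;
  }

  /* Writers still exclude each other. */
  static void pid_set(unsigned long *bitmap, unsigned int pid)
  {
  	write_seqlock(&pid_list_seqlock);
  	__set_bit(pid, bitmap);
  	write_sequnlock(&pid_list_seqlock);
  }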

Link: https://patch.msgid.link/20251113000252.1058144-1-leonylgao@gmail.com
Reviewed-by: Huang Cun <cunhuang@tencent.com>
Signed-off-by: Yongliang Gao <leonylgao@tencent.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-13 15:15:54 -05:00
Steven Rostedt
e29aa918a9 tracing: Have function graph tracer define options per instance
Currently the function graph tracer's options are saved via a global mask
when they should be per instance. Use the new infrastructure to define a
"default_flags" field in the tracer structure that is used for the top
level instance as well as new ones.

Currently the global mask causes confusion:

  # cd /sys/kernel/tracing
  # mkdir instances/foo
  # echo function_graph > instances/foo/current_tracer
  # echo 1 > options/funcgraph-args
  # echo function_graph > current_tracer
  # cat trace
[..]
 2)               |          _raw_spin_lock_irq(lock=0xffff96b97dea16c0) {
 2)   0.422 us    |            do_raw_spin_lock(lock=0xffff96b97dea16c0);
 7)               |              rcu_sched_clock_irq(user=0) {
 2)   1.478 us    |          }
 7)   0.758 us    |                rcu_is_cpu_rrupt_from_idle();
 2)   0.647 us    |          enqueue_hrtimer(timer=0xffff96b97dea2058, base=0xffff96b97dea1740, mode=0);
 # cat instances/foo/options/funcgraph-args
 1
 # cat instances/foo/trace
[..]
 4)               |  __x64_sys_read() {
 4)               |    ksys_read() {
 4)   0.755 us    |      fdget_pos();
 4)               |      vfs_read() {
 4)               |        rw_verify_area() {
 4)               |          security_file_permission() {
 4)               |            apparmor_file_permission() {
 4)               |              common_file_perm() {
 4)               |                aa_file_perm() {
 4)               |                  rcu_read_lock_held() {
[..]

The above shows that updating the "funcgraph-args" option at the top level
instance also updates the "funcgraph-args" option in the instance, but
because the update is only done by the instance that gets changed (as it
should be), it's confusing to see that the option is already set in the
other instance.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251111232429.641030027@kernel.org
Fixes: c132be2c4fcc1 ("function_graph: Have the instances use their own ftrace_ops for filtering")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-13 15:08:17 -05:00
Steven Rostedt
76680d0d28 tracing: Have function tracer define options per instance
Currently the function tracer's options are saved via a global mask when
they should be per instance. Use the new infrastructure to define a
"default_flags" field in the tracer structure that is used for the top
level instance as well as new ones.

Currently the global mask causes confusion:

  # cd /sys/kernel/tracing
  # mkdir instances/foo
  # echo function > instances/foo/current_tracer
  # echo 1 > options/func-args
  # echo function > current_tracer
  # cat trace
[..]
  <idle>-0       [005] d..3.  1050.656187: rcu_needs_cpu() <-tick_nohz_next_event
  <idle>-0       [005] d..3.  1050.656188: get_next_timer_interrupt(basej=0x10002dbad, basem=0xf45fd7d300) <-tick_nohz_next_event
  <idle>-0       [005] d..3.  1050.656189: _raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
  <idle>-0       [005] d..4.  1050.656190: do_raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
  <idle>-0       [005] d..4.  1050.656191: _raw_spin_lock_nested(lock=0xffff8944bdf5f140, subclass=1) <-__get_next_timer_interrupt
 # cat instances/foo/options/func-args
 1
 # cat instances/foo/trace
[..]
  kworker/4:1-88      [004] ...1.   298.127735: next_zone <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127736: first_online_pgdat <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127738: next_online_pgdat <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127739: fold_diff <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127741: round_jiffies_relative <-vmstat_update
[..]

The above shows that updating the "func-args" option at the top level
instance also updates the "func-args" option in the instance, but because
the update is only done by the instance that gets changed (as it should
be), it's confusing to see that the option is already set in the other
instance.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251111232429.470883736@kernel.org
Fixes: f20a580627f43 ("ftrace: Allow instances to use function tracing")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-12 09:59:54 -05:00
Steven Rostedt
428add559b tracing: Have tracer option be instance specific
Tracers can specify options to modify their behavior. This logic was added
before instances were created, when the tracer flags were global variables.
After instances were created, where a tracer may exist in more than one
instance, the flags were not updated from being global to instance
specific. This causes confusion with these options. For example, the
function tracer has an option to enable function arguments:

  # cd /sys/kernel/tracing
  # mkdir instances/foo
  # echo function > instances/foo/current_tracer
  # echo 1 > options/func-args
  # echo function > current_tracer
  # cat trace
[..]
  <idle>-0       [005] d..3.  1050.656187: rcu_needs_cpu() <-tick_nohz_next_event
  <idle>-0       [005] d..3.  1050.656188: get_next_timer_interrupt(basej=0x10002dbad, basem=0xf45fd7d300) <-tick_nohz_next_event
  <idle>-0       [005] d..3.  1050.656189: _raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
  <idle>-0       [005] d..4.  1050.656190: do_raw_spin_lock(lock=0xffff8944bdf5de80) <-__get_next_timer_interrupt
  <idle>-0       [005] d..4.  1050.656191: _raw_spin_lock_nested(lock=0xffff8944bdf5f140, subclass=1) <-__get_next_timer_interrupt
 # cat instances/foo/options/func-args
 1
 # cat instances/foo/trace
[..]
  kworker/4:1-88      [004] ...1.   298.127735: next_zone <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127736: first_online_pgdat <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127738: next_online_pgdat <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127739: fold_diff <-refresh_cpu_vm_stats
  kworker/4:1-88      [004] ...1.   298.127741: round_jiffies_relative <-vmstat_update
[..]

The above shows that setting "func-args" in the top level instance also
sets it in the instance "foo", but since the interface of the trace flags
is per instance, the update didn't take effect in the "foo" instance.

Update the infrastructure to allow tracers to add a "default_flags" field
in the tracer structure that can be set instead of "flags" which will make
the flags per instance. If a tracer needs to keep the flags global (like
blktrace), keeping the "flags" field set will keep the old behavior.

This does not update function or the function graph tracers. That will be
handled later.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251111232429.305317942@kernel.org
Fixes: f20a580627f43 ("ftrace: Allow instances to use function tracing")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-12 09:59:54 -05:00
Menglong Dong
cd06078a38 tracing: fprobe: use ftrace if CONFIG_DYNAMIC_FTRACE_WITH_ARGS
For now, we will use ftrace for the fprobe if fp->exit_handler does not
exist and CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled.

However, CONFIG_DYNAMIC_FTRACE_WITH_REGS is not supported by some
architectures, such as arm. What we need in the fprobe is the function
arguments, so we can use ftrace for the fprobe if
CONFIG_DYNAMIC_FTRACE_WITH_ARGS is enabled.

Therefore, use ftrace if either CONFIG_DYNAMIC_FTRACE_WITH_REGS or
CONFIG_DYNAMIC_FTRACE_WITH_ARGS is enabled.

Link: https://lore.kernel.org/all/20251103063434.47388-1-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-11 22:32:10 +09:00
Menglong Dong
08ed5c81f6 lib/test_fprobe: add testcase for mixed fprobe
Add a testcase for the fprobe which hooks the same target with two
fprobes: entry-only and entry+exit. The two fprobes are registered in
different orders.

fgraph and ftrace are both used for the fprobe, and this testcase covers
the mixed situation.

Link: https://lore.kernel.org/all/20251015083238.2374294-3-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-11 22:32:09 +09:00
Menglong Dong
2c67dc457b tracing: fprobe: optimization for entry only case
For now, fgraph is used for the fprobe even if we only need to trace the
entry. However, the performance of ftrace is better than fgraph, and we
can use ftrace_ops for this case.

The performance of kprobe-multi then increases from 54M/s to 69M/s. Before
this commit:

  $ ./benchs/run_bench_trigger.sh kprobe-multi
  kprobe-multi   :   54.663 ± 0.493M/s

After this commit:

  $ ./benchs/run_bench_trigger.sh kprobe-multi
  kprobe-multi   :   69.447 ± 0.143M/s

Mitigations are disabled during the bench testing above.

Link: https://lore.kernel.org/all/20251015083238.2374294-2-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-11 22:32:09 +09:00
Masami Hiramatsu (Google)
e667152e00 tracing: fprobe: Fix to init fprobe_ip_table earlier
Since the fprobe_ip_table is used from module unloading in
the failure path of load_module(), it must be initialized
earlier than late_initcall(). Otherwise, the
fprobe_module_callback() will use an uninitialized spinlock of
fprobe_ip_table.

Initialize fprobe_ip_table in core_initcall, which is the same
timing as ftrace.
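
The shape of the fix is simply registering the initializer at an earlier
initcall level (hypothetical function name, not the fprobe code):

  #include <linux/init.h>

  static int __init fprobe_table_init_sketch(void)
  {
  	/* initialize the table and its lock */
  	return 0;
  }
  core_initcall(fprobe_table_init_sketch);	/* was late_initcall() */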

Link: https://lore.kernel.org/all/175939434403.3665022.13030530757238556332.stgit@mhiramat.tok.corp.google.com/

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202509301440.be4b3631-lkp@intel.com
Fixes: e5a4cc28a052 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Menglong Dong <menglong8.dong@gmail.com>
2025-11-11 22:32:09 +09:00
Thomas Weißschuh
69d8895cb9 rv: Add explicit lockdep context for reactors
Reactors can be called from any context through tracepoints.
When developing reactors, care needs to be taken to only call APIs which
are safe. As the tracepoints used during testing may not actually be
called from restrictive contexts, lockdep may not be helpful.

Add explicit overrides to help lockdep find invalid code patterns.

The usage of LD_WAIT_FREE will trigger lockdep warnings in the panic
reactor. These are indeed valid warnings but they are out of scope for
RV and will instead be fixed by the printk subsystem.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Acked-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-3-0b9e51919ea8@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-11-11 13:18:56 +01:00
Thomas Weißschuh
68f63cea46 rv: Make rv_reacting_on() static
There are no external users left.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-2-0b9e51919ea8@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-11-11 13:18:56 +01:00
Thomas Weißschuh
4f739ed19d rv: Pass va_list to reactors
The only thing the reactors can do with the passed in varargs is to
convert it into a va_list. Do that in a central helper instead.
It simplifies the reactors, removes some hairy macro-generated code
and introduces a convenient hook point to modify reactor behavior.
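
The rough shape of the change (illustrative names, not the RV code):

  #include <linux/stdarg.h>
  #include <linux/printk.h>

  /* A reactor now receives a ready va_list ... */
  static void printk_reactor_sketch(const char *fmt, va_list args)
  {
  	vprintk(fmt, args);
  }

  /* ... because the core does va_start()/va_end() once, centrally. */
  static void rv_react_sketch(const char *fmt, ...)
  {
  	va_list args;

  	va_start(args, fmt);
  	printk_reactor_sketch(fmt, args);
  	va_end(args);
  }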

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-1-0b9e51919ea8@linutronix.de
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-11-11 13:18:55 +01:00
Gabriele Monaco
0c0cd931a0 selftests/verification: Add initial RV tests
Add a series of tests to validate the RV tracefs API and basic
functionality.

* available monitors:
    Check that all monitors (from the monitors folder) appear as
    available and have a description. Works with nested monitors.

* enable/disable:
    Enable and disable all monitors and validate both the enabled file
    and the enabled_monitors. Check that enabling container monitors
    enables all nested monitors.

* reactors:
    Set all reactors and validate the setting, also for nested monitors.

* wwnr with printk:
    wwnr is broken on purpose, run it with a load and check that the
    printk reactor works. Also validate disabling reacting_on or
    monitoring_on prevents reactions.

These tests use the ftracetest suite.

Acked-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20251017115203.140080-3-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-11-11 13:18:55 +01:00
Gabriele Monaco
a0aa283c53 selftest/ftrace: Generalise ftracetest to use with RV
The ftracetest script is a fairly complete test framework for tracefs-like
subsystems, but it can only be used for ftrace selftests.

If OPT_TEST_DIR is provided and includes a function file, use that as the
test directory going forward rather than just grabbing tests from it.

Generalise function names like initialize_ftrace to initialize_system.

Add the --rv argument to set up the test for rv, basically changing the
trace directory to $TRACING_DIR/rv and displaying an error if that
cannot be found.

This prepares for rv selftests inclusion.

Link: https://lore.kernel.org/r/20251017115203.140080-2-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
2025-11-11 12:40:15 +01:00
Masami Hiramatsu (Google)
7157062bb4 tracing: Report wrong dynamic event command
Report wrong dynamic event type in the command via error_log.
-----
 # echo "z hoge" > /sys/kernel/tracing/dynamic_events
 sh: write error: Invalid argument
 # cat /sys/kernel/tracing/error_log
 [   22.977022] dynevent: error: No matching dynamic event type
   Command: z hoge
            ^
-----

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/176278970056.343441.10528135217342926645.stgit@devnote2
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 19:26:14 -05:00
Steven Rostedt
3a0d5bc76f tracing: Use switch statement instead of ifs in set_tracer_flag()
The "mask" passed in to set_trace_flag() has a single bit set. The
function then checks if the mask is equal to one of the option masks and
performs the appropriate function associated to that option.

Instead of having a bunch of "if ()" statement, use a "switch ()"
statement instead to make it cleaner and a bit more optimal.

No function changes.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251106003501.890298562@kernel.org
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 14:33:51 -05:00
Steven Rostedt
9c5053083e tracing: Exit out immediately after update_marker_trace()
The call to update_marker_trace() in set_tracer_flag() performs the update
to the tr->trace_flags. There's no reason to perform it again after it is
called. Return immediately instead.

Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251106003501.726406870@kernel.org
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 14:33:43 -05:00
Steven Rostedt
5aa0d18df0 tracing: Have add_tracer_options() error pass up to callers
The function add_tracer_options() can fail, but currently its return value
is ignored. Pass the status of add_tracer_options() up when adding a new
tracer as well as when an instance is created. Have the instance creation
fail if add_tracer_options() fails.

Only print a warning for the top level instance, like it does with other
failures.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251105161935.375299297@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 14:28:31 -05:00
Steven Rostedt
c7bed15ccf tracing: Remove dummy options and flags
When a tracer does not define its own flags, dummy options and flags are
used so that the values are always valid. There are not that many locations
that reference these values, so having dummy versions just complicates the
code. Remove the dummy values and just check for NULL when appropriate.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://patch.msgid.link/20251105161935.206093132@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 14:27:04 -05:00
Steven Rostedt
a10e6e6818 tracing: Hide __NR_utimensat and _NR_mq_timedsend when not defined
Some architectures (riscv-32) do not define __NR_utimensat and
__NR_mq_timedsend, and the build fails when they are used.

Hide them in "ifdef"s.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251104205310.00a1db9a@batman.local.home
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511031239.ZigDcWzY-lkp@intel.com/
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-10 14:23:53 -05:00
Steven Rostedt
2f294c35c0 Merge branch 'topic/func-profiler-offset' of git://git.kernel.org/pub/scm/linux/kernel/git/mhiramat/linux into trace/trace/core
Updates to the function profiler adds new options to tracefs. The options
are currently defined by an enum as flags. The added options brings the
number of options over 32, which means they can no longer be held in a 32
bit enum. The TRACE_ITER_* flags are converted to a macro TRACE_ITER(*) to
allow the creation of options to still be done by macros.

This change is intrusive, as it affects all TRACE_ITER* options throughout
the trace code. Merge the branch that added these options and converted
the TRACE_ITER_* enum into a TRACE_ITER(*) macro, to allow the topic
branches to still be developed without conflict.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-11-04 10:12:32 -05:00
Masami Hiramatsu (Google)
1149fcf759 tracing: Add an option to show symbols in _text+offset for function profiler
The function profiler shows the hit count of each function using its
symbol name. However, there are some same-name local symbols which we
cannot distinguish.
To solve this issue, this introduces an option to show the symbols
in "_text+OFFSET" format. This also avoids exposing the random shift of
KASLR. The functions in modules are shown as "MODNAME+OFFSET", where the
offset is from ".text".

E.g. for kernel text symbols, pass vmlinux and the output to addr2line
to find the actual function and source info;

  $ addr2line -fie vmlinux _text+3078208
  __balance_callbacks
  kernel/sched/core.c:5064

for modules, specify the module file and .text+OFFSET;

  $ addr2line -fie samples/trace_events/trace-events-sample.ko .text+8224
  do_simple_thread_func
  samples/trace_events/trace-events-sample.c:23

Link: https://lore.kernel.org/all/176187878064.994619.8878296550240416558.stgit@devnote2/

Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-04 21:44:18 +09:00
Masami Hiramatsu (Google)
bbec8e28ca tracing: Allow tracer to add more than 32 options
Since enum trace_iterator_flags is 32bit, the max number of
option flags is limited to 32, and it is fully used now. To add
a new option, we need to expand it.

So replace TRACE_ITER_##flag with the TRACE_ITER(flag) macro, which
uses a 64bit bitmask.
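
A plausible shape of such a macro (an assumption for illustration; the
actual definition may differ):

  /* Options are addressed by bit number and tested against a 64-bit
   * mask, so more than 32 of them can exist. */
  #define TRACE_ITER(opt)	(1ULL << TRACE_ITER_##opt##_BIT)

  /* usage keeps the same pattern as before:
   *	if (tr->trace_flags & TRACE_ITER(PRINTK))
   *		...
   */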

Link: https://lore.kernel.org/all/176187877103.994619.166076000668757232.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-04 21:44:00 +09:00
Masami Hiramatsu (Google)
90e69d291d tracing: fprobe: Remove unused local variable
The 'ret' local variable in fprobe_remove_node_in_module() was used
for checking the error state in the loop, but commit dfe0d675df82
("tracing: fprobe: use rhltable for fprobe_ip_table") removed the loop.
So we don't need it anymore.

Link: https://lore.kernel.org/all/175867358989.600222.6175459620045800878.stgit@devnote2/

Fixes: e5a4cc28a052 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Menglong Dong <menglong8.dong@gmail.com>
2025-11-01 01:10:29 +09:00
Thorsten Blum
cbe1e1241a tracing: probes: Replace strcpy() with memcpy() in __trace_probe_log_err()
strcpy() is deprecated; use memcpy() instead.
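
The replacement pattern, in general (illustrative, not the
__trace_probe_log_err() code):

  #include <linux/string.h>

  static void copy_command(char *dst, const char *src)
  {
  	size_t len = strlen(src) + 1;	/* include the terminating NUL */

  	memcpy(dst, src, len);		/* was: strcpy(dst, src) */
  }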

Link: https://lore.kernel.org/all/20250820214717.778243-3-thorsten.blum@linux.dev/

Link: https://github.com/KSPP/linux/issues/88
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:29 +09:00
Menglong Dong
ceb5d8d367 tracing: fprobe: fix suspicious rcu usage in fprobe_entry
rcu_read_lock() is not needed in fprobe_entry, but rcu_dereference_check()
is used in rhltable_lookup(), which causes a suspicious RCU usage warning:

  WARNING: suspicious RCU usage
  6.17.0-rc1-00001-gdfe0d675df82 #1 Tainted: G S
  -----------------------------
  include/linux/rhashtable.h:602 suspicious rcu_dereference_check() usage!
  ......
  stack backtrace:
  CPU: 1 UID: 0 PID: 4652 Comm: ftracetest Tainted: G S
  Tainted: [S]=CPU_OUT_OF_SPEC, [I]=FIRMWARE_WORKAROUND
  Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.1.1 10/07/2015
  Call Trace:
   <TASK>
   dump_stack_lvl+0x7c/0x90
   lockdep_rcu_suspicious+0x14f/0x1c0
   __rhashtable_lookup+0x1e0/0x260
   ? __pfx_kernel_clone+0x10/0x10
   fprobe_entry+0x9a/0x450
   ? __lock_acquire+0x6b0/0xca0
   ? find_held_lock+0x2b/0x80
   ? __pfx_fprobe_entry+0x10/0x10
   ? __pfx_kernel_clone+0x10/0x10
   ? lock_acquire+0x14c/0x2d0
   ? __might_fault+0x74/0xc0
   function_graph_enter_regs+0x2a0/0x550
   ? __do_sys_clone+0xb5/0x100
   ? __pfx_function_graph_enter_regs+0x10/0x10
   ? _copy_to_user+0x58/0x70
   ? __pfx_kernel_clone+0x10/0x10
   ? __x64_sys_rt_sigprocmask+0x114/0x180
   ? __pfx___x64_sys_rt_sigprocmask+0x10/0x10
   ? __pfx_kernel_clone+0x10/0x10
   ftrace_graph_func+0x87/0xb0

As we discussed in [1], fix this by using guard(rcu)() in fprobe_entry()
to protect the rhltable_lookup() and rhl_for_each_entry_rcu() with
rcu_read_lock and suppress this warning.
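
The shape of the fix (illustrative body): guard(rcu)() takes
rcu_read_lock() and drops it automatically when the scope ends, covering
the rhltable lookup and iteration.

  #include <linux/cleanup.h>
  #include <linux/rcupdate.h>

  static void fprobe_lookup_sketch(unsigned long ip)
  {
  	guard(rcu)();		/* rcu_read_lock() until end of scope */

  	/* ... rhltable_lookup() / rhl_for_each_entry_rcu() ... */
  }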

Link: https://lore.kernel.org/all/20250904062729.151931-1-dongml2@chinatelecom.cn/

Link: https://lore.kernel.org/all/20250829021436.19982-1-dongml2@chinatelecom.cn/ [1]
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202508281655.54c87330-lkp@intel.com
Fixes: dfe0d675df82 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:29 +09:00
Masami Hiramatsu (Google)
84404ce71a tracing: uprobe: eprobes: Allocate traceprobe_parse_context per probe
Since traceprobe_parse_context is reusable among a probe's arguments,
it is more efficient to allocate it outside of the loop that parses
probe arguments, as kprobe and fprobe events do.

Link: https://lore.kernel.org/all/175509541393.193596.16330324746701582114.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:29 +09:00
Masami Hiramatsu (Google)
8b658df206 tracing: uprobes: Cleanup __trace_uprobe_create() with __free()
Use __free() to clean up ugly gotos in __trace_uprobe_create().
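
The pattern, in general (illustrative, not the uprobe parsing code):

  #include <linux/cleanup.h>
  #include <linux/slab.h>

  /* The buffer is freed automatically on every return path, so the
   * goto-based error unwinding can go away. */
  static int parse_arg_sketch(const char *arg)
  {
  	char *buf __free(kfree) = kstrdup(arg, GFP_KERNEL);

  	if (!buf)
  		return -ENOMEM;
  	if (!*buf)
  		return -EINVAL;		/* buf freed here automatically */

  	return 0;			/* ... and here as well */
  }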

Link: https://lore.kernel.org/all/175509540482.193596.6541098946023873304.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:29 +09:00
Masami Hiramatsu (Google)
0d6edbc9a4 tracing: eprobe: Cleanup eprobe event using __free()
Use __free(trace_event_probe_cleanup) to remove unneeded gotos and
clean up the last part of trace_eprobe_parse_filter().

Link: https://lore.kernel.org/all/175509539571.193596.4674012182718751429.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:29 +09:00
Masami Hiramatsu (Google)
f959ecdfcb tracing: probes: Use __free() for trace_probe_log
Use __free() for trace_probe_log_clear() to clean up the error log interface.

Link: https://lore.kernel.org/all/175509538609.193596.16646724647358218778.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:28 +09:00
Menglong Dong
0de4c70d04 tracing: fprobe: use rhltable for fprobe_ip_table
For now, all the kernel functions that are hooked by the fprobe are
added to the hash table "fprobe_ip_table". Its key is the function
address, and its value is a "struct fprobe_hlist_node".

The bucket count of the hash table is FPROBE_IP_TABLE_SIZE, which is 256.
This means the overhead of the hash table lookup grows linearly once the
number of functions in the fprobe exceeds 256. When we try to hook all
the kernel functions, the overhead will be huge.

Therefore, replace the hash table with an rhltable to reduce the
overhead.

Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/

Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
2025-11-01 01:10:28 +09:00
Randy Dunlap
74a7b4f183 sysctl: fix kernel-doc format warning
Describe the "type" struct member using '@type' and move it together
with the rest of the doc for ctl_table_header to avoid a kernel-doc
warning:

Warning: include/linux/sysctl.h:178 Incorrect use of kernel-doc format:
 * enum type - Enumeration to differentiate between ctl target types

Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-10-30 13:23:09 +01:00
Bartosz Golaszewski
b21f90e2e4 scripts: add tracepoint-update to the list of ignores files
The new program for removing unused tracepoints is not ignored as it
should be. Add it to the local .gitignore.

Cc: Vladimir Oltean <vladimir.oltean@nxp.com>
Cc: Jacob Keller <jacob.e.keller@intel.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/20251029120709.24669-1-brgl@bgdev.pl
Fixes: e30f8e61e251 ("tracing: Add a tracepoint verification check at build time")
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-29 08:46:05 -04:00
Steven Rostedt
25bd47a592 tracing: Have persistent ring buffer print syscalls normally
The persistent ring buffer from a previous boot has to be careful printing
events as the print formats of random events can have pointers to strings
and such that are not available.

Ftrace static events (like the function tracer event) are stable and are
printed normally.

System call event formats are also stable. Allow them to be printed
normally as well:

Instead of:

  <...>-1       [005] ...1.    57.240405: sys_enter_waitid: __syscall_nr=0xf7 (247) which=0x1 (1) upid=0x499 (1177) infop=0x7ffd5294d690 (140725988939408) options=0x5 (5) ru=0x0 (0)
  <...>-1       [005] ...1.    57.240433: sys_exit_waitid: __syscall_nr=0xf7 (247) ret=0x0 (0)
  <...>-1       [005] ...1.    57.240437: sys_enter_rt_sigprocmask: __syscall_nr=0xe (14) how=0x2 (2) nset=0x7ffd5294d7c0 (140725988939712) oset=0x0 (0) sigsetsize=0x8 (8)
  <...>-1       [005] ...1.    57.240438: sys_exit_rt_sigprocmask: __syscall_nr=0xe (14) ret=0x0 (0)
  <...>-1       [005] ...1.    57.240442: sys_enter_close: __syscall_nr=0x3 (3) fd=0x4 (4)
  <...>-1       [005] ...1.    57.240463: sys_exit_close: __syscall_nr=0x3 (3) ret=0x0 (0)
  <...>-1       [005] ...1.    57.240485: sys_enter_openat: __syscall_nr=0x101 (257) dfd=0xffffffffffdfff9c (-2097252) filename=(0xffff8b81639ca01c) flags=0x80000 (524288) mode=0x0 (0) __filename_val=/run/systemd/reboot-param
  <...>-1       [005] ...1.    57.240555: sys_exit_openat: __syscall_nr=0x101 (257) ret=0xffffffffffdffffe (-2097154)
  <...>-1       [005] ...1.    57.240571: sys_enter_openat: __syscall_nr=0x101 (257) dfd=0xffffffffffdfff9c (-2097252) filename=(0xffff8b81639ca01c) flags=0x80000 (524288) mode=0x0 (0) __filename_val=/run/systemd/reboot-param
  <...>-1       [005] ...1.    57.240620: sys_exit_openat: __syscall_nr=0x101 (257) ret=0xffffffffffdffffe (-2097154)
  <...>-1       [005] ...1.    57.240629: sys_enter_writev: __syscall_nr=0x14 (20) fd=0x3 (3) vec=0x7ffd5294ce50 (140725988937296) vlen=0x7 (7)
  <...>-1       [005] ...1.    57.242281: sys_exit_writev: __syscall_nr=0x14 (20) ret=0x24 (36)
  <...>-1       [005] ...1.    57.242286: sys_enter_reboot: __syscall_nr=0xa9 (169) magic1=0xfee1dead (4276215469) magic2=0x28121969 (672274793) cmd=0x1234567 (19088743) arg=0x0 (0)

Have:

  <...>-1       [000] ...1.    91.446011: sys_waitid(which: 1, upid: 0x4d2, infop: 0x7ffdccdadfd0, options: 5, ru: 0)
  <...>-1       [000] ...1.    91.446042: sys_waitid -> 0x0
  <...>-1       [000] ...1.    91.446045: sys_rt_sigprocmask(how: 2, nset: 0x7ffdccdae100, oset: 0, sigsetsize: 8)
  <...>-1       [000] ...1.    91.446047: sys_rt_sigprocmask -> 0x0
  <...>-1       [000] ...1.    91.446051: sys_close(fd: 4)
  <...>-1       [000] ...1.    91.446073: sys_close -> 0x0
  <...>-1       [000] ...1.    91.446095: sys_openat(dfd: 18446744073709551516, filename: 139732544945794 "/run/systemd/reboot-param", flags: O_RDONLY|O_CLOEXEC)
  <...>-1       [000] ...1.    91.446165: sys_openat -> 0xfffffffffffffffe
  <...>-1       [000] ...1.    91.446182: sys_openat(dfd: 18446744073709551516, filename: 139732544945794 "/run/systemd/reboot-param", flags: O_RDONLY|O_CLOEXEC)
  <...>-1       [000] ...1.    91.446233: sys_openat -> 0xfffffffffffffffe
  <...>-1       [000] ...1.    91.446242: sys_writev(fd: 3, vec: 0x7ffdccdad790, vlen: 7)
  <...>-1       [000] ...1.    91.447877: sys_writev -> 0x24
  <...>-1       [000] ...1.    91.447883: sys_reboot(magic1: 0xfee1dead, magic2: 0x28121969, cmd: 0x1234567, arg: 0)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231149.097404581@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:11:00 -04:00
Steven Rostedt
b6e5d971fc tracing: Check for printable characters when printing field dyn strings
When the "fields" option is enabled, it prints each trace event field
based on its type. But a dynamic array and a dynamic string can both have
a "char *" type. Printing it as a string can cause escape characters to be
printed and mess up the output of the trace.

For dynamic strings, test if there are any non-printable characters, and
if so, print both the string with the non-printable characters shown as
'.', and then print the hex values of the array.
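
As a rough user-space illustration of that check (show_dyn_string() is a
made-up helper, not the kernel code):

  #include <ctype.h>
  #include <stdio.h>

  /* If every byte is printable, show the string as-is; otherwise show it
   * with '.' placeholders followed by the raw hex values of the array. */
  static void show_dyn_string(const unsigned char *s, size_t len)
  {
      size_t i;
      int clean = 1;

      for (i = 0; i < len; i++)
          if (!isprint(s[i]))
              clean = 0;

      if (clean) {
          printf("\"%.*s\"", (int)len, s);
          return;
      }

      for (i = 0; i < len; i++)
          putchar(isprint(s[i]) ? s[i] : '.');
      printf(" (");
      for (i = 0; i < len; i++)
          printf("%s%02x", i ? ":" : "", s[i]);
      printf(")");
  }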

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.929243047@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:59 -04:00
Steven Rostedt
64b627c8da tracing: Add parsing of flags to the sys_enter_openat trace event
Add some logic to give the openat system call trace event a bit more human
readable information:

   syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7f0053dc121c "/etc/ld.so.cache", flags: O_RDONLY|O_CLOEXEC, mode: 0000

The above is output from "perf script" and now shows the flags used by the
openat system call.

Since the output from tracing is generated in the kernel, it can also omit
the mode field when it is not used (when flags does not contain
O_CREAT|O_TMPFILE).

   touch-1185    [002] ...1.  1291.690154: sys_openat(dfd: 4294967196, filename: 139785545139344 "/usr/lib/locale/locale-archive", flags: O_RDONLY|O_CLOEXEC)
   touch-1185    [002] ...1.  1291.690504: sys_openat(dfd: 18446744073709551516, filename: 140733603151330 "/tmp/x", flags: O_WRONLY|O_CREAT|O_NOCTTY|O_NONBLOCK, mode: 0666)

As system calls have a fixed ABI, their trace events can be extended. This
currently only updates the openat system call, but others may be extended
in the future.
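
For illustration, the mode-dropping logic boils down to something like the
following user-space sketch (show_openat() is a made-up helper, not the
kernel code, and the flag check is simplified):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>

  /* Only print the mode argument when the flags say the kernel will
   * actually consult it (O_CREAT or O_TMPFILE). */
  static void show_openat(int dfd, const char *filename, int flags, int mode)
  {
      printf("sys_openat(dfd: %d, filename: \"%s\", flags: %#x",
             dfd, filename, flags);
      if (flags & (O_CREAT | O_TMPFILE))
          printf(", mode: %04o", mode);
      printf(")\n");
  }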

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.763161484@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:59 -04:00
Steven Rostedt
32e0f607ac tracing: Add trace_seq_pop() and seq_buf_pop()
In order to allow an interface to remove an added character from the
trace_seq and seq_buf descriptors, add helper functions trace_seq_pop()
and seq_buf_pop().
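
The kernel structures are not shown here; as a rough illustration, on a
simple length-tracked buffer a "pop" helper is little more than:

  #include <stddef.h>

  /* Illustrative stand-in for trace_seq/seq_buf: only the length matters. */
  struct sketch_buf {
      char   *buffer;
      size_t  len;        /* number of bytes written so far */
  };

  /* Drop the most recently added character, if any. */
  static void sketch_buf_pop(struct sketch_buf *s)
  {
      if (s->len)
          s->len--;
  }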

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.594898736@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:59 -04:00
Steven Rostedt
e77ad6da90 tracing: Show printable characters in syscall arrays
When displaying the contents of the user space data passed to the kernel,
instead of just showing the array values, also print any printable
content.

Instead of just:

  bash-1113    [003] .....  3433.290654: sys_write(fd: 2, buf: 0x555a8deeddb0 (72:6f:6f:74:40:64:65:62:69:61:6e:2d:78:38:36:2d:36:34:3a:7e:23:20), count: 0x16)

Display:

  bash-1113    [003] .....  3433.290654: sys_write(fd: 2, buf: 0x555a8deeddb0 (72:6f:6f:74:40:64:65:62:69:61:6e:2d:78:38:36:2d:36:34:3a:7e:23:20) "root@debian-x86-64:~# ", count: 0x16)

This only affects tracing and does not affect perf, as this only updates
the output from the kernel; perf formats its output in user space. This
may change with an update to libtraceevent, which would then allow perf to
have this as well.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.429422865@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:59 -04:00
Steven Rostedt
299ea67e6a tracing: Add a config and syscall_user_buf_size file to limit amount written
When a system call event copies data from user space addresses into the
ring buffer, it can copy up to 511 bytes of data. This can waste precious
ring buffer space if the user isn't interested in the output. Add a new
file "syscall_user_buf_size" that gets initialized to a new config
CONFIG_SYSCALL_BUF_SIZE_DEFAULT that defaults to 63.

The config is also used to limit how much perf can read from user space.

Also lower the max down to 165, as the point isn't to record everything
that a system call may be passing through to the kernel; 165 is more than
enough.

The reason for 165 is that adding one byte for the nul terminator, as well
as possibly needing to append the "..." string, turns it into 170 bytes.
As this needs to save up to 3 arguments, and 3 * 170 is 510, it fits
nicely in 512 bytes (a power of 2).
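
Restating that arithmetic as a compile-time check (the macro names below
are made up for illustration, not the kernel's):

  /* 165 bytes of data, plus a nul terminator and room for an appended
   * "..." marker, gives 170 bytes per argument; three such arguments
   * still fit in a 512-byte (power of two) record. */
  #define SYSCALL_USER_BUF_MAX    165
  #define SYSCALL_USER_ARG_ROOM   170
  #define SYSCALL_USER_MAX_ARGS   3

  _Static_assert(SYSCALL_USER_MAX_ARGS * SYSCALL_USER_ARG_ROOM <= 512,
                 "three user-copied arguments must fit in 512 bytes");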

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.260068913@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:59 -04:00
Steven Rostedt
baa031b7bd tracing: Allow syscall trace events to read more than one user parameter
Allow more than one field of a syscall trace event to read user space.
Build on top of the user_mask by allowing more than one bit to be set that
corresponds to the @args array of the syscall metadata. Each argument in
the @args array that is to be read will have a dynamic array/string field
associated with it.

Note that reading multiple fields from user space is not supported if the
user_arg_size field is set in the syscall metadata. That field can only be
used when a single field is being read from user space, as it is a number
identifying the field of the syscall event that holds the size of the data
to read from user space. It becomes ambiguous if the system call reads
more than one field. Currently this is not an issue.

If a syscall event happens to enable two fields to read user space and
also sets the user_arg_size field, it will trigger a warning at boot and
the user_arg_size field will be cleared.

The per CPU buffer that is used to read the user space addresses is now
broken up into 3 sections, each of 168 bytes. The reason for 168 is that
it is the biggest portion of 512 bytes divided by 3 that is 8-byte
aligned.

The max amount copied into the ring buffer from user space is now only 128
bytes, which is plenty. When reading user space, it still reads 167
(168-1) bytes and uses the remainder to know whether it should append the
extra "..." to the end or not.

This will allow the event to look like this:

  sys_renameat2(olddfd: 0xffffff9c, oldname: 0x7ffe02facdff "/tmp/x", newdfd: 0xffffff9c, newname: 0x7ffe02face06 "/tmp/y", flags: 1)
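
The 168-byte figure can be restated as a small compile-time computation
(the names below are illustrative only):

  /* Largest 8-byte-aligned share of a 512-byte buffer split three ways:
   * 512 / 3 = 170, rounded down to an 8-byte boundary = 168. */
  #define USER_BUF_SIZE   512
  #define NR_SECTIONS     3
  #define SECTION_SIZE    ((USER_BUF_SIZE / NR_SECTIONS) & ~7)

  _Static_assert(SECTION_SIZE == 168, "each section is 168 bytes");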

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231148.095789277@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
011ea0501d tracing: Display some syscall arrays as strings
Some of the system call arguments that read a fixed length of memory from
a user space address are not arrays but strings. Take a bit away from the
nb_args field in the syscall metadata to use as a flag denoting that the
data sized by the system call's user_arg_size is a string. The nb_args
value should never be more than 6, so 7 bits is plenty to hold that
number. The new user_arg_is_str flag, when set, will display the data from
the user space address as a string and not an array.

This will allow the output to look like this:

  sys_sethostname(name: 0x5584310eb2a0 "debian", len: 6)
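
The bit-stealing can be pictured with a bitfield layout roughly like the
following (illustrative only; the real struct syscall_metadata layout may
differ):

  /* 7 bits comfortably hold nb_args (at most 6); the stolen bit flags
   * whether the size-bounded user argument should be shown as a string. */
  struct syscall_meta_sketch {
      unsigned char nb_args:7;
      unsigned char user_arg_is_str:1;
  };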

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.930550359@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
b4f7624cfc tracing: Have system call events record user array data
For system call events that have a length field, add a "user_arg_size"
parameter to the system call metadata that denotes the index of the args
array holding the size of the arg that the user_mask field has a bit set
for.

The "user_mask" has a bit set that denotes the arg that points to an array
in the user space address space. If a system call event has both the
user_mask field and the user_arg_size set, it will then record the content
of that address into the trace event, up to the size defined by
SYSCALL_FAULT_BUF_SZ - 1.

This allows the output to look like:

  sys_write(fd: 0xa, buf: 0x5646978d13c0 (01:00:05:00:00:00:00:00:01:87:55:89:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00), count: 0x20)
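
A sketch of how the two metadata fields combine when deciding how much to
copy (hypothetical names and user-space types, not the kernel
implementation):

  #include <stddef.h>

  struct sc_meta_sketch {
      unsigned int user_mask;     /* bit N set: args[N] is a user pointer */
      int          user_arg_size; /* index of the arg that holds the size */
  };

  /* Clamp the user-supplied length to the per-event buffer, mirroring the
   * "up to SYSCALL_FAULT_BUF_SZ - 1" rule described above. */
  static size_t user_copy_len(const struct sc_meta_sketch *m,
                              const unsigned long *args, size_t buf_sz)
  {
      size_t len = args[m->user_arg_size];

      return len < buf_sz - 1 ? len : buf_sz - 1;
  }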

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.763528474@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
2e82e256df perf: tracing: Have perf system calls read user space
Allow some of the system call events to read user space buffers. Instead
of just showing the pointer into user space, allow perf events to also
record the content of those pointers. For example:

  # perf record -e syscalls:sys_enter_openat ls /usr/bin
  [..]
  # perf script
      ls    1024 [005]    52.902721: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbae321c "/etc/ld.so.cache", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.902899: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae140 "/lib/x86_64-linux-gnu/libselinux.so.1", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.903471: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaae690 "/lib/x86_64-linux-gnu/libcap.so.2", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.903946: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaebe0 "/lib/x86_64-linux-gnu/libc.so.6", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.904629: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dbaaf110 "/lib/x86_64-linux-gnu/libpcre2-8.so.0", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.906985: syscalls:sys_enter_openat: dfd: 0xffffffffffffff9c, filename: 0x7fc1dba92904 "/proc/filesystems", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.907323: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x7fc1dba19490 "/usr/lib/locale/locale-archive", flags: 0x00080000, mode: 0x00000000
      ls    1024 [005]    52.907746: syscalls:sys_enter_openat: dfd: 0xffffff9c, filename: 0x556fb888dcd0 "/usr/bin", flags: 0x00090800, mode: 0x00000000

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.593925979@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
bd1b80fba7 perf: tracing: Simplify perf_sysenter_enable/disable() with guards
Use guard(mutex)(&syscall_trace_lock) for perf_sysenter_enable() and
perf_sysenter_disable() as well as for the perf_sysexit_enable() and
perf_sysexit_disable(). This will make it easier to update these functions
with other code that has early exit handling.
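
Schematically, this swaps explicit lock/unlock pairs for the scope-based
guard from linux/cleanup.h (a sketch, not the actual function bodies):

  /* Before: every early return path needs a matching mutex_unlock() */
  mutex_lock(&syscall_trace_lock);
  /* ... */
  mutex_unlock(&syscall_trace_lock);

  /* After: the lock is dropped automatically when the scope ends */
  guard(mutex)(&syscall_trace_lock);
  /* ... early exits need no explicit unlock ... */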

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.429583335@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
a544d9a66b tracing: Have syscall trace events read user space string
As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()")
system call trace events allow faulting in user space memory. Have some of
the system call trace events take advantage of this.

Use the trace_user_fault_read() logic to read the buffer from user space,
and instead of just saving the pointer to the buffer in the system call
event, also save the string that is passed in.

The syscall event has its nb_args shortened from an int to a short (where
even a u8 is plenty big enough) and the two freed bytes are used for
"user_mask". The new "user_mask" field is used to store the index of the
"args" field array that has the address to read from user space. This
value is set to 0 if the system call event does not need to read user
space for a field. This mask can be used to know if the event may fault or
not. Only one bit set in user_mask is supported at this time.

This allows the output to look like this:

 sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4)
 sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820)

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.261867956@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:58 -04:00
Steven Rostedt
a9f1687264 tracing: Make trace_user_fault_read() exposed to rest of tracing
The write to the trace_marker file is a critical section where it cannot
take locks or allocate memory. To read from user space, it allocates a per
CPU buffer when the trace_marker file is opened, and then when the write
system call is performed, it uses the following method to read from user
space:

	preempt_disable();
	cpu = smp_processor_id();
	buffer = per_cpu_ptr(cpu_buffers, cpu);
	do {
		cnt = nr_context_switches_cpu(cpu);
		migrate_disable();
		preempt_enable();
		ret = copy_from_user(buffer, ptr, len);
		preempt_disable();
		migrate_enable();
	} while (!ret && cnt != nr_context_switches_cpu(cpu));
	if (!ret)
		ring_buffer_write(buffer);
	preempt_enable();

It records the number of context switches for the current CPU, enables
preemption, copies from user space, disables preemption and then checks if
the number of context switches changed. If it did not, then the buffer is
valid; otherwise the buffer may have been corrupted and the read from user
space must be tried again.

The system call trace events are now faultable and have the same
restrictions as the trace_marker write. For system calls to read the user
space buffer (for example to read the file of the openat system call), it
needs the same logic. Instead of copying the code over to the system call
trace events, make the code generic to allow the system call trace events to
use the same code. The following API is added internally to the tracing sub
system (these are only exposed within the tracing subsystem and not to be
used outside of it):

  trace_user_fault_init() - initializes a trace_user_buf_info descriptor
       that will allocate the per CPU buffers to copy from user space into.

  trace_user_fault_destroy() - used to free the allocations made by
       trace_user_fault_init().

  trace_user_fault_get() - update the ref count of the info descriptor to
       allow more than one user to use the same descriptor.

  trace_user_fault_put() - decrement the ref count.

  trace_user_fault_read() - performs the above action to read user space
      into the per CPU buffer. Preemption is expected to be disabled
      before calling this function and must remain disabled while the
      returned buffer is in use.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Takaya Saeki <takayas@google.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ian Rogers <irogers@google.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Link: https://lore.kernel.org/20251028231147.096570057@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-28 20:10:57 -04:00
Steven Rostedt
01ecf7af00 tracing: Add warnings for unused tracepoints for modules
If a module has TRACE_EVENT() but does not use it, add a warning about it
at build time.

Currently, the build must be made by adding "UT=1" to the make command
line in order for this to trigger.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004453.422000794@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-24 16:43:15 -04:00
Steven Rostedt
eec3516b25 tracing: Allow tracepoint-update.c to work with modules
In order for tracepoint-update.c to work with modules, it cannot error out
if both "__tracepoint_check" and "__tracepoints_strings" are not found.
When enabled, the vmlinux.o may be required to have both, but modules only
have these sections if they have tracepoints. Modules without tracepoints
will not have either. They should not fail to build because of that.

If one section exists, the other one should too. Note that if a module
defines a tracepoint but does not use it, that can cause this to fail.

Add a new "--module" parameter to tracepoint-update to be used when
running on module code. It will not error out if this is set and both
sections are missing. If this is set, and only the "__tracepoint_check"
section is missing, it means the module has defined tracepoints but none
of them are used. In that case, it prints a warning that the module has
only unused tracepoints and exits normally to not fail the build.

If the "__tracepoint_check" section exists but not the
"__tracepoint_strings", then that is an error and should fail the build.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004453.255696445@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-24 16:43:15 -04:00
Steven Rostedt
faf938153c tracepoint: Do not warn for unused event that is exported
There are a few generic events that may only be used by modules. They are
defined and then exported with EXPORT_TRACEPOINT*(). Mark events that are
exported as being used, even though they still waste memory in the kernel
proper.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004453.089254920@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-24 16:43:14 -04:00
Steven Rostedt
e30f8e61e2 tracing: Add a tracepoint verification check at build time
If a tracepoint is defined via DECLARE_TRACE() or TRACE_EVENT() but never
called (via the trace_<tracepoint>() function), its metadata is still
around in memory and not discarded.

When created via TRACE_EVENT() the situation is worse because
TRACE_EVENT() creates metadata that can be around 5k per trace event.
Having unused trace events wastes several thousand bytes.

Add a verifier that, at each call site, injects a string with the name of
the tracepoint it calls into the discarded section "__tracepoint_check".
For every builtin tracepoint that is used, its name (which is saved in the
in-memory section "__tracepoint_strings") will thus also appear in the
"__tracepoint_check" section.

Add a new program, called tracepoint-update, that is run at build time. It
is executed on vmlinux.o before the __tracepoint_check section is
discarded (the section is discarded before vmlinux is created). The
program builds an array of the strings in the __tracepoint_check section
and sorts it. Then it walks the strings in the __tracepoint_strings
section and does a binary search to check whether each name is in the
__tracepoint_check section. If it is not, the tracepoint is unused and a
warning is printed.
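
The check itself is ordinary sort-plus-binary-search over strings; a
condensed user-space illustration (not the tracepoint-update source):

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static int cmp_str(const void *a, const void *b)
  {
      return strcmp(*(const char * const *)a, *(const char * const *)b);
  }

  /* 'checked' holds names found in the check section, 'defined' the names
   * from the strings section; warn about every defined-but-unused name. */
  static void warn_unused(const char **checked, size_t nr_checked,
                          const char **defined, size_t nr_defined)
  {
      qsort(checked, nr_checked, sizeof(*checked), cmp_str);

      for (size_t i = 0; i < nr_defined; i++)
          if (!bsearch(&defined[i], checked, nr_checked,
                       sizeof(*checked), cmp_str))
              printf("warning: tracepoint '%s' is unused.\n", defined[i]);
  }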

Note, this currently only handles tracepoints that are builtin and not in
modules.

Enabling this currently with a given config produces:

warning: tracepoint 'sched_move_numa' is unused.
warning: tracepoint 'sched_stick_numa' is unused.
warning: tracepoint 'sched_swap_numa' is unused.
warning: tracepoint 'pelt_hw_tp' is unused.
warning: tracepoint 'pelt_irq_tp' is unused.
warning: tracepoint 'rcu_preempt_task' is unused.
warning: tracepoint 'rcu_unlock_preempted_task' is unused.
warning: tracepoint 'xdp_bulk_tx' is unused.
warning: tracepoint 'xdp_redirect_map' is unused.
warning: tracepoint 'xdp_redirect_map_err' is unused.
warning: tracepoint 'vma_mas_szero' is unused.
warning: tracepoint 'vma_store' is unused.
warning: tracepoint 'hugepage_set_pmd' is unused.
warning: tracepoint 'hugepage_set_pud' is unused.
warning: tracepoint 'hugepage_update_pmd' is unused.
warning: tracepoint 'hugepage_update_pud' is unused.
warning: tracepoint 'block_rq_remap' is unused.
warning: tracepoint 'xhci_dbc_handle_event' is unused.
warning: tracepoint 'xhci_dbc_handle_transfer' is unused.
warning: tracepoint 'xhci_dbc_gadget_ep_queue' is unused.
warning: tracepoint 'xhci_dbc_alloc_request' is unused.
warning: tracepoint 'xhci_dbc_free_request' is unused.
warning: tracepoint 'xhci_dbc_queue_request' is unused.
warning: tracepoint 'xhci_dbc_giveback_request' is unused.
warning: tracepoint 'tcp_ao_wrong_maclen' is unused.
warning: tracepoint 'tcp_ao_mismatch' is unused.
warning: tracepoint 'tcp_ao_key_not_found' is unused.
warning: tracepoint 'tcp_ao_rnext_request' is unused.
warning: tracepoint 'tcp_ao_synack_no_key' is unused.
warning: tracepoint 'tcp_ao_snd_sne_update' is unused.
warning: tracepoint 'tcp_ao_rcv_sne_update' is unused.

Some of the above are totally unused, but others are unused because their
"trace_" calls are inside config options; in that case the defined
tracepoints should also be inside those same configs. Others are
architecture specific but defined in generic code, where they should
either be moved to the architecture or be surrounded by #ifdef for the
architectures they are for.

This tool could be updated to process modules in the future.

I'd like to thank Mathieu Desnoyers for suggesting the use of strings
instead of pointers, as using pointers in vmlinux.o required handling
relocations, which would have meant implementing almost a full-featured
linker.

To enable this check, run the build with: make UT=1

Note: once all the existing unused tracepoints are removed from the build,
the "UT=1" requirement will be dropped and this check will always be
enabled when tracepoints are configured, warning on any new unused
tracepoints. The reason it isn't always enabled now is that it would
introduce a lot of warnings for the currently unused tracepoints, and all
bisects for those warnings would end at this commit.

Link: https://lore.kernel.org/all/20250528114549.4d8a5e03@gandalf.local.home/

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004452.920728129@kernel.org
Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> # for using strings instead of pointers
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-24 16:43:14 -04:00
Steven Rostedt
b055f4c431 sorttable: Move ELF parsing into scripts/elf-parse.[ch]
In order to share the ELF parsing code that is in sorttable.c so that
other programs can use it, move it into elf-parse.c and elf-parse.h.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas.schier@linux.dev>
Cc: Nick Desaulniers <nick.desaulniers+lkml@gmail.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: https://lore.kernel.org/20251022004452.752298788@kernel.org
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-10-24 16:43:14 -04:00
Joel Granados
18c4e02884 watchdog: move nmi_watchdog sysctl into .rodata
Move nmi_watchdog into the watchdog_sysctls array to protect it from
unnecessary modification. This effectively places it inside the .rodata
section.

It was initially moved out into its own non-const array in commit
9ec272c586b0 ("watchdog/hardlockup: keep kernel.nmi_watchdog sysctl as
0444 if probe fails"), which made it writable only when
watchdog_hardlockup_available was true. Moving it back to
watchdog_sysctls keeps this behavior, as writing to nmi_watchdog still
fails when watchdog_hardlockup_available is false.

Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Joel Granados <joel.granados@kernel.org>
2025-10-24 15:35:11 +02:00
Tzung-Bi Shih
b692553573 pstore/ram: Update module parameters from platform data
Update module parameters `mem_type` and `ramoops_ecc` from platform data
so that they are available through /sys/module/ramoops/parameters/.

`ramoops_dump_oops` isn't included as it has been deprecated.

Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Link: https://patch.msgid.link/20251023143755.26204-1-tzungbi@kernel.org
Signed-off-by: Kees Cook <kees@kernel.org>
2025-10-23 08:56:04 -07:00
330 changed files with 16354 additions and 7843 deletions

View File

@ -140,8 +140,8 @@ ForEachMacros:
- 'damon_for_each_scheme_safe'
- 'damon_for_each_target'
- 'damon_for_each_target_safe'
- 'damos_for_each_filter'
- 'damos_for_each_filter_safe'
- 'damos_for_each_core_filter'
- 'damos_for_each_core_filter_safe'
- 'damos_for_each_ops_filter'
- 'damos_for_each_ops_filter_safe'
- 'damos_for_each_quota_goal'

View File

@ -164,6 +164,13 @@ Description: Writing to and reading from this file sets and gets the pid of
the target process if the context is for virtual address spaces
monitoring, respectively.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/obsolete_target
Date: Oct 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the
obsoleteness of the matching parameters commit destination
target.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/targets/<T>/regions/nr_regions
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
@ -303,6 +310,12 @@ Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the nid
parameter of the goal.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/goals/<G>/path
Date: Oct 2025
Contact: SeongJae Park <sj@kernel.org>
Description: Writing to and reading from this file sets and gets the path
parameter of the goal.
What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/schemes/<S>/quotas/weights/sz_permil
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>

View File

@ -1513,6 +1513,10 @@ The following nested keys are defined.
oom_group_kill
The number of times a group OOM has occurred.
sock_throttled
The number of times network sockets associated with
this cgroup are throttled.
memory.events.local
Similar to memory.events but the fields in the file are local
to the cgroup i.e. not hierarchical. The file modified event

View File

@ -211,6 +211,28 @@ End of target memory region in physical address.
The end physical address of memory region that DAMON_LRU_SORT will do work
against. By default, biggest System RAM is used as the region.
addr_unit
---------
A scale factor for memory addresses and bytes.
This parameter is for setting and getting the :ref:`address unit
<damon_design_addr_unit>` parameter of the DAMON instance for DAMON_LRU_SORT.
``monitor_region_start`` and ``monitor_region_end`` should be provided in this
unit. For example, let's suppose ``addr_unit``, ``monitor_region_start`` and
``monitor_region_end`` are set as ``1024``, ``0`` and ``10``, respectively.
Then DAMON_LRU_SORT will work for 10 KiB length of physical address range that
starts from address zero (``[0 * 1024, 10 * 1024)`` in bytes).
Stat parameters having ``bytes_`` prefix are also in this unit. For example,
let's suppose values of ``addr_unit``, ``bytes_lru_sort_tried_hot_regions`` and
``bytes_lru_sorted_hot_regions`` are ``1024``, ``42``, and ``32``,
respectively. Then it means DAMON_LRU_SORT tried to LRU-sort 42 KiB of hot
memory and successfully LRU-sorted 32 KiB of the memory in total.
If unsure, use only the default value (``1``) and forget about this.
kdamond_pid
-----------

View File

@ -232,6 +232,28 @@ The end physical address of memory region that DAMON_RECLAIM will do work
against. That is, DAMON_RECLAIM will find cold memory regions in this region
and reclaims. By default, biggest System RAM is used as the region.
addr_unit
---------
A scale factor for memory addresses and bytes.
This parameter is for setting and getting the :ref:`address unit
<damon_design_addr_unit>` parameter of the DAMON instance for DAMON_RECLAIM.
``monitor_region_start`` and ``monitor_region_end`` should be provided in this
unit. For example, let's suppose ``addr_unit``, ``monitor_region_start`` and
``monitor_region_end`` are set as ``1024``, ``0`` and ``10``, respectively.
Then DAMON_RECLAIM will work for 10 KiB length of physical address range that
starts from address zero (``[0 * 1024, 10 * 1024)`` in bytes).
``bytes_reclaim_tried_regions`` and ``bytes_reclaimed_regions`` are also in
this unit. For example, let's suppose values of ``addr_unit``,
``bytes_reclaim_tried_regions`` and ``bytes_reclaimed_regions`` are ``1024``,
``42``, and ``32``, respectively. Then it means DAMON_RECLAIM tried to reclaim
42 KiB memory and successfully reclaimed 32 KiB memory in total.
If unsure, use only the default value (``1``) and forget about this.
skip_anon
---------

View File

@ -10,6 +10,8 @@ on the system's entire physical memory using DAMON, and provides simplified
access monitoring results statistics, namely idle time percentiles and
estimated memory bandwidth.
.. _damon_stat_monitoring_accuracy_overhead:
Monitoring Accuracy and Overhead
================================
@ -17,9 +19,11 @@ DAMON_STAT uses monitoring intervals :ref:`auto-tuning
<damon_design_monitoring_intervals_autotuning>` to make its accuracy high and
overhead minimum. It auto-tunes the intervals aiming 4 % of observable access
events to be captured in each snapshot, while limiting the resulting sampling
events to be 5 milliseconds in minimum and 10 seconds in maximum. On a few
interval to be 5 milliseconds in minimum and 10 seconds in maximum. On a few
production server systems, it resulted in consuming only 0.x % single CPU time,
while capturing reasonable quality of access patterns.
while capturing reasonable quality of access patterns. The tuning-resulting
intervals can be retrieved via ``aggr_interval_us`` :ref:`parameter
<damon_stat_aggr_interval_us>`.
Interface: Module Parameters
============================
@ -41,6 +45,18 @@ You can enable DAMON_STAT by setting the value of this parameter as ``Y``.
Setting it as ``N`` disables DAMON_STAT. The default value is set by
``CONFIG_DAMON_STAT_ENABLED_DEFAULT`` build config option.
.. _damon_stat_aggr_interval_us:
aggr_interval_us
----------------
Auto-tuned aggregation time interval in microseconds.
Users can read the aggregation interval of DAMON that is being used by the
DAMON instance for DAMON_STAT. It is :ref:`auto-tuned
<damon_stat_monitoring_accuracy_overhead>` and therefore the value is
dynamically changed.
estimated_memory_bandwidth
--------------------------
@ -58,12 +74,13 @@ memory_idle_ms_percentiles
Per-byte idle time (milliseconds) percentiles of the system.
DAMON_STAT calculates how long each byte of the memory was not accessed until
now (idle time), based on the current DAMON results snapshot. If DAMON found a
region of access frequency (nr_accesses) larger than zero, every byte of the
region gets zero idle time. If a region has zero access frequency
(nr_accesses), how long the region was keeping the zero access frequency (age)
becomes the idle time of every byte of the region. Then, DAMON_STAT exposes
the percentiles of the idle time values via this read-only parameter. Reading
the parameter returns 101 idle time values in milliseconds, separated by comma.
now (idle time), based on the current DAMON results snapshot. For regions
having access frequency (nr_accesses) larger than zero, how long the current
access frequency level was kept multiplied by ``-1`` becomes the idle time of
every byte of the region. If a region has zero access frequency (nr_accesses),
how long the region was keeping the zero access frequency (age) becomes the
idle time of every byte of the region. Then, DAMON_STAT exposes the
percentiles of the idle time values via this read-only parameter. Reading the
parameter returns 101 idle time values in milliseconds, separated by comma.
Each value represents 0-th, 1st, 2nd, 3rd, ..., 99th and 100th percentile idle
times.

View File

@ -67,7 +67,7 @@ comma (",").
│ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us
│ │ │ │ │ │ nr_regions/min,max
│ │ │ │ │ :ref:`targets <sysfs_targets>`/nr_targets
│ │ │ │ │ │ :ref:`0 <sysfs_target>`/pid_target
│ │ │ │ │ │ :ref:`0 <sysfs_target>`/pid_target,obsolete_target
│ │ │ │ │ │ │ :ref:`regions <sysfs_regions>`/nr_regions
│ │ │ │ │ │ │ │ :ref:`0 <sysfs_region>`/start,end
│ │ │ │ │ │ │ │ ...
@ -81,7 +81,7 @@ comma (",").
│ │ │ │ │ │ │ :ref:`quotas <sysfs_quotas>`/ms,bytes,reset_interval_ms,effective_bytes
│ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil
│ │ │ │ │ │ │ │ :ref:`goals <sysfs_schemes_quota_goals>`/nr_goals
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid
│ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value,nid,path
│ │ │ │ │ │ │ :ref:`watermarks <sysfs_watermarks>`/metric,interval_us,high,mid,low
│ │ │ │ │ │ │ :ref:`{core_,ops_,}filters <sysfs_filters>`/nr_filters
│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx,min,max
@ -134,7 +134,8 @@ Users can write below commands for the kdamond to the ``state`` file.
- ``on``: Start running.
- ``off``: Stop running.
- ``commit``: Read the user inputs in the sysfs files except ``state`` file
again.
again. Monitoring :ref:`target region <sysfs_regions>` inputs are also
ignored if no target region is specified.
- ``update_tuned_intervals``: Update the contents of ``sample_us`` and
``aggr_us`` files of the kdamond with the auto-tuning applied ``sampling
interval`` and ``aggregation interval`` for the files. Please refer to
@ -264,13 +265,20 @@ to ``N-1``. Each directory represents each monitoring target.
targets/<N>/
------------
In each target directory, one file (``pid_target``) and one directory
(``regions``) exist.
In each target directory, two files (``pid_target`` and ``obsolete_target``)
and one directory (``regions``) exist.
If you wrote ``vaddr`` to the ``contexts/<N>/operations``, each target should
be a process. You can specify the process to DAMON by writing the pid of the
process to the ``pid_target`` file.
Users can selectively remove targets in the middle of the targets array by
writing non-zero value to ``obsolete_target`` file and committing it (writing
``commit`` to ``state`` file). DAMON will remove the matching targets from its
internal targets array. Users are responsible for constructing target
directories again, so that those correctly represent the changed internal
targets array.
.. _sysfs_regions:
targets/<N>/regions
@ -289,6 +297,11 @@ In the beginning, this directory has only one file, ``nr_regions``. Writing a
number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each initial monitoring target region.
If ``nr_regions`` is zero when committing new DAMON parameters online (writing
``commit`` to ``state`` file of :ref:`kdamond <sysfs_kdamond>`), the commit
logic ignores the target regions. In other words, the current monitoring
results for the target are preserved.
.. _sysfs_region:
regions/<N>/
@ -402,9 +415,9 @@ number (``N``) to the file creates the number of child directories named ``0``
to ``N-1``. Each directory represents each goal and current achievement.
Among the multiple feedback, the best one is used.
Each goal directory contains four files, namely ``target_metric``,
``target_value``, ``current_value`` and ``nid``. Users can set and get the
four parameters for the quota auto-tuning goals that specified on the
Each goal directory contains five files, namely ``target_metric``,
``target_value``, ``current_value``, ``nid`` and ``path``. Users can set and
get the five parameters for the quota auto-tuning goals that are specified on the
:ref:`design doc <damon_design_damos_quotas_auto_tuning>` by writing to and
reading from each of the files. Note that users should further write
``commit_schemes_quota_goals`` to the ``state`` file of the :ref:`kdamond

View File

@ -39,7 +39,6 @@ the Linux memory management.
shrinker_debugfs
slab
soft-dirty
swap_numa
transhuge
userfaultfd
zswap

View File

@ -115,7 +115,8 @@ Short descriptions to the page flags
A free memory block managed by the buddy system allocator.
The buddy system organizes free memory in blocks of various orders.
An order N block has 2^N physically contiguous pages, with the BUDDY flag
set for and _only_ for the first page.
set for all pages.
Before 4.6 only the first page of the block had the flag set.
15 - COMPOUND_HEAD
A compound page with order N consists of 2^N physically contiguous pages.
A compound page with order 2 takes the form of "HTTT", where H denotes its

View File

@ -1,78 +0,0 @@
===========================================
Automatically bind swap device to numa node
===========================================
If the system has more than one swap device and swap device has the node
information, we can make use of this information to decide which swap
device to use in get_swap_pages() to get better performance.
How to use this feature
=======================
Swap device has priority and that decides the order of it to be used. To make
use of automatically binding, there is no need to manipulate priority settings
for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and
swapB, with swapA attached to node 0 and swapB attached to node 1, are going
to be swapped on. Simply swapping them on by doing::
# swapon /dev/swapA
# swapon /dev/swapB
Then node 0 will use the two swap devices in the order of swapA then swapB and
node 1 will use the two swap devices in the order of swapB then swapA. Note
that the order of them being swapped on doesn't matter.
A more complex example on a 4 node machine. Assume 6 swap devices are going to
be swapped on: swapA and swapB are attached to node 0, swapC is attached to
node 1, swapD and swapE are attached to node 2 and swapF is attached to node3.
The way to swap them on is the same as above::
# swapon /dev/swapA
# swapon /dev/swapB
# swapon /dev/swapC
# swapon /dev/swapD
# swapon /dev/swapE
# swapon /dev/swapF
Then node 0 will use them in the order of::
swapA/swapB -> swapC -> swapD -> swapE -> swapF
swapA and swapB will be used in a round robin mode before any other swap device.
node 1 will use them in the order of::
swapC -> swapA -> swapB -> swapD -> swapE -> swapF
node 2 will use them in the order of::
swapD/swapE -> swapA -> swapB -> swapC -> swapF
Similaly, swapD and swapE will be used in a round robin mode before any
other swap devices.
node 3 will use them in the order of::
swapF -> swapA -> swapB -> swapC -> swapD -> swapE
Implementation details
======================
The current code uses a priority based list, swap_avail_list, to decide
which swap device to use and if multiple swap devices share the same
priority, they are used round robin. This change here replaces the single
global swap_avail_list with a per-numa-node list, i.e. for each numa node,
it sees its own priority based list of available swap devices. Swap
device's priority can be promoted on its matching node's swap_avail_list.
The current swap device's priority is set as: user can set a >=0 value,
or the system will pick one starting from -1 then downwards. The priority
value in the swap_avail_list is the negated value of the swap device's
due to plist being sorted from low to high. The new policy doesn't change
the semantics for priority >=0 cases, the previous starting from -1 then
downwards now becomes starting from -2 then downwards and -1 is reserved
as the promoted value. So if multiple swap devices are attached to the same
node, they will all be promoted to priority -1 on that node's plist and will
be used round robin before any other swap devices.

View File

@ -381,6 +381,11 @@ hugepage allocation policy for the tmpfs mount by using the kernel parameter
four valid policies for tmpfs (``always``, ``within_size``, ``advise``,
``never``). The tmpfs mount default policy is ``never``.
Additionally, Kconfig options are available to set the default hugepage
policies for shmem (``CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_*``) and tmpfs
(``CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_*``) at build time. Refer to the
Kconfig help for more details.
In the same manner as ``thp_anon`` controls each supported anonymous THP
size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem``
has the same format as ``thp_anon``, but also supports the policy

View File

@ -59,11 +59,11 @@ returned by the allocation routine and that handle must be mapped before being
accessed. The compressed memory pool grows on demand and shrinks as compressed
pages are freed. The pool is not preallocated.
When a swap page is passed from swapout to zswap, zswap maintains a mapping
of the swap entry, a combination of the swap type and swap offset, to the
zsmalloc handle that references that compressed swap page. This mapping is
achieved with a red-black tree per swap type. The swap offset is the search
key for the tree nodes.
When a swap page is passed from swapout to zswap, zswap maintains a mapping of
the swap entry, a combination of the swap type and swap offset, to the zsmalloc
handle that references that compressed swap page. This mapping is achieved
with an xarray per swap type. The swap offset is the search key for the xarray
nodes.
During a page fault on a PTE that is a swap entry, the swapin code calls the
zswap load function to decompress the page into the page allocated by the page

View File

@ -217,6 +217,12 @@ properties:
memory types as ratified in the 20191213 version of the privileged
ISA specification.
- const: svrsw60t59b
description:
The Svrsw60t59b extension for providing two more bits[60:59] to
PTE/PMD entry as ratified at commit 28bde925e7a7 ("PTE Reserved
for SW bits 60:59") of riscv-non-isa/riscv-iommu.
- const: svvptc
description:
The standard Svvptc supervisor-level extension for

View File

@ -1286,6 +1286,11 @@ The vm_area_desc provides the minimum required information for a filesystem
to initialise state upon memory mapping of a file-backed region, and output
parameters for the file system to set this state.
In nearly all cases, this is all that is required for a filesystem. However, if
a filesystem needs to perform an operation such as pre-population of page tables,
then that action can be specified in the vm_area_desc->action field, which can
be configured using the mmap_action_*() helpers.
---
**mandatory**

View File

@ -553,7 +553,7 @@ otherwise.
kernel flags associated with the particular virtual memory area in two letter
encoded manner. The codes are the following:
== =======================================
== =============================================================
rd readable
wr writeable
ex executable
@ -591,7 +591,8 @@ encoded manner. The codes are the following:
sl sealed
lf lock on fault pages
dp always lazily freeable mapping
== =======================================
gu maybe contains guard regions (if not set, definitely doesn't)
== =============================================================
Note that there is no guarantee that every flag and associated mnemonic will
be present in all further kernel releases. Things get changed, the flags may

View File

@ -1213,6 +1213,10 @@ otherwise noted.
file-backed memory mapping, most notably establishing relevant
private state and VMA callbacks.
If further action such as pre-population of page tables is required,
this can be specified by the vm_area_desc->action field and related
parameters.
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special

View File

@ -381,8 +381,8 @@ That is, assumes 4% (20% of 20%) DAMON-observed access events ratio (source)
to capture 64% (80% multiplied by 80%) real access events (outcomes).
To know how user-space can use this feature via :ref:`DAMON sysfs interface
<sysfs_interface>`, refer to :ref:`intervals_goal <sysfs_scheme>` part of
the documentation.
<sysfs_interface>`, refer to :ref:`intervals_goal
<damon_usage_sysfs_monitoring_intervals_goal>` part of the documentation.
.. _damon_design_damos:
@ -564,9 +564,9 @@ aggressiveness (the quota) of the corresponding scheme. For example, if DAMOS
is under achieving the goal, DAMOS automatically increases the quota. If DAMOS
is over achieving the goal, it decreases the quota.
The goal can be specified with four parameters, namely ``target_metric``,
``target_value``, ``current_value`` and ``nid``. The auto-tuning mechanism
tries to make ``current_value`` of ``target_metric`` be same to
The goal can be specified with five parameters, namely ``target_metric``,
``target_value``, ``current_value``, ``nid`` and ``path``. The auto-tuning
mechanism tries to make ``current_value`` of ``target_metric`` be same to
``target_value``.
- ``user_input``: User-provided value. Users could use any metric that they
@ -581,9 +581,18 @@ tries to make ``current_value`` of ``target_metric`` be same to
set by users at the initial time. In other words, DAMOS does self-feedback.
- ``node_mem_used_bp``: Specific NUMA node's used memory ratio in bp (1/10,000).
- ``node_mem_free_bp``: Specific NUMA node's free memory ratio in bp (1/10,000).
- ``node_memcg_used_bp``: Specific cgroup's node used memory ratio for a
specific NUMA node, in bp (1/10,000).
- ``node_memcg_free_bp``: Specific cgroup's node unused memory ratio for a
specific NUMA node, in bp (1/10,000).
``nid`` is optionally required for only ``node_mem_used_bp`` and
``node_mem_free_bp`` to point the specific NUMA node.
``nid`` is optionally required for only ``node_mem_used_bp``,
``node_mem_free_bp``, ``node_memcg_used_bp`` and ``node_memcg_free_bp`` to
point the specific NUMA node.
``path`` is optionally required for only ``node_memcg_used_bp`` and
``node_memcg_free_bp`` to point the path to the cgroup. The value should be
the path of the memory cgroup from the cgroups mount point.
To know how user-space can set the tuning goal metric, the target value, and/or
the current value via :ref:`DAMON sysfs interface <sysfs_interface>`, refer to

View File

@ -27,8 +27,8 @@ maintainer.
Note again the patches for `mm-new tree
<https://git.kernel.org/akpm/mm/h/mm-new>`_ are queued by the memory management
subsystem maintainer. If the patches requires some patches in `damon/next tree
<https://git.kernel.org/sj/h/damon/next>`_ which not yet merged in mm-new,
subsystem maintainer. If the patches require some patches in `damon/next tree
<https://git.kernel.org/sj/h/damon/next>`_ which have not yet merged in mm-new,
please make sure the requirement is clearly specified.
Submit checklist addendum
@ -57,7 +57,7 @@ Key cycle dates
Patches can be sent anytime. Key cycle dates of the `mm-new
<https://git.kernel.org/akpm/mm/h/mm-new>`_, `mm-unstable
<https://git.kernel.org/akpm/mm/h/mm-unstable>`_and `mm-stable
<https://git.kernel.org/akpm/mm/h/mm-unstable>`_ and `mm-stable
<https://git.kernel.org/akpm/mm/h/mm-stable>`_ trees depend on the memory
management subsystem maintainer.
@ -99,5 +99,5 @@ Schedules and reservation status are available at the Google `doc
<https://docs.google.com/document/d/1v43Kcj3ly4CYqmAkMaZzLiM2GEnWfgdGbZAH3mi2vpM/edit?usp=sharing>`_.
There is also a public Google `calendar
<https://calendar.google.com/calendar/u/0?cid=ZDIwOTA4YTMxNjc2MDQ3NTIyMmUzYTM5ZmQyM2U4NDA0ZGIwZjBiYmJlZGQxNDM0MmY4ZTRjOTE0NjdhZDRiY0Bncm91cC5jYWxlbmRhci5nb29nbGUuY29t>`_
that has the events. Anyone can subscribe it. DAMON maintainer will also
provide periodic reminder to the mailing list (damon@lists.linux.dev).
that has the events. Anyone can subscribe to it. DAMON maintainer will also
provide periodic reminders to the mailing list (damon@lists.linux.dev).

View File

@ -165,7 +165,7 @@ The users of `ZONE_DEVICE` are:
* pmem: Map platform persistent memory to be used as a direct-I/O target
via DAX mappings.
* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->folio_free()`
event callbacks to allow a device-driver to coordinate memory management
events related to device-memory, typically GPU memory. See
Documentation/mm/hmm.rst.

View File

@ -27,7 +27,10 @@ enabled. Other usages are more than welcome.
It can also be used to show all the stacks and their current number of
allocated base pages, which gives us a quick overview of where the memory
is going without the need to screen through all the pages and match the
allocation and free operation.
allocation and free operation. It is also possible to show only a numeric
identifier for each stack (without the stack trace) and its number of
allocated base pages, which is faster to read and parse (e.g., for monitoring)
and can be matched with the full stack traces later (show_handles and
show_stacks_handles).
page owner is disabled by default. So, if you'd like to use it, you need
to add "page_owner=on" to your boot cmdline. If the kernel is built
@ -116,6 +119,33 @@ Usage
nr_base_pages: 20824
...
cat /sys/kernel/debug/page_owner_stacks/show_handles > handles_7000.txt
cat handles_7000.txt
handle: 42
nr_base_pages: 20824
...
cat /sys/kernel/debug/page_owner_stacks/show_stacks_handles > stacks_handles.txt
cat stacks_handles.txt
post_alloc_hook+0x177/0x1a0
get_page_from_freelist+0xd01/0xd80
__alloc_pages+0x39e/0x7e0
alloc_pages_mpol+0x22e/0x490
folio_alloc+0xd5/0x110
filemap_alloc_folio+0x78/0x230
page_cache_ra_order+0x287/0x6f0
filemap_get_pages+0x517/0x1160
filemap_read+0x304/0x9f0
xfs_file_buffered_read+0xe6/0x1d0 [xfs]
xfs_file_read_iter+0x1f0/0x380 [xfs]
__kernel_read+0x3b9/0x730
kernel_read_file+0x309/0x4d0
__do_sys_finit_module+0x381/0x730
do_syscall_64+0x8d/0x150
entry_SYSCALL_64_after_hwframe+0x62/0x6a
handle: 42
...
cat /sys/kernel/debug/page_owner > page_owner_full.txt
./page_owner_sort page_owner_full.txt sorted_page_owner.txt

View File

@ -48,7 +48,8 @@ Terminology
* **VMA locks** - The VMA lock is at VMA granularity (of course) which behaves
as a read/write semaphore in practice. A VMA read lock is obtained via
:c:func:`!lock_vma_under_rcu` (and unlocked via :c:func:`!vma_end_read`) and a
write lock via :c:func:`!vma_start_write` (all VMA write locks are unlocked
write lock via :c:func:`!vma_start_write` or :c:func:`!vma_start_write_killable`
(all VMA write locks are unlocked
automatically when the mmap write lock is released). To take a VMA write lock
you **must** have already acquired an :c:func:`!mmap_write_lock`.
* **rmap locks** - When trying to access VMAs through the reverse mapping via a
@ -907,3 +908,9 @@ Stack expansion
Stack expansion throws up additional complexities in that we cannot permit there
to be racing page faults, as a result we invoke :c:func:`!vma_start_write` to
prevent this in :c:func:`!expand_downwards` or :c:func:`!expand_upwards`.
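A minimal sketch of the ordering rule described above, assuming a hypothetical
caller: the mmap write lock must be taken first, after which the VMA can be
marked write-locked via :c:func:`!vma_start_write` (or the killable variant);
all VMA write locks are then dropped together with the mmap write lock.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Hypothetical caller, illustrating the documented lock ordering only. */
static int foo_modify_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Valid only while the mmap write lock is held. */
        vma_start_write(vma);

        /* ... modify the VMA ... */

        /* All VMA write locks are released along with the mmap write lock. */
        mmap_write_unlock(mm);
        return 0;
}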
------------------------
Functions and structures
------------------------
.. kernel-doc:: include/linux/mmap_lock.h

View File

@ -48,7 +48,7 @@
If not set, tracer threads keep their default priority. For rtla user threads, it is set to SCHED_FIFO with priority 95. For kernel threads, see *osnoise* and *timerlat* tracer documentation for the running kernel version.
**-C**, **--cgroup**\[*=cgroup*]
**-C**, **--cgroup** \[*cgroup*]
Set a *cgroup* to the tracer's threads. If the **-C** option is passed without arguments, the tracer's thread will inherit **rtla**'s *cgroup*. Otherwise, the threads will be placed on the *cgroup* passed to the option.

View File

@ -366,6 +366,14 @@ of ftrace. Here is a list of some of the key files:
for each function. The displayed address is the patch-site address
and can differ from /proc/kallsyms address.
syscall_user_buf_size:
Some system call trace events will record data from a user-space
address that one of the parameters points to. The amount of data
per event is limited. This file holds the maximum number of bytes
that will be recorded into the ring buffer for this data.
The max value is currently 165.
dyn_ftrace_total_info:
This file is for debugging purposes. The number of functions that

View File

@ -11604,6 +11604,8 @@ F: mm/hugetlb.c
F: mm/hugetlb_cgroup.c
F: mm/hugetlb_cma.c
F: mm/hugetlb_cma.h
F: mm/hugetlb_sysctl.c
F: mm/hugetlb_sysfs.c
F: mm/hugetlb_vmemmap.c
F: mm/hugetlb_vmemmap.h
F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c
@ -11621,6 +11623,8 @@ M: Miaohe Lin <linmiaohe@huawei.com>
R: Naoya Horiguchi <nao.horiguchi@gmail.com>
L: linux-mm@kvack.org
S: Maintained
F: include/linux/memory-failure.h
F: include/trace/events/memory-failure.h
F: mm/hwpoison-inject.c
F: mm/memory-failure.c
@ -16346,6 +16350,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
F: include/linux/gfp.h
F: include/linux/gfp_types.h
F: include/linux/highmem.h
F: include/linux/leafops.h
F: include/linux/memory.h
F: include/linux/mm.h
F: include/linux/mm_*.h
@ -16353,6 +16358,7 @@ F: include/linux/mmzone.h
F: include/linux/mmdebug.h
F: include/linux/mmu_notifier.h
F: include/linux/pagewalk.h
F: include/linux/pgalloc.h
F: include/linux/pgtable.h
F: include/linux/ptdump.h
F: include/linux/vmpressure.h
@ -21802,8 +21808,12 @@ F: tools/testing/selftests/rtc/
Real-time Linux Analysis (RTLA) tools
M: Steven Rostedt <rostedt@goodmis.org>
M: Tomas Glozar <tglozar@redhat.com>
L: linux-trace-kernel@vger.kernel.org
L: linux-kernel@vger.kernel.org
S: Maintained
Q: https://patchwork.kernel.org/project/linux-trace-kernel/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
F: Documentation/tools/rtla/
F: tools/tracing/rtla/
@ -22696,6 +22706,7 @@ F: Documentation/trace/rv/
F: include/linux/rv.h
F: include/rv/
F: kernel/trace/rv/
F: tools/testing/selftests/verification/
F: tools/verification/
RUST

View File

@ -810,6 +810,25 @@ ifdef CONFIG_FUNCTION_TRACER
CC_FLAGS_FTRACE := -pg
endif
ifdef CONFIG_TRACEPOINTS
# To check for unused tracepoints (tracepoints that are defined but never
# called), run with:
#
# make UT=1
#
# Each unused tracepoint can take up to 5KB of memory in the running kernel.
# It is best to remove any that are not used.
#
# This command line option will be removed when all current unused
# tracepoints are removed.
ifeq ("$(origin UT)", "command line")
WARN_ON_UNUSED_TRACEPOINTS := $(UT)
endif
endif # CONFIG_TRACEPOINTS
export WARN_ON_UNUSED_TRACEPOINTS
include $(srctree)/arch/$(SRCARCH)/Makefile
ifdef need-config
@ -940,6 +959,9 @@ KBUILD_CFLAGS += $(call cc-option,-fzero-init-padding-bits=all)
# for the randomize_kstack_offset feature. Disable it for all compilers.
KBUILD_CFLAGS += $(call cc-option, -fno-stack-clash-protection)
# Get details on warnings generated due to GCC value tracking.
KBUILD_CFLAGS += $(call cc-option, -fdiagnostics-show-context=2)
# Clear used registers at func exit (to reduce data lifetime and ROP gadgets).
ifdef CONFIG_ZERO_CALL_USED_REGS
KBUILD_CFLAGS += -fzero-call-used-regs=used-gpr
@ -1784,6 +1806,8 @@ help:
@echo ' c: extra checks in the configuration stage (Kconfig)'
@echo ' e: warnings are being treated as errors'
@echo ' Multiple levels can be combined with W=12 or W=123'
@echo ' make UT=1 [targets] Warn if a tracepoint is defined but not used.'
@echo ' [ This will be removed when all current unused tracepoints are eliminated. ]'
@$(if $(dtstree), \
echo ' make CHECK_DTBS=1 [targets] Check all generated dtb files against schema'; \
echo ' This can be applied both to "dtbs" and to individual "foo.dtb" targets' ; \

View File

@ -49,8 +49,6 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
#define INVALID_PHYS_ADDR (-1ULL)
DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
u64 kimage_voffset __ro_after_init;

View File

@ -263,7 +263,4 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
#define update_mmu_cache(vma, addr, ptep) \
update_mmu_cache_range(NULL, vma, addr, ptep, 1)
#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
remap_pfn_range(vma, vaddr, pfn, size, prot)
#endif /* __ASM_CSKY_PGTABLE_H */

View File

@ -94,12 +94,13 @@ phys_addr_t fixup_bigphys_addr(phys_addr_t phys_addr, phys_addr_t size)
return phys_addr;
}
int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr,
unsigned long pfn, unsigned long size, pgprot_t prot)
static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
unsigned long size)
{
phys_addr_t phys_addr = fixup_bigphys_addr(pfn << PAGE_SHIFT, size);
return remap_pfn_range(vma, vaddr, phys_addr >> PAGE_SHIFT, size, prot);
return phys_addr >> PAGE_SHIFT;
}
EXPORT_SYMBOL(io_remap_pfn_range);
EXPORT_SYMBOL(io_remap_pfn_range_pfn);
#endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */

View File

@ -604,9 +604,8 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
*/
#ifdef CONFIG_MIPS_FIXUP_BIGPHYS_ADDR
phys_addr_t fixup_bigphys_addr(phys_addr_t addr, phys_addr_t size);
int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr,
unsigned long pfn, unsigned long size, pgprot_t prot);
#define io_remap_pfn_range io_remap_pfn_range
unsigned long io_remap_pfn_range_pfn(unsigned long pfn, unsigned long size);
#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
#else
#define fixup_bigphys_addr(addr, size) (addr)
#endif /* CONFIG_MIPS_FIXUP_BIGPHYS_ADDR */
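The MIPS hunks above (and the matching sparc hunks later in this comparison)
replace the per-architecture io_remap_pfn_range() override with a hook that
only translates the PFN. The generic caller is not part of this diff, so the
following is only an assumption about how the hook is presumably combined with
remap_pfn_range():

#include <linux/mm.h>

/* Assumed shape of the generic helper; illustrative sketch only. */
static int foo_io_remap_pfn_range(struct vm_area_struct *vma, unsigned long vaddr,
                                  unsigned long pfn, unsigned long size,
                                  pgprot_t prot)
{
#ifdef io_remap_pfn_range_pfn
        /* Let the architecture fix up the PFN first (e.g. MIPS big-phys
         * addresses, sparc I/O space encoding). */
        pfn = io_remap_pfn_range_pfn(pfn, size);
#endif
        return remap_pfn_range(vma, vaddr, pfn, size, prot);
}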

View File

@ -723,7 +723,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
zone_device_page_init(dpage);
zone_device_page_init(dpage, 0);
return dpage;
out_clear:
spin_lock(&kvmppc_uvmem_bitmap_lock);
@ -1014,8 +1014,9 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf)
* to a normal PFN during H_SVM_PAGE_OUT.
* Gets called with kvm->arch.uvmem_lock held.
*/
static void kvmppc_uvmem_page_free(struct page *page)
static void kvmppc_uvmem_folio_free(struct folio *folio)
{
struct page *page = &folio->page;
unsigned long pfn = page_to_pfn(page) -
(kvmppc_uvmem_pgmap.range.start >> PAGE_SHIFT);
struct kvmppc_uvmem_page_pvt *pvt;
@ -1034,7 +1035,7 @@ static void kvmppc_uvmem_page_free(struct page *page)
}
static const struct dev_pagemap_ops kvmppc_uvmem_ops = {
.page_free = kvmppc_uvmem_page_free,
.folio_free = kvmppc_uvmem_folio_free,
.migrate_to_ram = kvmppc_uvmem_migrate_to_ram,
};

View File

@ -29,7 +29,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn)
nid = of_node_to_nid(dn);
if (likely((nid) >= 0)) {
if (!node_online(nid)) {
if (register_one_node(nid)) {
if (register_node(nid)) {
pr_err("PCI: Failed to register node %d\n", nid);
} else {
update_numa_distance(dn);

View File

@ -142,11 +142,13 @@ config RISCV
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY if 64BIT && MMU && RISCV_ISA_SVRSW60T59B
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if 64BIT && MMU
select HAVE_ARCH_USERFAULTFD_MINOR if 64BIT && USERFAULTFD
select HAVE_ARCH_USERFAULTFD_WP if 64BIT && MMU && USERFAULTFD && RISCV_ISA_SVRSW60T59B
select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
select HAVE_ASM_MODVERSIONS
select HAVE_CONTEXT_TRACKING_USER
@ -849,6 +851,20 @@ config RISCV_ISA_ZICBOP
If you don't know what to do here, say Y.
config RISCV_ISA_SVRSW60T59B
bool "Svrsw60t59b extension support for using PTE bits 60 and 59"
depends on MMU && 64BIT
depends on RISCV_ALTERNATIVE
default y
help
Adds support to dynamically detect the presence of the Svrsw60t59b
extension and enable its usage.
The Svrsw60t59b extension frees the reserved PTE bits 60 and 59
for software to use.
If you don't know what to do here, say Y.
config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
def_bool y
# https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc

View File

@ -106,6 +106,7 @@
#define RISCV_ISA_EXT_ZAAMO 97
#define RISCV_ISA_EXT_ZALRSC 98
#define RISCV_ISA_EXT_ZICBOP 99
#define RISCV_ISA_EXT_SVRSW60T59B 100
#define RISCV_ISA_EXT_XLINUXENVCFG 127

View File

@ -19,6 +19,43 @@
#define _PAGE_SOFT (3 << 8) /* Reserved for software */
#define _PAGE_SPECIAL (1 << 8) /* RSW: 0x1 */
#ifdef CONFIG_MEM_SOFT_DIRTY
/* ext_svrsw60t59b: bit 59 for soft-dirty tracking */
#define _PAGE_SOFT_DIRTY \
((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \
(1UL << 59) : 0)
/*
* Bit 3 is always zero for swap entry computation, so we
* can borrow it for swap page soft-dirty tracking.
*/
#define _PAGE_SWP_SOFT_DIRTY \
((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \
_PAGE_EXEC : 0)
#else
#define _PAGE_SOFT_DIRTY 0
#define _PAGE_SWP_SOFT_DIRTY 0
#endif /* CONFIG_MEM_SOFT_DIRTY */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* ext_svrsw60t59b: bit 60 for uffd-wp tracking */
#define _PAGE_UFFD_WP \
((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \
(1UL << 60) : 0)
/*
* Bit 4 is not involved in swap entry computation, so we
* can borrow it for swap page uffd-wp tracking.
*/
#define _PAGE_SWP_UFFD_WP \
((riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)) ? \
_PAGE_USER : 0)
#else
#define _PAGE_UFFD_WP 0
#define _PAGE_SWP_UFFD_WP 0
#endif
#define _PAGE_TABLE _PAGE_PRESENT
/*

View File

@ -417,6 +417,41 @@ static inline pte_t pte_wrprotect(pte_t pte)
return __pte(pte_val(pte) & ~(_PAGE_WRITE));
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define pgtable_supports_uffd_wp() \
riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B)
static inline bool pte_uffd_wp(pte_t pte)
{
return !!(pte_val(pte) & _PAGE_UFFD_WP);
}
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
return pte_wrprotect(__pte(pte_val(pte) | _PAGE_UFFD_WP));
}
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
return __pte(pte_val(pte) & ~(_PAGE_UFFD_WP));
}
static inline bool pte_swp_uffd_wp(pte_t pte)
{
return !!(pte_val(pte) & _PAGE_SWP_UFFD_WP);
}
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_SWP_UFFD_WP);
}
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
return __pte(pte_val(pte) & ~(_PAGE_SWP_UFFD_WP));
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
/* static inline pte_t pte_mkread(pte_t pte) */
static inline pte_t pte_mkwrite_novma(pte_t pte)
@ -428,7 +463,7 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_DIRTY);
return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mkclean(pte_t pte)
@ -456,6 +491,42 @@ static inline pte_t pte_mkhuge(pte_t pte)
return pte;
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#define pgtable_supports_soft_dirty() \
(IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && \
riscv_has_extension_unlikely(RISCV_ISA_EXT_SVRSW60T59B))
static inline bool pte_soft_dirty(pte_t pte)
{
return !!(pte_val(pte) & _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY);
}
static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
return __pte(pte_val(pte) & ~(_PAGE_SOFT_DIRTY));
}
static inline bool pte_swp_soft_dirty(pte_t pte)
{
return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY);
}
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY);
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
return __pte(pte_val(pte) & ~(_PAGE_SWP_SOFT_DIRTY));
}
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
#ifdef CONFIG_RISCV_ISA_SVNAPOT
#define pte_leaf_size(pte) (pte_napot(pte) ? \
napot_cont_size(napot_cont_order(pte)) :\
@ -805,6 +876,72 @@ static inline pud_t pud_mkspecial(pud_t pud)
}
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline bool pmd_uffd_wp(pmd_t pmd)
{
return pte_uffd_wp(pmd_pte(pmd));
}
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
return pte_pmd(pte_mkuffd_wp(pmd_pte(pmd)));
}
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
{
return pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd)));
}
static inline bool pmd_swp_uffd_wp(pmd_t pmd)
{
return pte_swp_uffd_wp(pmd_pte(pmd));
}
static inline pmd_t pmd_swp_mkuffd_wp(pmd_t pmd)
{
return pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd)));
}
static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
{
return pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd)));
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline bool pmd_soft_dirty(pmd_t pmd)
{
return pte_soft_dirty(pmd_pte(pmd));
}
static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
return pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)));
}
static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
return pte_pmd(pte_clear_soft_dirty(pmd_pte(pmd)));
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline bool pmd_swp_soft_dirty(pmd_t pmd)
{
return pte_swp_soft_dirty(pmd_pte(pmd));
}
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
return pte_pmd(pte_swp_mksoft_dirty(pmd_pte(pmd)));
}
static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
return pte_pmd(pte_swp_clear_soft_dirty(pmd_pte(pmd)));
}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
@ -1003,7 +1140,9 @@ static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
*
* Format of swap PTE:
* bit 0: _PAGE_PRESENT (zero)
* bit 1 to 3: _PAGE_LEAF (zero)
* bit 1 to 2: (zero)
* bit 3: _PAGE_SWP_SOFT_DIRTY
* bit 4: _PAGE_SWP_UFFD_WP
* bit 5: _PAGE_PROT_NONE (zero)
* bit 6: exclusive marker
* bits 7 to 11: swap type

View File

@ -539,6 +539,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
__RISCV_ISA_EXT_DATA(svrsw60t59b, RISCV_ISA_EXT_SVRSW60T59B),
__RISCV_ISA_EXT_DATA(svvptc, RISCV_ISA_EXT_SVVPTC),
};

View File

@ -16,7 +16,6 @@
#include "decompressor.h"
#include "boot.h"
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
struct ctlreg __bootdata_preserved(s390_invalid_asce);
#ifdef CONFIG_PROC_FS

View File

@ -596,8 +596,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
| _SEGMENT_ENTRY_GMAP_UC
| _SEGMENT_ENTRY;
} else
*table = pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS;
*table = (pmd_val(*pmd) &
_SEGMENT_ENTRY_HARDWARE_BITS)
| _SEGMENT_ENTRY;
}
} else if (*table & _SEGMENT_ENTRY_PROTECT &&
!(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {

View File

@ -11,27 +11,27 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/leafops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>
#include <asm/pgtable.h>
/**
* ptep_zap_swap_entry() - discard a swap entry.
* ptep_zap_softleaf_entry() - discard a software leaf entry.
* @mm: the mm
* @entry: the swap entry that needs to be zapped
* @entry: the software leaf entry that needs to be zapped
*
* Discards the given swap entry. If the swap entry was an actual swap
* entry (and not a migration entry, for example), the actual swapped
* Discards the given software leaf entry. If the leaf entry was an actual
* swap entry (and not a migration entry, for example), the actual swapped
* page is also discarded from swap.
*/
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
if (!non_swap_entry(entry))
if (softleaf_is_swap(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry))
dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
else if (softleaf_is_migration(entry))
dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
free_swap_and_cache(entry);
}
@ -66,7 +66,7 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
preempt_disable();
pgste = pgste_get_lock(ptep);
ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
pte_clear(mm, vmaddr, ptep);
pgste_set_unlock(ptep, pgste);

View File

@ -16,7 +16,7 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/leafops.h>
#include <linux/sysctl.h>
#include <linux/ksm.h>
#include <linux/mman.h>
@ -673,12 +673,12 @@ void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
pgste_set_unlock(ptep, pgste);
}
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
if (!non_swap_entry(entry))
if (softleaf_is_swap(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
struct folio *folio = pfn_swap_entry_folio(entry);
else if (softleaf_is_migration(entry)) {
struct folio *folio = softleaf_to_folio(entry);
dec_mm_counter(mm, mm_counter(folio));
}
@ -700,7 +700,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
if (!reset && pte_swap(pte) &&
((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
(pgstev & _PGSTE_GPS_ZERO))) {
ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte));
pte_clear(mm, addr, ptep);
}
if (reset)

View File

@ -395,12 +395,8 @@ __get_iospace (unsigned long addr)
#define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4))
#define GET_PFN(pfn) (pfn & 0x0fffffffUL)
int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long,
unsigned long, pgprot_t);
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
unsigned long size)
{
unsigned long long offset, space, phys_base;
@ -408,9 +404,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
space = GET_IOSPACE(pfn);
phys_base = offset | (space << 32ULL);
return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
return phys_base >> PAGE_SHIFT;
}
#define io_remap_pfn_range io_remap_pfn_range
#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \

View File

@ -1048,9 +1048,6 @@ int page_in_phys_avail(unsigned long paddr);
#define GET_IOSPACE(pfn) (pfn >> (BITS_PER_LONG - 4))
#define GET_PFN(pfn) (pfn & 0x0fffffffffffffffUL)
int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long,
unsigned long, pgprot_t);
void adi_restore_tags(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte);
@ -1084,9 +1081,8 @@ static inline int arch_unmap_one(struct mm_struct *mm,
return 0;
}
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn,
unsigned long size)
{
unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
int space = GET_IOSPACE(pfn);
@ -1094,9 +1090,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
phys_base = offset | (((unsigned long) space) << 32UL);
return remap_pfn_range(vma, from, phys_base >> PAGE_SHIFT, size, prot);
return phys_base >> PAGE_SHIFT;
}
#define io_remap_pfn_range io_remap_pfn_range
#define io_remap_pfn_range_pfn io_remap_pfn_range_pfn
static inline unsigned long __untagged_addr(unsigned long start)
{

View File

@ -241,7 +241,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
if (flags & MAP_FIXED) {
/* Ok, don't mess with it. */
return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
return mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags);
}
flags &= ~MAP_SHARED;
@ -254,7 +254,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
align_goal = (64UL * 1024);
do {
addr = mm_get_unmapped_area(current->mm, NULL, orig_addr,
addr = mm_get_unmapped_area(NULL, orig_addr,
len + (align_goal - PAGE_SIZE), pgoff, flags);
if (!(addr & ~PAGE_MASK)) {
addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL);
@ -273,7 +273,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
* be obtained.
*/
if (addr & ~PAGE_MASK)
addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
addr = mm_get_unmapped_area(NULL, orig_addr, len, pgoff, flags);
return addr;
}

View File

@ -281,6 +281,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA
select MMU_GATHER_RCU_TABLE_FREE
select MMU_GATHER_MERGE_VMAS
select HAVE_POSIX_CPU_TIMERS_TASK_WORK

View File

@ -130,7 +130,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT

View File

@ -1028,7 +1028,7 @@ static void __meminit free_pagetable(struct page *page, int order)
free_reserved_pages(page, nr_pages);
#endif
} else {
__free_pages(page, order);
pagetable_free(page_ptdesc(page));
}
}

View File

@ -262,7 +262,7 @@ void __init init_gi_nodes(void)
* bringup_nonboot_cpus
* cpu_up
* __try_online_node
* register_one_node
* register_node
* because node_subsys is not initialized yet.
* TODO remove dependency on node_online
*/
@ -303,7 +303,7 @@ void __init init_cpu_to_node(void)
* bringup_nonboot_cpus
* cpu_up
* __try_online_node
* register_one_node
* register_node
* because node_subsys is not initialized yet.
* TODO remove dependency on node_online
*/

View File

@ -429,7 +429,7 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa)
list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
list_del(&ptdesc->pt_list);
__free_page(ptdesc_page(ptdesc));
pagetable_free(ptdesc);
}
}

View File

@ -729,7 +729,7 @@ int pmd_clear_huge(pmd_t *pmd)
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
pmd_t *pmd, *pmd_sv;
pte_t *pte;
struct ptdesc *pt;
int i;
pmd = pud_pgtable(*pud);
@ -750,8 +750,8 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_none(pmd_sv[i])) {
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
pte_free_kernel(&init_mm, pte);
pt = page_ptdesc(pmd_page(pmd_sv[i]));
pagetable_dtor_free(pt);
}
}
@ -772,15 +772,15 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
*/
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
struct ptdesc *pt;
pte = (pte_t *)pmd_page_vaddr(*pmd);
pt = page_ptdesc(pmd_page(*pmd));
pmd_clear(pmd);
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
pte_free_kernel(&init_mm, pte);
pagetable_dtor_free(pt);
return 1;
}

View File

@ -506,12 +506,6 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags)
return false;
pfn = PHYS_PFN(physical_addr);
if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",
physical_addr);
return false;
}
if (flags == MF_ACTION_REQUIRED && current->mm) {
twcb = (void *)gen_pool_alloc(ghes_estatus_pool, sizeof(*twcb));

View File

@ -198,15 +198,15 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
break;
default:
WARN_ON(1);
return sysfs_emit(buf, "ERROR-UNKNOWN-%ld\n", mem->state);
return sysfs_emit(buf, "ERROR-UNKNOWN-%d\n", mem->state);
}
return sysfs_emit(buf, "%s\n", output);
}
int memory_notify(unsigned long val, void *v)
int memory_notify(enum memory_block_state state, void *v)
{
return blocking_notifier_call_chain(&memory_chain, val, v);
return blocking_notifier_call_chain(&memory_chain, state, v);
}
#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)

View File

@ -676,50 +676,6 @@ static void node_device_release(struct device *dev)
kfree(to_node(dev));
}
/*
* register_node - Setup a sysfs device for a node.
* @num - Node number to use when creating the device.
*
* Initialize and register the node device.
*/
static int register_node(struct node *node, int num)
{
int error;
node->dev.id = num;
node->dev.bus = &node_subsys;
node->dev.release = node_device_release;
node->dev.groups = node_dev_groups;
error = device_register(&node->dev);
if (error) {
put_device(&node->dev);
} else {
hugetlb_register_node(node);
compaction_register_node(node);
reclaim_register_node(node);
}
return error;
}
/**
* unregister_node - unregister a node device
* @node: node going away
*
* Unregisters a node device @node. All the devices on the node must be
* unregistered before calling this function.
*/
void unregister_node(struct node *node)
{
hugetlb_unregister_node(node);
compaction_unregister_node(node);
reclaim_unregister_node(node);
node_remove_accesses(node);
node_remove_caches(node);
device_unregister(&node->dev);
}
struct node *node_devices[MAX_NUMNODES];
/*
@ -907,7 +863,13 @@ void register_memory_blocks_under_node_hotplug(int nid, unsigned long start_pfn,
}
#endif /* CONFIG_MEMORY_HOTPLUG */
int register_one_node(int nid)
/**
* register_node - Initialize and register the node device.
* @nid: Node number to use when creating the device.
*
* Return: 0 on success, -errno otherwise
*/
int register_node(int nid)
{
int error;
int cpu;
@ -918,14 +880,23 @@ int register_one_node(int nid)
return -ENOMEM;
INIT_LIST_HEAD(&node->access_list);
node_devices[nid] = node;
error = register_node(node_devices[nid], nid);
node->dev.id = nid;
node->dev.bus = &node_subsys;
node->dev.release = node_device_release;
node->dev.groups = node_dev_groups;
error = device_register(&node->dev);
if (error) {
node_devices[nid] = NULL;
put_device(&node->dev);
return error;
}
node_devices[nid] = node;
hugetlb_register_node(node);
compaction_register_node(node);
reclaim_register_node(node);
/* link cpu under this node */
for_each_present_cpu(cpu) {
if (cpu_to_node(cpu) == nid)
@ -936,13 +907,26 @@ int register_one_node(int nid)
return error;
}
void unregister_one_node(int nid)
/**
* unregister_node - unregister a node device
* @nid: nid of the node going away
*
* Unregisters the node device at node id @nid. All the devices on the
* node must be unregistered before calling this function.
*/
void unregister_node(int nid)
{
if (!node_devices[nid])
struct node *node = node_devices[nid];
if (!node)
return;
unregister_node(node_devices[nid]);
hugetlb_unregister_node(node);
compaction_unregister_node(node);
reclaim_unregister_node(node);
node_remove_accesses(node);
node_remove_caches(node);
device_unregister(&node->dev);
node_devices[nid] = NULL;
}
@ -1018,7 +1002,7 @@ void __init node_dev_init(void)
* to already created cpu devices.
*/
for_each_online_node(i) {
ret = register_one_node(i);
ret = register_node(i);
if (ret)
panic("%s() failed to add node: %d\n", __func__, ret);
}

View File

@ -500,8 +500,31 @@ out:
}
#ifdef CONFIG_ZRAM_WRITEBACK
#define INVALID_BDEV_BLOCK (~0UL)
struct zram_wb_ctl {
/* idle list is accessed only by the writeback task, no concurrency */
struct list_head idle_reqs;
/* done list is accessed concurrently, protected by done_lock */
struct list_head done_reqs;
wait_queue_head_t done_wait;
spinlock_t done_lock;
atomic_t num_inflight;
};
struct zram_wb_req {
unsigned long blk_idx;
struct page *page;
struct zram_pp_slot *pps;
struct bio_vec bio_vec;
struct bio bio;
struct list_head entry;
};
static ssize_t writeback_limit_enable_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
struct device_attribute *attr,
const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@ -510,33 +533,31 @@ static ssize_t writeback_limit_enable_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
down_read(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
down_write(&zram->init_lock);
zram->wb_limit_enable = val;
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
up_write(&zram->init_lock);
ret = len;
return ret;
}
static ssize_t writeback_limit_enable_show(struct device *dev,
struct device_attribute *attr, char *buf)
struct device_attribute *attr,
char *buf)
{
bool val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
val = zram->wb_limit_enable;
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%d\n", val);
}
static ssize_t writeback_limit_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
struct device_attribute *attr,
const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u64 val;
@ -545,31 +566,71 @@ static ssize_t writeback_limit_store(struct device *dev,
if (kstrtoull(buf, 10, &val))
return ret;
down_read(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
/*
* When the page size is greater than 4KB, a bd_wb_limit value that is
* not page-size aligned will wrap. For example, with a 16KB page size
* and bd_wb_limit set to 3, a single writeback operation would
* decrement bd_wb_limit to -1, and since bd_wb_limit is unsigned it
* would wrap to a huge value.
*/
val = rounddown(val, PAGE_SIZE / 4096);
down_write(&zram->init_lock);
zram->bd_wb_limit = val;
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
up_write(&zram->init_lock);
ret = len;
return ret;
}
static ssize_t writeback_limit_show(struct device *dev,
struct device_attribute *attr, char *buf)
struct device_attribute *attr, char *buf)
{
u64 val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
spin_lock(&zram->wb_limit_lock);
val = zram->bd_wb_limit;
spin_unlock(&zram->wb_limit_lock);
up_read(&zram->init_lock);
return sysfs_emit(buf, "%llu\n", val);
}
static ssize_t writeback_batch_size_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
u32 val;
if (kstrtouint(buf, 10, &val))
return -EINVAL;
if (!val)
return -EINVAL;
down_write(&zram->init_lock);
zram->wb_batch_size = val;
up_write(&zram->init_lock);
return len;
}
static ssize_t writeback_batch_size_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
u32 val;
struct zram *zram = dev_to_zram(dev);
down_read(&zram->init_lock);
val = zram->wb_batch_size;
up_read(&zram->init_lock);
return sysfs_emit(buf, "%u\n", val);
}
static void reset_bdev(struct zram *zram)
{
if (!zram->backing_dev)
@ -697,23 +758,20 @@ out:
return err;
}
static unsigned long alloc_block_bdev(struct zram *zram)
static unsigned long zram_reserve_bdev_block(struct zram *zram)
{
unsigned long blk_idx = 1;
retry:
/* skip 0 bit to confuse zram.handle = 0 */
blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
unsigned long blk_idx;
blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0);
if (blk_idx == zram->nr_pages)
return 0;
if (test_and_set_bit(blk_idx, zram->bitmap))
goto retry;
return INVALID_BDEV_BLOCK;
set_bit(blk_idx, zram->bitmap);
atomic64_inc(&zram->stats.bd_count);
return blk_idx;
}
static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
int was_set;
@ -734,63 +792,234 @@ static void read_from_bdev_async(struct zram *zram, struct page *page,
submit_bio(bio);
}
static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
static void release_wb_req(struct zram_wb_req *req)
{
unsigned long blk_idx = 0;
struct page *page = NULL;
struct zram_pp_slot *pps;
struct bio_vec bio_vec;
struct bio bio;
int ret = 0, err;
u32 index;
__free_page(req->page);
kfree(req);
}
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
static void release_wb_ctl(struct zram_wb_ctl *wb_ctl)
{
if (!wb_ctl)
return;
/* We should never have inflight requests at this point */
WARN_ON(atomic_read(&wb_ctl->num_inflight));
WARN_ON(!list_empty(&wb_ctl->done_reqs));
while (!list_empty(&wb_ctl->idle_reqs)) {
struct zram_wb_req *req;
req = list_first_entry(&wb_ctl->idle_reqs,
struct zram_wb_req, entry);
list_del(&req->entry);
release_wb_req(req);
}
kfree(wb_ctl);
}
static struct zram_wb_ctl *init_wb_ctl(struct zram *zram)
{
struct zram_wb_ctl *wb_ctl;
int i;
wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL);
if (!wb_ctl)
return NULL;
INIT_LIST_HEAD(&wb_ctl->idle_reqs);
INIT_LIST_HEAD(&wb_ctl->done_reqs);
atomic_set(&wb_ctl->num_inflight, 0);
init_waitqueue_head(&wb_ctl->done_wait);
spin_lock_init(&wb_ctl->done_lock);
for (i = 0; i < zram->wb_batch_size; i++) {
struct zram_wb_req *req;
/*
* This is a fatal condition only if we couldn't allocate
* any requests at all. Otherwise we just work with the
* requests that we have successfully allocated, so that
* writeback can still proceed, even if there is only one
* request on the idle list.
*/
req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN);
if (!req)
break;
req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN);
if (!req->page) {
kfree(req);
break;
}
list_add(&req->entry, &wb_ctl->idle_reqs);
}
/* We couldn't allocate any requests, so writeback is not possible */
if (list_empty(&wb_ctl->idle_reqs))
goto release_wb_ctl;
return wb_ctl;
release_wb_ctl:
release_wb_ctl(wb_ctl);
return NULL;
}
static void zram_account_writeback_rollback(struct zram *zram)
{
lockdep_assert_held_read(&zram->init_lock);
if (zram->wb_limit_enable)
zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12);
}
static void zram_account_writeback_submit(struct zram *zram)
{
lockdep_assert_held_read(&zram->init_lock);
if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
}
static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req)
{
u32 index = req->pps->index;
int err;
err = blk_status_to_errno(req->bio.bi_status);
if (err) {
/*
* Failed wb requests should not be accounted in wb_limit
* (if enabled).
*/
zram_account_writeback_rollback(zram);
zram_release_bdev_block(zram, req->blk_idx);
return err;
}
atomic64_inc(&zram->stats.bd_writes);
zram_slot_lock(zram, index);
/*
* We release slot lock during writeback so slot can change under us:
* slot_free(), or slot_free() followed by zram_write_page(). In both cases
* slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can
* set ZRAM_PP_SLOT on such slots until current post-processing
* finishes.
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
zram_release_bdev_block(zram, req->blk_idx);
goto out;
}
zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_WB);
zram_set_handle(zram, index, req->blk_idx);
atomic64_inc(&zram->stats.pages_stored);
out:
zram_slot_unlock(zram, index);
return 0;
}
static void zram_writeback_endio(struct bio *bio)
{
struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio);
struct zram_wb_ctl *wb_ctl = bio->bi_private;
unsigned long flags;
spin_lock_irqsave(&wb_ctl->done_lock, flags);
list_add(&req->entry, &wb_ctl->done_reqs);
spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
wake_up(&wb_ctl->done_wait);
}
static void zram_submit_wb_request(struct zram *zram,
struct zram_wb_ctl *wb_ctl,
struct zram_wb_req *req)
{
/*
* wb_limit (if enabled) should be adjusted before submission,
* so that we don't over-submit.
*/
zram_account_writeback_submit(zram);
atomic_inc(&wb_ctl->num_inflight);
req->bio.bi_private = wb_ctl;
submit_bio(&req->bio);
}
static int zram_complete_done_reqs(struct zram *zram,
struct zram_wb_ctl *wb_ctl)
{
struct zram_wb_req *req;
unsigned long flags;
int ret = 0, err;
while (atomic_read(&wb_ctl->num_inflight) > 0) {
spin_lock_irqsave(&wb_ctl->done_lock, flags);
req = list_first_entry_or_null(&wb_ctl->done_reqs,
struct zram_wb_req, entry);
if (req)
list_del(&req->entry);
spin_unlock_irqrestore(&wb_ctl->done_lock, flags);
/* ->num_inflight > 0 doesn't mean we have done requests */
if (!req)
break;
err = zram_writeback_complete(zram, req);
if (err)
ret = err;
atomic_dec(&wb_ctl->num_inflight);
release_pp_slot(zram, req->pps);
req->pps = NULL;
list_add(&req->entry, &wb_ctl->idle_reqs);
}
return ret;
}
static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl)
{
struct zram_wb_req *req;
req = list_first_entry_or_null(&wb_ctl->idle_reqs,
struct zram_wb_req, entry);
if (req)
list_del(&req->entry);
return req;
}
static int zram_writeback_slots(struct zram *zram,
struct zram_pp_ctl *ctl,
struct zram_wb_ctl *wb_ctl)
{
unsigned long blk_idx = INVALID_BDEV_BLOCK;
struct zram_wb_req *req = NULL;
struct zram_pp_slot *pps;
int ret = 0, err = 0;
u32 index = 0;
while ((pps = select_pp_slot(ctl))) {
spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && !zram->bd_wb_limit) {
spin_unlock(&zram->wb_limit_lock);
ret = -EIO;
break;
}
spin_unlock(&zram->wb_limit_lock);
if (!blk_idx) {
blk_idx = alloc_block_bdev(zram);
if (!blk_idx) {
ret = -ENOSPC;
while (!req) {
req = zram_select_idle_req(wb_ctl);
if (req)
break;
}
}
index = pps->index;
zram_slot_lock(zram, index);
/*
* scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so
* slots can change in the meantime. If slots are accessed or
* freed they lose ZRAM_PP_SLOT flag and hence we don't
* post-process them.
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
if (zram_read_from_zspool(zram, page, index))
goto next;
zram_slot_unlock(zram, index);
wait_event(wb_ctl->done_wait,
!list_empty(&wb_ctl->done_reqs));
bio_init(&bio, zram->bdev, &bio_vec, 1,
REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
__bio_add_page(&bio, page, PAGE_SIZE, 0);
/*
* XXX: A single page IO would be inefficient for write
* but it would be not bad as starter.
*/
err = submit_bio_wait(&bio);
if (err) {
release_pp_slot(zram, pps);
err = zram_complete_done_reqs(zram, wb_ctl);
/*
* BIO errors are not fatal, we continue and simply
* attempt to writeback the remaining objects (pages).
@ -799,43 +1028,69 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl)
* them) were not successful and we do so by returning
* the most recent BIO error.
*/
ret = err;
continue;
if (err)
ret = err;
}
atomic64_inc(&zram->stats.bd_writes);
if (blk_idx == INVALID_BDEV_BLOCK) {
blk_idx = zram_reserve_bdev_block(zram);
if (blk_idx == INVALID_BDEV_BLOCK) {
ret = -ENOSPC;
break;
}
}
index = pps->index;
zram_slot_lock(zram, index);
/*
* Same as above, we release slot lock during writeback so
* slot can change under us: slot_free() or slot_free() and
* reallocation (zram_write_page()). In both cases slot loses
* ZRAM_PP_SLOT flag. No concurrent post-processing can set
* ZRAM_PP_SLOT on such slots until current post-processing
* finishes.
* scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so
* slots can change in the meantime. If slots are accessed or
* freed they lose ZRAM_PP_SLOT flag and hence we don't
* post-process them.
*/
if (!zram_test_flag(zram, index, ZRAM_PP_SLOT))
goto next;
if (zram_read_from_zspool(zram, req->page, index))
goto next;
zram_slot_unlock(zram, index);
/*
* From now on pp-slot is owned by the req, remove it from
* its pp bucket.
*/
list_del_init(&pps->entry);
req->blk_idx = blk_idx;
req->pps = pps;
bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE);
req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9);
req->bio.bi_end_io = zram_writeback_endio;
__bio_add_page(&req->bio, req->page, PAGE_SIZE, 0);
zram_submit_wb_request(zram, wb_ctl, req);
blk_idx = INVALID_BDEV_BLOCK;
req = NULL;
cond_resched();
continue;
zram_free_page(zram, index);
zram_set_flag(zram, index, ZRAM_WB);
zram_set_handle(zram, index, blk_idx);
blk_idx = 0;
atomic64_inc(&zram->stats.pages_stored);
spin_lock(&zram->wb_limit_lock);
if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
spin_unlock(&zram->wb_limit_lock);
next:
zram_slot_unlock(zram, index);
release_pp_slot(zram, pps);
cond_resched();
}
if (blk_idx)
free_block_bdev(zram, blk_idx);
if (page)
__free_page(page);
/*
* We selected an idle req but never submitted it, due to an error
* or the wb limit.
*/
if (req)
release_wb_req(req);
while (atomic_read(&wb_ctl->num_inflight) > 0) {
wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs));
err = zram_complete_done_reqs(zram, wb_ctl);
if (err)
ret = err;
}
return ret;
}
@ -948,7 +1203,8 @@ static ssize_t writeback_store(struct device *dev,
struct zram *zram = dev_to_zram(dev);
u64 nr_pages = zram->disksize >> PAGE_SHIFT;
unsigned long lo = 0, hi = nr_pages;
struct zram_pp_ctl *ctl = NULL;
struct zram_pp_ctl *pp_ctl = NULL;
struct zram_wb_ctl *wb_ctl = NULL;
char *args, *param, *val;
ssize_t ret = len;
int err, mode = 0;
@ -970,8 +1226,14 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
ctl = init_pp_ctl();
if (!ctl) {
pp_ctl = init_pp_ctl();
if (!pp_ctl) {
ret = -ENOMEM;
goto release_init_lock;
}
wb_ctl = init_wb_ctl(zram);
if (!wb_ctl) {
ret = -ENOMEM;
goto release_init_lock;
}
@ -1000,7 +1262,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
scan_slots_for_writeback(zram, mode, lo, hi, ctl);
scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@ -1011,7 +1273,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
scan_slots_for_writeback(zram, mode, lo, hi, ctl);
scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
break;
}
@ -1022,7 +1284,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
scan_slots_for_writeback(zram, mode, lo, hi, ctl);
scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
@ -1033,17 +1295,18 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
scan_slots_for_writeback(zram, mode, lo, hi, ctl);
scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl);
continue;
}
}
err = zram_writeback_slots(zram, ctl);
err = zram_writeback_slots(zram, pp_ctl, wb_ctl);
if (err)
ret = err;
release_init_lock:
release_pp_ctl(zram, ctl);
release_pp_ctl(zram, pp_ctl);
release_wb_ctl(wb_ctl);
atomic_set(&zram->pp_in_progress, 0);
up_read(&zram->init_lock);
@ -1112,7 +1375,9 @@ static int read_from_bdev(struct zram *zram, struct page *page,
return -EIO;
}
static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx)
{
}
#endif
#ifdef CONFIG_ZRAM_MEMORY_TRACKING
@ -1634,7 +1899,7 @@ static void zram_free_page(struct zram *zram, size_t index)
if (zram_test_flag(zram, index, ZRAM_WB)) {
zram_clear_flag(zram, index, ZRAM_WB);
free_block_bdev(zram, zram_get_handle(zram, index));
zram_release_bdev_block(zram, zram_get_handle(zram, index));
goto out;
}
@ -1740,14 +2005,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
ret = zram_read_from_zspool(zram, page, index);
zram_slot_unlock(zram, index);
} else {
unsigned long blk_idx = zram_get_handle(zram, index);
/*
* The slot should be unlocked before reading from the backing
* device.
*/
zram_slot_unlock(zram, index);
ret = read_from_bdev(zram, page, zram_get_handle(zram, index),
parent);
ret = read_from_bdev(zram, page, blk_idx, parent);
}
/* Should NEVER happen. Return bio error if it does. */
@ -2610,6 +2875,7 @@ static DEVICE_ATTR_RW(backing_dev);
static DEVICE_ATTR_WO(writeback);
static DEVICE_ATTR_RW(writeback_limit);
static DEVICE_ATTR_RW(writeback_limit_enable);
static DEVICE_ATTR_RW(writeback_batch_size);
#endif
#ifdef CONFIG_ZRAM_MULTI_COMP
static DEVICE_ATTR_RW(recomp_algorithm);
@ -2631,6 +2897,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_writeback.attr,
&dev_attr_writeback_limit.attr,
&dev_attr_writeback_limit_enable.attr,
&dev_attr_writeback_batch_size.attr,
#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
@ -2692,7 +2959,7 @@ static int zram_add(void)
init_rwsem(&zram->init_lock);
#ifdef CONFIG_ZRAM_WRITEBACK
spin_lock_init(&zram->wb_limit_lock);
zram->wb_batch_size = 32;
#endif
/* gendisk structure */

View File

@ -127,8 +127,8 @@ struct zram {
bool claim; /* Protected by disk->open_mutex */
#ifdef CONFIG_ZRAM_WRITEBACK
struct file *backing_dev;
spinlock_t wb_limit_lock;
bool wb_limit_enable;
u32 wb_batch_size;
u64 bd_wb_limit;
struct block_device *bdev;
unsigned long *bitmap;

View File

@ -304,13 +304,13 @@ static unsigned zero_mmap_capabilities(struct file *file)
}
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma)
static inline int private_mapping_ok(struct vm_area_desc *desc)
{
return is_nommu_shared_mapping(vma->vm_flags);
return is_nommu_shared_mapping(desc->vm_flags);
}
#else
static inline int private_mapping_ok(struct vm_area_struct *vma)
static inline int private_mapping_ok(struct vm_area_desc *desc)
{
return 1;
}
@ -322,46 +322,49 @@ static const struct vm_operations_struct mmap_mem_ops = {
#endif
};
static int mmap_mem(struct file *file, struct vm_area_struct *vma)
static int mmap_filter_error(int err)
{
size_t size = vma->vm_end - vma->vm_start;
phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
return -EAGAIN;
}
static int mmap_mem_prepare(struct vm_area_desc *desc)
{
struct file *file = desc->file;
const size_t size = vma_desc_size(desc);
const phys_addr_t offset = (phys_addr_t)desc->pgoff << PAGE_SHIFT;
/* Does it even fit in phys_addr_t? */
if (offset >> PAGE_SHIFT != vma->vm_pgoff)
if (offset >> PAGE_SHIFT != desc->pgoff)
return -EINVAL;
/* It's illegal to wrap around the end of the physical address space. */
if (offset + (phys_addr_t)size - 1 < offset)
return -EINVAL;
if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))
if (!valid_mmap_phys_addr_range(desc->pgoff, size))
return -EINVAL;
if (!private_mapping_ok(vma))
if (!private_mapping_ok(desc))
return -ENOSYS;
if (!range_is_allowed(vma->vm_pgoff, size))
if (!range_is_allowed(desc->pgoff, size))
return -EPERM;
if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
&vma->vm_page_prot))
if (!phys_mem_access_prot_allowed(file, desc->pgoff, size,
&desc->page_prot))
return -EINVAL;
vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
size,
vma->vm_page_prot);
desc->page_prot = phys_mem_access_prot(file, desc->pgoff,
size,
desc->page_prot);
vma->vm_ops = &mmap_mem_ops;
desc->vm_ops = &mmap_mem_ops;
/* Remap-pfn-range will mark the range VM_IO. */
mmap_action_remap_full(desc, desc->pgoff);
/* We filter remap errors to -EAGAIN. */
desc->action.error_hook = mmap_filter_error;
/* Remap-pfn-range will mark the range VM_IO */
if (remap_pfn_range(vma,
vma->vm_start,
vma->vm_pgoff,
size,
vma->vm_page_prot)) {
return -EAGAIN;
}
return 0;
}
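For drivers that simply validate and remap, the ->mmap_prepare() pattern used
above reduces to the following sketch. The foo_* names are hypothetical;
vma_desc_size(), mmap_action_remap_full() and the .mmap_prepare file operation
are the helpers actually used in the conversion above.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>

/* Hypothetical driver: validate in ->mmap_prepare(), let the core remap. */
static int foo_mmap_prepare(struct vm_area_desc *desc)
{
        const size_t size = vma_desc_size(desc);

        if (!foo_range_is_allowed(desc->pgoff, size))   /* hypothetical check */
                return -EPERM;

        desc->vm_ops = &foo_vm_ops;                     /* hypothetical vm_ops */
        /* Defer the remap of the full range to the mmap core. */
        mmap_action_remap_full(desc, desc->pgoff);
        return 0;
}

static const struct file_operations foo_fops = {
        .owner        = THIS_MODULE,
        .mmap_prepare = foo_mmap_prepare,
};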
@ -501,14 +504,26 @@ static ssize_t read_zero(struct file *file, char __user *buf,
return cleared;
}
static int mmap_zero(struct file *file, struct vm_area_struct *vma)
static int mmap_zero_private_success(const struct vm_area_struct *vma)
{
/*
* This is a highly unusual situation where we mark a MAP_PRIVATE mapping
* of /dev/zero anonymous, despite it not actually being anonymous.
*/
vma_set_anonymous((struct vm_area_struct *)vma);
return 0;
}
static int mmap_zero_prepare(struct vm_area_desc *desc)
{
#ifndef CONFIG_MMU
return -ENOSYS;
#endif
if (vma->vm_flags & VM_SHARED)
return shmem_zero_setup(vma);
vma_set_anonymous(vma);
if (desc->vm_flags & VM_SHARED)
return shmem_zero_setup_desc(desc);
desc->action.success_hook = mmap_zero_private_success;
return 0;
}
@ -526,10 +541,11 @@ static unsigned long get_unmapped_area_zero(struct file *file,
{
if (flags & MAP_SHARED) {
/*
* mmap_zero() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge;
* and pass NULL for file as in mmap.c's get_unmapped_area(),
* so as not to confuse shmem with our handle on "/dev/zero".
* mmap_zero_prepare() will call shmem_zero_setup() to create a
* file, so use shmem's get_unmapped_area in case it can be
* huge; and pass NULL for file as in mmap.c's
* get_unmapped_area(), so as not to confuse shmem with our
* handle on "/dev/zero".
*/
return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags);
}
@ -542,7 +558,7 @@ static unsigned long get_unmapped_area_zero(struct file *file,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
return thp_get_unmapped_area(file, addr, len, pgoff, flags);
#else
return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
return mm_get_unmapped_area(file, addr, len, pgoff, flags);
#endif
}
#endif /* CONFIG_MMU */
@ -632,7 +648,7 @@ static const struct file_operations __maybe_unused mem_fops = {
.llseek = memory_lseek,
.read = read_mem,
.write = write_mem,
.mmap = mmap_mem,
.mmap_prepare = mmap_mem_prepare,
.open = open_mem,
#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
@ -668,7 +684,7 @@ static const struct file_operations zero_fops = {
.write_iter = write_iter_zero,
.splice_read = copy_splice_read,
.splice_write = splice_write_zero,
.mmap = mmap_zero,
.mmap_prepare = mmap_zero_prepare,
.get_unmapped_area = get_unmapped_area_zero,
#ifndef CONFIG_MMU
.mmap_capabilities = zero_mmap_capabilities,

View File

@ -199,7 +199,7 @@ static int ni_670x_auto_attach(struct comedi_device *dev,
const struct comedi_lrange **range_table_list;
range_table_list = kmalloc_array(32,
sizeof(struct comedi_lrange *),
sizeof(*range_table_list),
GFP_KERNEL);
if (!range_table_list)
return -ENOMEM;

View File

@ -13,8 +13,9 @@
#include "dax-private.h"
#include "bus.h"
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
static int __check_vma(struct dev_dax *dev_dax, vm_flags_t vm_flags,
unsigned long start, unsigned long end, struct file *file,
const char *func)
{
struct device *dev = &dev_dax->dev;
unsigned long mask;
@ -23,7 +24,7 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
return -ENXIO;
/* prevent private mappings from being established */
if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
if ((vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
dev_info_ratelimited(dev,
"%s: %s: fail, attempted private mapping\n",
current->comm, func);
@ -31,15 +32,15 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
}
mask = dev_dax->align - 1;
if (vma->vm_start & mask || vma->vm_end & mask) {
if (start & mask || end & mask) {
dev_info_ratelimited(dev,
"%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
current->comm, func, vma->vm_start, vma->vm_end,
current->comm, func, start, end,
mask);
return -EINVAL;
}
if (!vma_is_dax(vma)) {
if (!file_is_dax(file)) {
dev_info_ratelimited(dev,
"%s: %s: fail, vma is not DAX capable\n",
current->comm, func);
@ -49,6 +50,13 @@ static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
return 0;
}
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
{
return __check_vma(dev_dax, vma->vm_flags, vma->vm_start, vma->vm_end,
vma->vm_file, func);
}
/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
unsigned long size)
@ -285,8 +293,9 @@ static const struct vm_operations_struct dax_vm_ops = {
.pagesize = dev_dax_pagesize,
};
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
static int dax_mmap_prepare(struct vm_area_desc *desc)
{
struct file *filp = desc->file;
struct dev_dax *dev_dax = filp->private_data;
int rc, id;
@ -297,13 +306,14 @@ static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
* fault time.
*/
id = dax_read_lock();
rc = check_vma(dev_dax, vma, __func__);
rc = __check_vma(dev_dax, desc->vm_flags, desc->start, desc->end, filp,
__func__);
dax_read_unlock(id);
if (rc)
return rc;
vma->vm_ops = &dax_vm_ops;
vm_flags_set(vma, VM_HUGEPAGE);
desc->vm_ops = &dax_vm_ops;
desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
@ -330,14 +340,13 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
if ((off + len_align) < off)
goto out;
addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align,
pgoff, flags);
addr_align = mm_get_unmapped_area(filp, addr, len_align, pgoff, flags);
if (!IS_ERR_VALUE(addr_align)) {
addr_align += (off - addr_align) & (align - 1);
return addr_align;
}
out:
return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
return mm_get_unmapped_area(filp, addr, len, pgoff, flags);
}
static const struct address_space_operations dev_dax_aops = {
@ -377,7 +386,7 @@ static const struct file_operations dax_fops = {
.open = dax_open,
.release = dax_release,
.get_unmapped_area = dax_get_unmapped_area,
.mmap = dax_mmap,
.mmap_prepare = dax_mmap_prepare,
.fop_flags = FOP_MMAP_SYNC,
};

View File

@ -12,18 +12,18 @@
#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/mm_types.h>
#include <linux/pgalloc.h>
#include <linux/pgtable.h>
#include <linux/preempt.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/efi.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#if defined(CONFIG_PTDUMP_DEBUGFS) || defined(CONFIG_ARM_PTDUMP_DEBUGFS)
#include <asm/ptdump.h>

View File

@ -14,18 +14,18 @@
#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/mm_types.h>
#include <linux/pgalloc.h>
#include <linux/pgtable.h>
#include <linux/preempt.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/efi.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
static bool __init efi_virtmap_init(void)
{

View File

@ -217,7 +217,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
page = pfn_to_page(pfn);
svm_range_bo_ref(prange->svm_bo);
page->zone_device_data = prange->svm_bo;
zone_device_page_init(page);
zone_device_page_init(page, 0);
}
static void
@ -567,8 +567,9 @@ out:
return r < 0 ? r : 0;
}
static void svm_migrate_page_free(struct page *page)
static void svm_migrate_folio_free(struct folio *folio)
{
struct page *page = &folio->page;
struct svm_range_bo *svm_bo = page->zone_device_data;
if (svm_bo) {
@ -1008,7 +1009,7 @@ out_mmput:
}
static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
.page_free = svm_migrate_page_free,
.folio_free = svm_migrate_folio_free,
.migrate_to_ram = svm_migrate_to_ram,
};

View File

@ -196,7 +196,7 @@ static void drm_pagemap_get_devmem_page(struct page *page,
struct drm_pagemap_zdd *zdd)
{
page->zone_device_data = drm_pagemap_zdd_get(zdd);
zone_device_page_init(page);
zone_device_page_init(page, 0);
}
/**
@ -752,15 +752,15 @@ err_out:
}
/**
* drm_pagemap_page_free() - Put GPU SVM zone device data associated with a page
* @page: Pointer to the page
* drm_pagemap_folio_free() - Put GPU SVM zone device data associated with a folio
* @folio: Pointer to the folio
*
* This function is a callback used to put the GPU SVM zone device data
* associated with a page when it is being released.
*/
static void drm_pagemap_page_free(struct page *page)
static void drm_pagemap_folio_free(struct folio *folio)
{
drm_pagemap_zdd_put(page->zone_device_data);
drm_pagemap_zdd_put(folio->page.zone_device_data);
}
/**
@ -788,7 +788,7 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf)
}
static const struct dev_pagemap_ops drm_pagemap_pagemap_ops = {
.page_free = drm_pagemap_page_free,
.folio_free = drm_pagemap_folio_free,
.migrate_to_ram = drm_pagemap_migrate_to_ram,
};
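
The same .page_free to .folio_free conversion appears in the amdkfd, drm_pagemap, nouveau and p2pdma hunks. A minimal sketch of the new callback shape, with foo_put() and the other foo_* names standing in for driver specifics:

static void foo_folio_free(struct folio *folio)
{
        /* driver state still hangs off the (head) page's zone_device_data */
        void *data = folio->page.zone_device_data;

        foo_put(data);                          /* illustrative release */
}

static const struct dev_pagemap_ops foo_pgmap_ops = {
        .folio_free     = foo_folio_free,       /* was .page_free(struct page *) */
        .migrate_to_ram = foo_migrate_to_ram,
};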

View File

@ -425,7 +425,7 @@ static int __drm_universal_plane_init(struct drm_device *dev,
plane->modifier_count = format_modifier_count;
plane->modifiers = kmalloc_array(format_modifier_count,
sizeof(format_modifiers[0]),
sizeof(*plane->modifiers),
GFP_KERNEL);
if (format_modifier_count && !plane->modifiers) {

View File

@ -50,6 +50,7 @@
*/
#define DMEM_CHUNK_SIZE (2UL << 20)
#define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT)
#define NR_CHUNKS (128)
enum nouveau_aper {
NOUVEAU_APER_VIRT,
@ -83,9 +84,15 @@ struct nouveau_dmem {
struct list_head chunks;
struct mutex mutex;
struct page *free_pages;
struct folio *free_folios;
spinlock_t lock;
};
struct nouveau_dmem_dma_info {
dma_addr_t dma_addr;
size_t size;
};
static struct nouveau_dmem_chunk *nouveau_page_to_chunk(struct page *page)
{
return container_of(page_pgmap(page), struct nouveau_dmem_chunk,
@ -108,14 +115,20 @@ unsigned long nouveau_dmem_page_addr(struct page *page)
return chunk->bo->offset + off;
}
static void nouveau_dmem_page_free(struct page *page)
static void nouveau_dmem_folio_free(struct folio *folio)
{
struct page *page = &folio->page;
struct nouveau_dmem_chunk *chunk = nouveau_page_to_chunk(page);
struct nouveau_dmem *dmem = chunk->drm->dmem;
spin_lock(&dmem->lock);
page->zone_device_data = dmem->free_pages;
dmem->free_pages = page;
if (folio_order(folio)) {
page->zone_device_data = dmem->free_folios;
dmem->free_folios = folio;
} else {
page->zone_device_data = dmem->free_pages;
dmem->free_pages = page;
}
WARN_ON(!chunk->callocated);
chunk->callocated--;
@ -139,20 +152,28 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence)
}
}
static int nouveau_dmem_copy_one(struct nouveau_drm *drm, struct page *spage,
struct page *dpage, dma_addr_t *dma_addr)
static int nouveau_dmem_copy_folio(struct nouveau_drm *drm,
struct folio *sfolio, struct folio *dfolio,
struct nouveau_dmem_dma_info *dma_info)
{
struct device *dev = drm->dev->dev;
struct page *dpage = folio_page(dfolio, 0);
struct page *spage = folio_page(sfolio, 0);
lock_page(dpage);
folio_lock(dfolio);
*dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
dma_info->dma_addr = dma_map_page(dev, dpage, 0, page_size(dpage),
DMA_BIDIRECTIONAL);
dma_info->size = page_size(dpage);
if (dma_mapping_error(dev, dma_info->dma_addr))
return -EIO;
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) {
dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (drm->dmem->migrate.copy_func(drm, folio_nr_pages(sfolio),
NOUVEAU_APER_HOST, dma_info->dma_addr,
NOUVEAU_APER_VRAM,
nouveau_dmem_page_addr(spage))) {
dma_unmap_page(dev, dma_info->dma_addr, page_size(dpage),
DMA_BIDIRECTIONAL);
return -EIO;
}
@ -165,21 +186,48 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
struct nouveau_dmem *dmem = drm->dmem;
struct nouveau_fence *fence;
struct nouveau_svmm *svmm;
struct page *spage, *dpage;
unsigned long src = 0, dst = 0;
dma_addr_t dma_addr = 0;
struct page *dpage;
vm_fault_t ret = 0;
int err;
struct migrate_vma args = {
.vma = vmf->vma,
.start = vmf->address,
.end = vmf->address + PAGE_SIZE,
.src = &src,
.dst = &dst,
.pgmap_owner = drm->dev,
.fault_page = vmf->page,
.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
MIGRATE_VMA_SELECT_COMPOUND,
.src = NULL,
.dst = NULL,
};
unsigned int order, nr;
struct folio *sfolio, *dfolio;
struct nouveau_dmem_dma_info dma_info;
sfolio = page_folio(vmf->page);
order = folio_order(sfolio);
nr = 1 << order;
/*
* Handle partial unmap faults, where the folio is large, but
* the pmd is split.
*/
if (vmf->pte) {
order = 0;
nr = 1;
}
if (order)
args.flags |= MIGRATE_VMA_SELECT_COMPOUND;
args.start = ALIGN_DOWN(vmf->address, (PAGE_SIZE << order));
args.vma = vmf->vma;
args.end = args.start + (PAGE_SIZE << order);
args.src = kcalloc(nr, sizeof(*args.src), GFP_KERNEL);
args.dst = kcalloc(nr, sizeof(*args.dst), GFP_KERNEL);
if (!args.src || !args.dst) {
ret = VM_FAULT_OOM;
goto err;
}
/*
* FIXME what we really want is to find some heuristic to migrate more
* than just one page on CPU fault. When such fault happens it is very
@ -190,22 +238,28 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
if (!args.cpages)
return 0;
spage = migrate_pfn_to_page(src);
if (!spage || !(src & MIGRATE_PFN_MIGRATE))
if (order)
dpage = folio_page(vma_alloc_folio(GFP_HIGHUSER | __GFP_ZERO,
order, vmf->vma, vmf->address), 0);
else
dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vmf->vma,
vmf->address);
if (!dpage) {
ret = VM_FAULT_OOM;
goto done;
}
dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vmf->vma, vmf->address);
if (!dpage)
goto done;
args.dst[0] = migrate_pfn(page_to_pfn(dpage));
if (order)
args.dst[0] |= MIGRATE_PFN_COMPOUND;
dfolio = page_folio(dpage);
dst = migrate_pfn(page_to_pfn(dpage));
svmm = spage->zone_device_data;
svmm = folio_zone_device_data(sfolio);
mutex_lock(&svmm->mutex);
nouveau_svmm_invalidate(svmm, args.start, args.end);
ret = nouveau_dmem_copy_one(drm, spage, dpage, &dma_addr);
err = nouveau_dmem_copy_folio(drm, sfolio, dfolio, &dma_info);
mutex_unlock(&svmm->mutex);
if (ret) {
if (err) {
ret = VM_FAULT_SIGBUS;
goto done;
}
@ -213,25 +267,40 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
nouveau_fence_new(&fence, dmem->migrate.chan);
migrate_vma_pages(&args);
nouveau_dmem_fence_done(&fence);
dma_unmap_page(drm->dev->dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
dma_unmap_page(drm->dev->dev, dma_info.dma_addr, PAGE_SIZE,
DMA_BIDIRECTIONAL);
done:
migrate_vma_finalize(&args);
err:
kfree(args.src);
kfree(args.dst);
return ret;
}
static void nouveau_dmem_folio_split(struct folio *head, struct folio *tail)
{
if (tail == NULL)
return;
tail->pgmap = head->pgmap;
tail->mapping = head->mapping;
folio_set_zone_device_data(tail, folio_zone_device_data(head));
}
static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
.page_free = nouveau_dmem_page_free,
.folio_free = nouveau_dmem_folio_free,
.migrate_to_ram = nouveau_dmem_migrate_to_ram,
.folio_split = nouveau_dmem_folio_split,
};
static int
nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage,
bool is_large)
{
struct nouveau_dmem_chunk *chunk;
struct resource *res;
struct page *page;
void *ptr;
unsigned long i, pfn_first;
unsigned long i, pfn_first, pfn;
int ret;
chunk = kzalloc(sizeof(*chunk), GFP_KERNEL);
@ -241,7 +310,7 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
}
/* Allocate unused physical address space for device private pages. */
res = request_free_mem_region(&iomem_resource, DMEM_CHUNK_SIZE,
res = request_free_mem_region(&iomem_resource, DMEM_CHUNK_SIZE * NR_CHUNKS,
"nouveau_dmem");
if (IS_ERR(res)) {
ret = PTR_ERR(res);
@ -274,16 +343,40 @@ nouveau_dmem_chunk_alloc(struct nouveau_drm *drm, struct page **ppage)
pfn_first = chunk->pagemap.range.start >> PAGE_SHIFT;
page = pfn_to_page(pfn_first);
spin_lock(&drm->dmem->lock);
for (i = 0; i < DMEM_CHUNK_NPAGES - 1; ++i, ++page) {
page->zone_device_data = drm->dmem->free_pages;
drm->dmem->free_pages = page;
pfn = pfn_first;
for (i = 0; i < NR_CHUNKS; i++) {
int j;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || !is_large) {
for (j = 0; j < DMEM_CHUNK_NPAGES - 1; j++, pfn++) {
page = pfn_to_page(pfn);
page->zone_device_data = drm->dmem->free_pages;
drm->dmem->free_pages = page;
}
} else {
page = pfn_to_page(pfn);
page->zone_device_data = drm->dmem->free_folios;
drm->dmem->free_folios = page_folio(page);
pfn += DMEM_CHUNK_NPAGES;
}
}
*ppage = page;
/* Move to next page */
if (is_large) {
*ppage = &drm->dmem->free_folios->page;
drm->dmem->free_folios = (*ppage)->zone_device_data;
} else {
*ppage = drm->dmem->free_pages;
drm->dmem->free_pages = (*ppage)->zone_device_data;
}
chunk->callocated++;
spin_unlock(&drm->dmem->lock);
NV_INFO(drm, "DMEM: registered %ldMB of device memory\n",
DMEM_CHUNK_SIZE >> 20);
NV_INFO(drm, "DMEM: registered %ldMB of %sdevice memory %lx %lx\n",
NR_CHUNKS * DMEM_CHUNK_SIZE >> 20, is_large ? "THP " : "", pfn_first,
nouveau_dmem_page_addr(page));
return 0;
@ -298,27 +391,41 @@ out:
}
static struct page *
nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm, bool is_large)
{
struct nouveau_dmem_chunk *chunk;
struct page *page = NULL;
struct folio *folio = NULL;
int ret;
unsigned int order = 0;
spin_lock(&drm->dmem->lock);
if (drm->dmem->free_pages) {
if (is_large && drm->dmem->free_folios) {
folio = drm->dmem->free_folios;
page = &folio->page;
drm->dmem->free_folios = page->zone_device_data;
chunk = nouveau_page_to_chunk(&folio->page);
chunk->callocated++;
spin_unlock(&drm->dmem->lock);
order = ilog2(DMEM_CHUNK_NPAGES);
} else if (!is_large && drm->dmem->free_pages) {
page = drm->dmem->free_pages;
drm->dmem->free_pages = page->zone_device_data;
chunk = nouveau_page_to_chunk(page);
chunk->callocated++;
spin_unlock(&drm->dmem->lock);
folio = page_folio(page);
} else {
spin_unlock(&drm->dmem->lock);
ret = nouveau_dmem_chunk_alloc(drm, &page);
ret = nouveau_dmem_chunk_alloc(drm, &page, is_large);
if (ret)
return NULL;
folio = page_folio(page);
if (is_large)
order = ilog2(DMEM_CHUNK_NPAGES);
}
zone_device_page_init(page);
zone_device_folio_init(folio, order);
return page;
}
@ -369,12 +476,12 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
{
unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
unsigned long *src_pfns, *dst_pfns;
dma_addr_t *dma_addrs;
struct nouveau_dmem_dma_info *dma_info;
struct nouveau_fence *fence;
src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);
dma_addrs = kvcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL | __GFP_NOFAIL);
dma_info = kvcalloc(npages, sizeof(*dma_info), GFP_KERNEL | __GFP_NOFAIL);
migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
npages);
@ -382,17 +489,28 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
for (i = 0; i < npages; i++) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
struct page *dpage;
struct folio *folio = page_folio(
migrate_pfn_to_page(src_pfns[i]));
unsigned int order = folio_order(folio);
if (src_pfns[i] & MIGRATE_PFN_COMPOUND) {
dpage = folio_page(
folio_alloc(
GFP_HIGHUSER_MOVABLE, order), 0);
} else {
/*
* _GFP_NOFAIL because the GPU is going away and there
* is nothing sensible we can do if we can't copy the
* data back.
*/
dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
}
/*
* _GFP_NOFAIL because the GPU is going away and there
* is nothing sensible we can do if we can't copy the
* data back.
*/
dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
nouveau_dmem_copy_one(chunk->drm,
migrate_pfn_to_page(src_pfns[i]), dpage,
&dma_addrs[i]);
nouveau_dmem_copy_folio(chunk->drm,
page_folio(migrate_pfn_to_page(src_pfns[i])),
page_folio(dpage),
&dma_info[i]);
}
}
@ -403,8 +521,9 @@ nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
kvfree(src_pfns);
kvfree(dst_pfns);
for (i = 0; i < npages; i++)
dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
kvfree(dma_addrs);
dma_unmap_page(chunk->drm->dev->dev, dma_info[i].dma_addr,
dma_info[i].size, DMA_BIDIRECTIONAL);
kvfree(dma_info);
}
void
@ -607,31 +726,36 @@ nouveau_dmem_init(struct nouveau_drm *drm)
static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
struct nouveau_svmm *svmm, unsigned long src,
dma_addr_t *dma_addr, u64 *pfn)
struct nouveau_dmem_dma_info *dma_info, u64 *pfn)
{
struct device *dev = drm->dev->dev;
struct page *dpage, *spage;
unsigned long paddr;
bool is_large = false;
unsigned long mpfn;
spage = migrate_pfn_to_page(src);
if (!(src & MIGRATE_PFN_MIGRATE))
goto out;
dpage = nouveau_dmem_page_alloc_locked(drm);
is_large = src & MIGRATE_PFN_COMPOUND;
dpage = nouveau_dmem_page_alloc_locked(drm, is_large);
if (!dpage)
goto out;
paddr = nouveau_dmem_page_addr(dpage);
if (spage) {
*dma_addr = dma_map_page(dev, spage, 0, page_size(spage),
dma_info->dma_addr = dma_map_page(dev, spage, 0, page_size(spage),
DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
dma_info->size = page_size(spage);
if (dma_mapping_error(dev, dma_info->dma_addr))
goto out_free_page;
if (drm->dmem->migrate.copy_func(drm, 1,
NOUVEAU_APER_VRAM, paddr, NOUVEAU_APER_HOST, *dma_addr))
if (drm->dmem->migrate.copy_func(drm, folio_nr_pages(page_folio(spage)),
NOUVEAU_APER_VRAM, paddr, NOUVEAU_APER_HOST,
dma_info->dma_addr))
goto out_dma_unmap;
} else {
*dma_addr = DMA_MAPPING_ERROR;
dma_info->dma_addr = DMA_MAPPING_ERROR;
if (drm->dmem->migrate.clear_func(drm, page_size(dpage),
NOUVEAU_APER_VRAM, paddr))
goto out_free_page;
@ -642,10 +766,13 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
if (src & MIGRATE_PFN_WRITE)
*pfn |= NVIF_VMM_PFNMAP_V0_W;
return migrate_pfn(page_to_pfn(dpage));
mpfn = migrate_pfn(page_to_pfn(dpage));
if (folio_order(page_folio(dpage)))
mpfn |= MIGRATE_PFN_COMPOUND;
return mpfn;
out_dma_unmap:
dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
dma_unmap_page(dev, dma_info->dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
out_free_page:
nouveau_dmem_page_free_locked(drm, dpage);
out:
@ -655,27 +782,38 @@ out:
static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm,
struct nouveau_svmm *svmm, struct migrate_vma *args,
dma_addr_t *dma_addrs, u64 *pfns)
struct nouveau_dmem_dma_info *dma_info, u64 *pfns)
{
struct nouveau_fence *fence;
unsigned long addr = args->start, nr_dma = 0, i;
unsigned long order = 0;
for (i = 0; addr < args->end; ) {
struct folio *folio;
for (i = 0; addr < args->end; i++) {
args->dst[i] = nouveau_dmem_migrate_copy_one(drm, svmm,
args->src[i], dma_addrs + nr_dma, pfns + i);
if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma]))
args->src[i], dma_info + nr_dma, pfns + i);
if (!args->dst[i]) {
i++;
addr += PAGE_SIZE;
continue;
}
if (!dma_mapping_error(drm->dev->dev, dma_info[nr_dma].dma_addr))
nr_dma++;
addr += PAGE_SIZE;
folio = page_folio(migrate_pfn_to_page(args->dst[i]));
order = folio_order(folio);
i += 1 << order;
addr += (1 << order) * PAGE_SIZE;
}
nouveau_fence_new(&fence, drm->dmem->migrate.chan);
migrate_vma_pages(args);
nouveau_dmem_fence_done(&fence);
nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i);
nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i, order);
while (nr_dma--) {
dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE,
DMA_BIDIRECTIONAL);
dma_unmap_page(drm->dev->dev, dma_info[nr_dma].dma_addr,
dma_info[nr_dma].size, DMA_BIDIRECTIONAL);
}
migrate_vma_finalize(args);
}
@ -688,20 +826,27 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
unsigned long end)
{
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages);
dma_addr_t *dma_addrs;
unsigned long max = npages;
struct migrate_vma args = {
.vma = vma,
.start = start,
.pgmap_owner = drm->dev,
.flags = MIGRATE_VMA_SELECT_SYSTEM,
.flags = MIGRATE_VMA_SELECT_SYSTEM
| MIGRATE_VMA_SELECT_COMPOUND,
};
unsigned long i;
u64 *pfns;
int ret = -ENOMEM;
struct nouveau_dmem_dma_info *dma_info;
if (drm->dmem == NULL)
return -ENODEV;
if (drm->dmem == NULL) {
ret = -ENODEV;
goto out;
}
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
if (max > (unsigned long)HPAGE_PMD_NR)
max = (unsigned long)HPAGE_PMD_NR;
args.src = kcalloc(max, sizeof(*args.src), GFP_KERNEL);
if (!args.src)
@ -710,8 +855,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
if (!args.dst)
goto out_free_src;
dma_addrs = kmalloc_array(max, sizeof(*dma_addrs), GFP_KERNEL);
if (!dma_addrs)
dma_info = kmalloc_array(max, sizeof(*dma_info), GFP_KERNEL);
if (!dma_info)
goto out_free_dst;
pfns = nouveau_pfns_alloc(max);
@ -729,7 +874,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
goto out_free_pfns;
if (args.cpages)
nouveau_dmem_migrate_chunk(drm, svmm, &args, dma_addrs,
nouveau_dmem_migrate_chunk(drm, svmm, &args, dma_info,
pfns);
args.start = args.end;
}
@ -738,7 +883,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
out_free_pfns:
nouveau_pfns_free(pfns);
out_free_dma:
kfree(dma_addrs);
kfree(dma_info);
out_free_dst:
kfree(args.dst);
out_free_src:

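Condensing the nouveau changes above: the caller opts in with MIGRATE_VMA_SELECT_COMPOUND in migrate_vma.flags, device-side folios are initialised with zone_device_folio_init(folio, order), and a compound destination is flagged on its first dst entry. A hedged sketch of that destination-allocation step, using only helpers visible in the hunks; foo_alloc_migrate_dst() itself is illustrative:

static vm_fault_t foo_alloc_migrate_dst(struct vm_fault *vmf,
                                        struct migrate_vma *args,
                                        unsigned int order)
{
        struct page *dpage;

        if (order) {
                /* destination must match the order of the compound source */
                struct folio *f = vma_alloc_folio(GFP_HIGHUSER | __GFP_ZERO,
                                                  order, vmf->vma, vmf->address);

                dpage = f ? folio_page(f, 0) : NULL;
        } else {
                dpage = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vmf->vma,
                                       vmf->address);
        }
        if (!dpage)
                return VM_FAULT_OOM;

        args->dst[0] = migrate_pfn(page_to_pfn(dpage));
        if (order)
                args->dst[0] |= MIGRATE_PFN_COMPOUND;  /* one entry covers the folio */
        return 0;
}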
View File

@ -921,12 +921,14 @@ nouveau_pfns_free(u64 *pfns)
void
nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
unsigned long addr, u64 *pfns, unsigned long npages)
unsigned long addr, u64 *pfns, unsigned long npages,
unsigned int page_shift)
{
struct nouveau_pfnmap_args *args = nouveau_pfns_to_args(pfns);
args->p.addr = addr;
args->p.size = npages << PAGE_SHIFT;
args->p.size = npages << page_shift;
args->p.page = page_shift;
mutex_lock(&svmm->mutex);

View File

@ -33,7 +33,8 @@ void nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit);
u64 *nouveau_pfns_alloc(unsigned long npages);
void nouveau_pfns_free(u64 *pfns);
void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
unsigned long addr, u64 *pfns, unsigned long npages);
unsigned long addr, u64 *pfns, unsigned long npages,
unsigned int page_shift);
#else /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
static inline void nouveau_svm_init(struct nouveau_drm *drm) {}
static inline void nouveau_svm_fini(struct nouveau_drm *drm) {}

View File

@ -10,6 +10,8 @@
#include "iommu-priv.h"
static DEFINE_MUTEX(iommu_sva_lock);
static bool iommu_sva_present;
static LIST_HEAD(iommu_sva_mms);
static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
struct mm_struct *mm);
@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
return ERR_PTR(-ENOSPC);
}
iommu_mm->pasid = pasid;
iommu_mm->mm = mm;
INIT_LIST_HEAD(&iommu_mm->sva_domains);
/*
* Make sure the write to mm->iommu_mm is not reordered in front of
@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
if (ret)
goto out_free_domain;
domain->users = 1;
list_add(&domain->next, &mm->iommu_mm->sva_domains);
if (list_empty(&iommu_mm->sva_domains)) {
if (list_empty(&iommu_sva_mms))
iommu_sva_present = true;
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
}
list_add(&domain->next, &iommu_mm->sva_domains);
out:
refcount_set(&handle->users, 1);
mutex_unlock(&iommu_sva_lock);
@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
list_del(&domain->next);
iommu_domain_free(domain);
}
if (list_empty(&iommu_mm->sva_domains)) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
iommu_sva_present = false;
}
mutex_unlock(&iommu_sva_lock);
kfree(handle);
}
@ -312,3 +327,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
return domain;
}
void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
{
struct iommu_mm_data *iommu_mm;
guard(mutex)(&iommu_sva_lock);
if (!iommu_sva_present)
return;
list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
}
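
A purely illustrative call site for the new export: after a kernel mapping is torn down, SVA-bound devices sharing init_mm can be told to drop any stale IOTLB entries for that range. Only iommu_sva_invalidate_kva_range() comes from the hunk above; the foo_* wrapper is hypothetical:

static void foo_free_kernel_mapping(void *addr, unsigned long size)
{
        unsigned long start = (unsigned long)addr;

        vunmap(addr);   /* tears down the mapping and flushes the CPU TLB */

        /* also shoot down IOTLB entries of devices bound to init_mm via SVA */
        iommu_sva_invalidate_kva_range(start, start + size);
}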

View File

@ -1212,5 +1212,5 @@ void iris_hfi_gen2_command_ops_init(struct iris_core *core)
struct iris_inst *iris_hfi_gen2_get_instance(void)
{
return kzalloc(sizeof(struct iris_inst_hfi_gen2), GFP_KERNEL);
return (struct iris_inst *)kzalloc(sizeof(struct iris_inst_hfi_gen2), GFP_KERNEL);
}

View File

@ -200,8 +200,9 @@ static const struct attribute_group p2pmem_group = {
.name = "p2pmem",
};
static void p2pdma_page_free(struct page *page)
static void p2pdma_folio_free(struct folio *folio)
{
struct page *page = &folio->page;
struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
/* safe to dereference while a reference is held to the percpu ref */
struct pci_p2pdma *p2pdma = rcu_dereference_protected(
@ -214,7 +215,7 @@ static void p2pdma_page_free(struct page *page)
}
static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
.page_free = p2pdma_page_free,
.folio_free = p2pdma_folio_free,
};
static void pci_p2pdma_release(void *data)

View File

@ -16,8 +16,7 @@
#include <linux/vmalloc.h>
#include <linux/async.h>
#include <linux/mutex.h>
#include <asm/pgalloc.h>
#include <linux/pgalloc.h>
#include "sclp.h"

View File

@ -17,8 +17,6 @@
#define IOVA_START_PFN 1
#define INVALID_PHYS_ADDR (~(phys_addr_t)0)
#define BOUNCE_MAP_SHIFT 12
#define BOUNCE_MAP_SIZE (1 << BOUNCE_MAP_SHIFT)
#define BOUNCE_MAP_MASK (~(BOUNCE_MAP_SIZE - 1))

View File

@ -598,7 +598,7 @@ static void detach_attrs(struct config_item * item)
static int populate_attrs(struct config_item *item)
{
const struct config_item_type *t = item->ci_type;
struct configfs_group_operations *ops;
const struct configfs_group_operations *ops;
struct configfs_attribute *attr;
struct configfs_bin_attribute *bin_attr;
int error = 0;

View File

@ -30,7 +30,7 @@ struct configfs_buffer {
size_t count;
loff_t pos;
char * page;
struct configfs_item_operations * ops;
const struct configfs_item_operations *ops;
struct mutex mutex;
int needs_read_fill;
bool read_in_progress;

View File

@ -24,7 +24,7 @@
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>
#include <linux/pgalloc.h>
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

View File

@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
#define PGOFF_LOFFT_MAX \
(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
{
/* Unfortunately we have to reassign vma->vm_private_data. */
return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
}
static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
{
struct file *file = desc->file;
struct inode *inode = file_inode(file);
loff_t len, vma_len;
int ret;
@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
vma->vm_ops = &hugetlb_vm_ops;
desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
desc->vm_ops = &hugetlb_vm_ops;
/*
* page based offset in vm_pgoff could be sufficiently large to
@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* sizeof(unsigned long). So, only check in those instances.
*/
if (sizeof(unsigned long) == sizeof(loff_t)) {
if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
if (desc->pgoff & PGOFF_LOFFT_MAX)
return -EINVAL;
}
/* must be huge page aligned */
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
vma_len = (loff_t)vma_desc_size(desc);
len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
/* check for overflow */
if (len < vma_len)
return -EINVAL;
@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
vm_flags = vma->vm_flags;
vm_flags = desc->vm_flags;
/*
* for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
* reserving here. Note: only for SHM hugetlbfs file, the inode
@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vm_flags |= VM_NORESERVE;
if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vm_flags) < 0)
desc->pgoff >> huge_page_order(h),
len >> huge_page_shift(h), desc,
vm_flags) < 0)
goto out;
ret = 0;
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
i_size_write(inode, len);
out:
inode_unlock(inode);
if (!ret) {
/* Allocate the VMA lock after we set it up. */
desc->action.success_hook = hugetlb_file_mmap_prepare_success;
/*
* We cannot permit the rmap finding this VMA in the time
* between the VMA being inserted into the VMA tree and the
* completion/success hook being invoked.
*
* This is because we establish a per-VMA hugetlb lock which can
* be raced by rmap.
*/
desc->action.hide_from_rmap_until_complete = true;
}
return ret;
}
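
Distilling the two-stage setup above into a generic form: work that needs the real, inserted VMA is pushed to a success hook, and the VMA is kept away from rmap until that hook has run. The foo_* names are illustrative; the desc->action fields are the ones hugetlbfs uses:

static int foo_mmap_success(const struct vm_area_struct *vma)
{
        /* runs once the VMA is in the tree; may set up per-VMA state */
        return foo_alloc_vma_state((struct vm_area_struct *)vma);
}

static int foo_mmap_prepare(struct vm_area_desc *desc)
{
        desc->vm_ops = &foo_vm_ops;
        desc->vm_flags |= VM_DONTEXPAND;

        /* defer anything that needs vma->vm_private_data to the hook */
        desc->action.success_hook = foo_mmap_success;
        /* keep rmap from seeing a half-initialised VMA, as hugetlbfs does */
        desc->action.hide_from_rmap_until_complete = true;
        return 0;
}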
@ -184,8 +204,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr)
addr0 = ALIGN(addr, huge_page_size(h));
return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff,
flags, 0);
return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0);
}
/*
@ -1221,7 +1240,7 @@ static void init_once(void *foo)
static const struct file_operations hugetlbfs_file_operations = {
.read_iter = hugetlbfs_read_iter,
.mmap = hugetlbfs_file_mmap,
.mmap_prepare = hugetlbfs_file_mmap_prepare,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,

View File

@ -379,7 +379,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc)
if (rw) {
u64 to = min_t(loff_t, i_size_read(inode),
from + desc->end - desc->start);
from + vma_desc_size(desc));
if (is_sparsed(ni)) {
/* Allocate clusters for rw map. */

View File

@ -1481,31 +1481,16 @@ static struct file_system_type pipe_fs_type = {
};
#ifdef CONFIG_SYSCTL
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
unsigned int *valp,
int write, void *data)
{
if (write) {
unsigned int val;
val = round_pipe_size(*lvalp);
if (val == 0)
return -EINVAL;
*valp = val;
} else {
unsigned int val = *valp;
*lvalp = (unsigned long) val;
}
return 0;
}
static SYSCTL_USER_TO_KERN_UINT_CONV(_pipe_maxsz, round_pipe_size)
static SYSCTL_UINT_CONV_CUSTOM(_pipe_maxsz,
sysctl_user_to_kern_uint_conv_pipe_maxsz,
sysctl_kern_to_user_uint_conv, true)
static int proc_dopipe_max_size(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
return do_proc_douintvec(table, write, buffer, lenp, ppos,
do_proc_dopipe_max_size_conv, NULL);
return proc_douintvec_conv(table, write, buffer, lenp, ppos,
do_proc_uint_conv_pipe_maxsz);
}
static const struct ctl_table fs_pipe_sysctls[] = {
@ -1515,6 +1500,7 @@ static const struct ctl_table fs_pipe_sysctls[] = {
.maxlen = sizeof(pipe_max_size),
.mode = 0644,
.proc_handler = proc_dopipe_max_size,
.extra1 = SYSCTL_ONE,
},
{
.procname = "pipe-user-pages-hard",

View File

@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
#ifdef CONFIG_MMU
return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags);
#endif
return orig_addr;

View File

@ -14,7 +14,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/sched/mm.h>
#include <linux/swapops.h>
#include <linux/leafops.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
@ -1017,14 +1017,16 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
young = pte_young(ptent);
dirty = pte_dirty(ptent);
present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
} else if (pte_none(ptent)) {
smaps_pte_hole_lookup(addr, walk);
} else {
const softleaf_t entry = softleaf_from_pte(ptent);
if (!non_swap_entry(swpent)) {
if (softleaf_is_swap(entry)) {
int mapcount;
mss->swap += PAGE_SIZE;
mapcount = swp_swapcount(swpent);
mapcount = swp_swapcount(entry);
if (mapcount >= 2) {
u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
@ -1033,14 +1035,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
} else if (is_pfn_swap_entry(swpent)) {
if (is_device_private_entry(swpent))
} else if (softleaf_has_pfn(entry)) {
if (softleaf_is_device_private(entry))
present = true;
page = pfn_swap_entry_to_page(swpent);
page = softleaf_to_page(entry);
}
} else {
smaps_pte_hole_lookup(addr, walk);
return;
}
if (!page)
@ -1060,14 +1059,16 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
bool present = false;
struct folio *folio;
if (pmd_none(*pmd))
return;
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
present = true;
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
} else if (unlikely(thp_migration_supported())) {
const softleaf_t entry = softleaf_from_pmd(*pmd);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
if (softleaf_has_pfn(entry))
page = softleaf_to_page(entry);
}
if (IS_ERR_OR_NULL(page))
return;
@ -1146,6 +1147,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MAYSHARE)] = "ms",
[ilog2(VM_GROWSDOWN)] = "gd",
[ilog2(VM_PFNMAP)] = "pf",
[ilog2(VM_MAYBE_GUARD)] = "gu",
[ilog2(VM_LOCKED)] = "lo",
[ilog2(VM_IO)] = "io",
[ilog2(VM_SEQ_READ)] = "sr",
@ -1181,10 +1183,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
#if VM_PKEY_BIT3
#if CONFIG_ARCH_PKEY_BITS > 3
[ilog2(VM_PKEY_BIT3)] = "",
#endif
#if VM_PKEY_BIT4
#if CONFIG_ARCH_PKEY_BITS > 4
[ilog2(VM_PKEY_BIT4)] = "",
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
@ -1230,11 +1232,11 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
} else {
const softleaf_t entry = softleaf_from_pte(ptent);
if (is_pfn_swap_entry(swpent))
folio = pfn_swap_entry_folio(swpent);
if (softleaf_has_pfn(entry))
folio = softleaf_to_folio(entry);
}
if (folio) {
@ -1582,8 +1584,6 @@ struct clear_refs_private {
enum clear_refs_types type;
};
#ifdef CONFIG_MEM_SOFT_DIRTY
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
struct folio *folio;
@ -1603,6 +1603,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
if (!pgtable_supports_soft_dirty())
return;
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
@ -1611,6 +1613,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
*/
pte_t ptent = ptep_get(pte);
if (pte_none(ptent))
return;
if (pte_present(ptent)) {
pte_t old_pte;
@ -1620,24 +1625,21 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
} else {
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
}
#else
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
}
#endif
#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old, pmd = *pmdp;
if (!pgtable_supports_soft_dirty())
return;
if (pmd_present(pmd)) {
/* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp);
@ -1650,7 +1652,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
pmd = pmd_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
} else if (pmd_is_migration_entry(pmd)) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@ -1923,6 +1925,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct page *page = NULL;
struct folio *folio;
if (pte_none(pte))
goto out;
if (pte_present(pte)) {
if (pm->show_pfn)
frame = pte_pfn(pte);
@ -1932,32 +1937,34 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_SOFT_DIRTY;
if (pte_uffd_wp(pte))
flags |= PM_UFFD_WP;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
} else {
softleaf_t entry;
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
entry = softleaf_from_pte(pte);
if (pm->show_pfn) {
pgoff_t offset;
/*
* For PFN swap offsets, keeping the offset field
* to be PFN only to be compatible with old smaps.
*/
if (is_pfn_swap_entry(entry))
offset = swp_offset_pfn(entry);
if (softleaf_has_pfn(entry))
offset = softleaf_to_pfn(entry);
else
offset = swp_offset(entry);
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
if (pte_marker_entry_uffd_wp(entry))
if (softleaf_has_pfn(entry))
page = softleaf_to_page(entry);
if (softleaf_is_uffd_wp_marker(entry))
flags |= PM_UFFD_WP;
if (is_guard_swp_entry(entry))
if (softleaf_is_guard_marker(entry))
flags |= PM_GUARD_REGION;
}
@ -1969,12 +1976,93 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
__folio_page_mapped_exclusively(folio, page))
flags |= PM_MMAP_EXCLUSIVE;
}
out:
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
return make_pme(frame, flags);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr,
unsigned long end, struct vm_area_struct *vma,
struct pagemapread *pm)
{
unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
struct page *page = NULL;
struct folio *folio = NULL;
int err = 0;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
if (pmd_none(pmd))
goto populate_pagemap;
if (pmd_present(pmd)) {
page = pmd_page(pmd);
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
if (pmd_uffd_wp(pmd))
flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) + idx;
} else if (thp_migration_supported()) {
const softleaf_t entry = softleaf_from_pmd(pmd);
unsigned long offset;
if (pm->show_pfn) {
if (softleaf_has_pfn(entry))
offset = softleaf_to_pfn(entry) + idx;
else
offset = swp_offset(entry) + idx;
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd));
page = softleaf_to_page(entry);
}
if (page) {
folio = page_folio(page);
if (!folio_test_anon(folio))
flags |= PM_FILE;
}
populate_pagemap:
for (; addr != end; addr += PAGE_SIZE, idx++) {
u64 cur_flags = flags;
pagemap_entry_t pme;
if (folio && (flags & PM_PRESENT) &&
__folio_page_mapped_exclusively(folio, page))
cur_flags |= PM_MMAP_EXCLUSIVE;
pme = make_pme(frame, cur_flags);
err = add_to_pagemap(&pme, pm);
if (err)
break;
if (pm->show_pfn) {
if (flags & PM_PRESENT)
frame++;
else if (flags & PM_SWAP)
frame += (1 << MAX_SWAPFILES_SHIFT);
}
}
return err;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
@ -1983,82 +2071,15 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
struct page *page = NULL;
struct folio *folio = NULL;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
if (pmd_present(pmd)) {
page = pmd_page(pmd);
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
if (pmd_uffd_wp(pmd))
flags |= PM_UFFD_WP;
if (pm->show_pfn)
frame = pmd_pfn(pmd) + idx;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
unsigned long offset;
if (pm->show_pfn) {
if (is_pfn_swap_entry(entry))
offset = swp_offset_pfn(entry) + idx;
else
offset = swp_offset(entry) + idx;
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
page = pfn_swap_entry_to_page(entry);
}
#endif
if (page) {
folio = page_folio(page);
if (!folio_test_anon(folio))
flags |= PM_FILE;
}
for (; addr != end; addr += PAGE_SIZE, idx++) {
u64 cur_flags = flags;
pagemap_entry_t pme;
if (folio && (flags & PM_PRESENT) &&
__folio_page_mapped_exclusively(folio, page))
cur_flags |= PM_MMAP_EXCLUSIVE;
pme = make_pme(frame, cur_flags);
err = add_to_pagemap(&pme, pm);
if (err)
break;
if (pm->show_pfn) {
if (flags & PM_PRESENT)
frame++;
else if (flags & PM_SWAP)
frame += (1 << MAX_SWAPFILES_SHIFT);
}
}
err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm);
spin_unlock(ptl);
return err;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
/*
* We can assume that @vma always points to a valid one and @end never
@ -2310,12 +2331,16 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
struct vm_area_struct *vma,
unsigned long addr, pte_t pte)
{
unsigned long categories = 0;
unsigned long categories;
if (pte_none(pte))
return 0;
if (pte_present(pte)) {
struct page *page;
categories |= PAGE_IS_PRESENT;
categories = PAGE_IS_PRESENT;
if (!pte_uffd_wp(pte))
categories |= PAGE_IS_WRITTEN;
@ -2329,19 +2354,20 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_PFNZERO;
if (pte_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t swp;
} else {
softleaf_t entry;
categories = PAGE_IS_SWAPPED;
categories |= PAGE_IS_SWAPPED;
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
swp = pte_to_swp_entry(pte);
if (is_guard_swp_entry(swp))
entry = softleaf_from_pte(pte);
if (softleaf_is_guard_marker(entry))
categories |= PAGE_IS_GUARD;
else if ((p->masks_of_interest & PAGE_IS_FILE) &&
is_pfn_swap_entry(swp) &&
!folio_test_anon(pfn_swap_entry_folio(swp)))
softleaf_has_pfn(entry) &&
!folio_test_anon(softleaf_to_folio(entry)))
categories |= PAGE_IS_FILE;
if (pte_swp_soft_dirty(pte))
@ -2360,12 +2386,12 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma,
old_pte = ptep_modify_prot_start(vma, addr, pte);
ptent = pte_mkuffd_wp(old_pte);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_mkuffd_wp(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
} else {
} else if (pte_none(ptent)) {
set_pte_at(vma->vm_mm, addr, pte,
make_pte_marker(PTE_MARKER_UFFD_WP));
} else {
ptent = pte_swp_mkuffd_wp(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
}
@ -2376,6 +2402,9 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
{
unsigned long categories = PAGE_IS_HUGE;
if (pmd_none(pmd))
return categories;
if (pmd_present(pmd)) {
struct page *page;
@ -2393,9 +2422,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_PFNZERO;
if (pmd_soft_dirty(pmd))
categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pmd(pmd)) {
swp_entry_t swp;
} else {
categories |= PAGE_IS_SWAPPED;
if (!pmd_swp_uffd_wp(pmd))
categories |= PAGE_IS_WRITTEN;
@ -2403,9 +2430,10 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_SOFT_DIRTY;
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
if (is_pfn_swap_entry(swp) &&
!folio_test_anon(pfn_swap_entry_folio(swp)))
const softleaf_t entry = softleaf_from_pmd(pmd);
if (softleaf_has_pfn(entry) &&
!folio_test_anon(softleaf_to_folio(entry)))
categories |= PAGE_IS_FILE;
}
}
@ -2422,7 +2450,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma,
old = pmdp_invalidate_ad(vma, addr, pmdp);
pmd = pmd_mkuffd_wp(old);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
} else if (pmd_is_migration_entry(pmd)) {
pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
@ -2434,6 +2462,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
{
unsigned long categories = PAGE_IS_HUGE;
if (pte_none(pte))
return categories;
/*
* According to pagemap_hugetlb_range(), file-backed HugeTLB
* page cannot be swapped. So PAGE_IS_FILE is not checked for
@ -2441,6 +2472,7 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
*/
if (pte_present(pte)) {
categories |= PAGE_IS_PRESENT;
if (!huge_pte_uffd_wp(pte))
categories |= PAGE_IS_WRITTEN;
if (!PageAnon(pte_page(pte)))
@ -2449,8 +2481,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
categories |= PAGE_IS_PFNZERO;
if (pte_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
} else {
categories |= PAGE_IS_SWAPPED;
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
if (pte_swp_soft_dirty(pte))
@ -2464,22 +2497,25 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t ptent)
{
unsigned long psize;
const unsigned long psize = huge_page_size(hstate_vma(vma));
softleaf_t entry;
if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent))
return;
psize = huge_page_size(hstate_vma(vma));
if (is_hugetlb_entry_migration(ptent))
set_huge_pte_at(vma->vm_mm, addr, ptep,
pte_swp_mkuffd_wp(ptent), psize);
else if (!huge_pte_none(ptent))
huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
huge_pte_mkuffd_wp(ptent));
else
if (huge_pte_none(ptent)) {
set_huge_pte_at(vma->vm_mm, addr, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP), psize);
return;
}
entry = softleaf_from_pte(ptent);
if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry))
return;
if (softleaf_is_migration(entry))
set_huge_pte_at(vma->vm_mm, addr, ptep,
pte_swp_mkuffd_wp(ptent), psize);
else
huge_ptep_modify_prot_commit(vma, addr, ptep, ptent,
huge_pte_mkuffd_wp(ptent));
}
#endif /* CONFIG_HUGETLB_PAGE */
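
The task_mmu.c changes above all follow the same softleaf pattern from <linux/leafops.h>: a not-none, not-present entry is decoded once and then classified with softleaf_* predicates instead of the old is_swap_pte()/pte_to_swp_entry() dance. A minimal sketch using only helpers that appear in the hunks; foo_inspect_pte() itself is illustrative:

static void foo_inspect_pte(pte_t pte)
{
        softleaf_t entry;

        if (pte_none(pte) || pte_present(pte))
                return;         /* only non-present "leaf" entries below */

        entry = softleaf_from_pte(pte);
        if (softleaf_is_swap(entry))
                pr_debug("genuine swap entry\n");
        else if (softleaf_is_uffd_wp_marker(entry) || softleaf_is_guard_marker(entry))
                pr_debug("marker entry\n");
        else if (softleaf_has_pfn(entry))
                pr_debug("pfn-backed entry, folio %p\n", softleaf_to_folio(entry));
}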

View File

@ -864,6 +864,8 @@ static int ramoops_probe(struct platform_device *pdev)
ramoops_console_size = pdata->console_size;
ramoops_pmsg_size = pdata->pmsg_size;
ramoops_ftrace_size = pdata->ftrace_size;
mem_type = pdata->mem_type;
ramoops_ecc = pdata->ecc_info.ecc_size;
pr_info("using 0x%lx@0x%llx, ecc: %d\n",
cxt->size, (unsigned long long)cxt->phys_addr,

View File

@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len, unsigned long pgoff,
unsigned long flags)
{
return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
return mm_get_unmapped_area(file, addr, len, pgoff, flags);
}
const struct file_operations ramfs_file_operations = {

View File

@ -995,10 +995,11 @@ static const struct vm_operations_struct pseudo_mmap_ops = {
.mremap = pseudo_lock_dev_mremap,
};
static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc)
{
unsigned long vsize = vma->vm_end - vma->vm_start;
unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
unsigned long off = desc->pgoff << PAGE_SHIFT;
unsigned long vsize = vma_desc_size(desc);
struct file *filp = desc->file;
struct pseudo_lock_region *plr;
struct rdtgroup *rdtgrp;
unsigned long physical;
@ -1043,7 +1044,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* Ensure changes are carried directly to the memory being mapped,
* do not allow copy-on-write mapping.
*/
if (!(vma->vm_flags & VM_SHARED)) {
if (!(desc->vm_flags & VM_SHARED)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
@ -1055,12 +1056,9 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
memset(plr->kmem + off, 0, vsize);
if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
vsize, vma->vm_page_prot)) {
mutex_unlock(&rdtgroup_mutex);
return -EAGAIN;
}
vma->vm_ops = &pseudo_mmap_ops;
desc->vm_ops = &pseudo_mmap_ops;
mmap_action_remap_full(desc, physical + desc->pgoff);
mutex_unlock(&rdtgroup_mutex);
return 0;
}
@ -1071,7 +1069,7 @@ static const struct file_operations pseudo_lock_dev_fops = {
.write = NULL,
.open = pseudo_lock_dev_open,
.release = pseudo_lock_dev_release,
.mmap = pseudo_lock_dev_mmap,
.mmap_prepare = pseudo_lock_dev_mmap_prepare,
};
int rdt_pseudo_lock_init(void)

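The resctrl conversion also shows the deferred-remap helper: since ->mmap_prepare() has no VMA to hand to remap_pfn_range(), the remap is recorded on the descriptor and performed by the core once the VMA exists. A sketch, assuming mmap_action_remap_full(desc, pfn) takes the starting PFN as in the hunk above; the foo_* names are illustrative:

static int foo_mmap_prepare(struct vm_area_desc *desc)
{
        unsigned long pfn = foo_region_base_pfn + desc->pgoff;

        if (!(desc->vm_flags & VM_SHARED))
                return -EINVAL;

        desc->vm_ops = &foo_vm_ops;
        mmap_action_remap_full(desc, pfn);  /* replaces remap_pfn_range(vma, ...) */
        return 0;
}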
View File

@ -29,7 +29,7 @@
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/leafops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>
@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
{
struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
bool ret = true;
assert_fault_locked(vmf);
ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
return true;
ret = false;
pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);
/*
* Lockless access: we're in a wait_event so it's ok if it
* changes under us. PTE markers should be handled the same as none
* ptes here.
* changes under us.
*/
/* Entry is still missing, wait for userspace to resolve the fault. */
if (huge_pte_none(pte))
return true;
/* UFFD PTE markers require userspace to resolve the fault. */
if (pte_is_uffd_marker(pte))
return true;
/*
* If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
* resolve the fault.
*/
if (huge_pte_none_mostly(pte))
ret = true;
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
ret = true;
out:
return ret;
return true;
return false;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
struct vm_fault *vmf,
unsigned long reason)
{
return false; /* should never get here */
/* Should never get here. */
VM_WARN_ON_ONCE(1);
return false;
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
* Verify the pagetables are still not ok after having reigstered into
* Verify the pagetables are still not ok after having registered into
* the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
* userfault that has already been resolved, if userfaultfd_read_iter and
* UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pmd_t *pmd, _pmd;
pte_t *pte;
pte_t ptent;
bool ret = true;
bool ret;
assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
goto out;
return true;
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
goto out;
return true;
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
goto out;
return true;
pmd = pmd_offset(pud, address);
again:
_pmd = pmdp_get_lockless(pmd);
if (pmd_none(_pmd))
return true;
/*
* A race could arise which would result in a softleaf entry such as
* migration entry unexpectedly being present in the PMD, so explicitly
* check for this and bail out if so.
*/
if (!pmd_present(_pmd))
return false;
if (pmd_trans_huge(_pmd))
return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
pte = pte_offset_map(pmd, address);
if (!pte)
goto again;
/*
* Lockless access: we're in a wait_event so it's ok if it
* changes under us.
*/
ptent = ptep_get(pte);
ret = true;
/* Entry is still missing, wait for userspace to resolve the fault. */
if (pte_none(ptent))
goto out;
/* UFFD PTE markers require userspace to resolve the fault. */
if (pte_is_uffd_marker(ptent))
goto out;
/*
* If VMA has UFFD WP faults enabled and WP fault, wait for userspace to
* resolve the fault.
*/
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
goto out;
ret = false;
if (!pmd_present(_pmd))
goto out;
if (pmd_trans_huge(_pmd)) {
if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
ret = true;
goto out;
}
pte = pte_offset_map(pmd, address);
if (!pte) {
ret = true;
goto again;
}
/*
* Lockless access: we're in a wait_event so it's ok if it
* changes under us. PTE markers should be handled the same as none
* ptes here.
*/
ptent = ptep_get(pte);
if (pte_none_mostly(ptent))
ret = true;
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
ret = true;
pte_unmap(pte);
out:
pte_unmap(pte);
return ret;
}
@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
set_current_state(blocking_state);
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
must_wait = userfaultfd_must_wait(ctx, vmf, reason);
else
if (is_vm_hugetlb_page(vma)) {
must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
} else {
must_wait = userfaultfd_must_wait(ctx, vmf, reason);
}
release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
@ -1270,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
vm_flags |= VM_UFFD_MISSING;
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
goto out;
#endif
if (!pgtable_supports_uffd_wp())
goto out;
vm_flags |= VM_UFFD_WP;
}
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
@ -1980,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif
if (!pgtable_supports_uffd_wp())
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (!uffd_supports_wp_marker()) {
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
}
ret = -EINVAL;
if (features & ~uffdio_api.features)

View File

@ -97,14 +97,6 @@ static inline int huge_pte_none(pte_t pte)
}
#endif
/* Please refer to comments above pte_none_mostly() for the usage */
#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
static inline int huge_pte_none_mostly(pte_t pte)
{
return huge_pte_none(pte) || is_pte_marker(pte);
}
#endif
#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)

View File

@ -18,8 +18,7 @@
*/
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
~__GFP_HIGHMEM, 0);
struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL, 0);
if (!ptdesc)
return NULL;
@ -28,6 +27,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
return NULL;
}
ptdesc_set_kernel(ptdesc);
return ptdesc_address(ptdesc);
}
#define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
@ -146,6 +147,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad
pagetable_free(ptdesc);
return NULL;
}
if (mm == &init_mm)
ptdesc_set_kernel(ptdesc);
return ptdesc_address(ptdesc);
}
#define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
@ -172,13 +177,16 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
gfp &= ~__GFP_HIGHMEM;
ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
pagetable_pud_ctor(ptdesc);
if (mm == &init_mm)
ptdesc_set_kernel(ptdesc);
return ptdesc_address(ptdesc);
}
#define __pud_alloc_one(...) alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
@ -226,13 +234,16 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
gfp &= ~__GFP_HIGHMEM;
ptdesc = pagetable_alloc_noprof(gfp, 0);
if (!ptdesc)
return NULL;
pagetable_p4d_ctor(ptdesc);
if (mm == &init_mm)
ptdesc_set_kernel(ptdesc);
return ptdesc_address(ptdesc);
}
#define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))
@ -270,13 +281,16 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order
if (mm == &init_mm)
gfp = GFP_PGTABLE_KERNEL;
gfp &= ~__GFP_HIGHMEM;
ptdesc = pagetable_alloc_noprof(gfp, order);
if (!ptdesc)
return NULL;
pagetable_pgd_ctor(ptdesc);
if (mm == &init_mm)
ptdesc_set_kernel(ptdesc);
return ptdesc_address(ptdesc);
}
#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))

View File

@ -1,6 +1,23 @@
#ifndef _ASM_GENERIC_PGTABLE_UFFD_H
#define _ASM_GENERIC_PGTABLE_UFFD_H
/*
* Some platforms can customize the uffd-wp bit, making it unavailable
* even if the architecture provides the resource.
* Adding this API allows architectures to add their own checks for the
* devices on which the kernel is running.
* Note: when overriding it, please make sure CONFIG_HAVE_ARCH_USERFAULTFD_WP
* remains part of this macro.
*/
#ifndef pgtable_supports_uffd_wp
#define pgtable_supports_uffd_wp() IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP)
#endif
static inline bool uffd_supports_wp_marker(void)
{
return pgtable_supports_uffd_wp() && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP);
}
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static __always_inline int pte_uffd_wp(pte_t pte)
{

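Per the comment above, an architecture that can only decide at runtime whether the uffd-wp bit is usable would override the macro while keeping the Kconfig gate in place; the foo_cpu_has_uffd_wp() helper is purely illustrative:

#define pgtable_supports_uffd_wp()                              \
        (IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) &&         \
         foo_cpu_has_uffd_wp())

Callers then degrade gracefully: in the userfaultfd hunks above, userfaultfd_register() refuses UFFDIO_REGISTER_MODE_WP and userfaultfd_api() masks out the WP feature bits whenever the check fails.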
View File

@ -1065,6 +1065,7 @@
*(.no_trim_symbol) \
/* ld.bfd warns about .gnu.version* even when not emitted */ \
*(.gnu.version*) \
*(__tracepoint_check) \
#define DISCARDS \
/DISCARD/ : { \

View File

@ -64,8 +64,8 @@ extern void config_item_put(struct config_item *);
struct config_item_type {
struct module *ct_owner;
struct configfs_item_operations *ct_item_ops;
struct configfs_group_operations *ct_group_ops;
const struct configfs_item_operations *ct_item_ops;
const struct configfs_group_operations *ct_group_ops;
struct configfs_attribute **ct_attrs;
struct configfs_bin_attribute **ct_bin_attrs;
};
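
With ct_item_ops/ct_group_ops now const-qualified, subsystems can keep their operation tables in rodata. A minimal sketch with illustrative foo_* names:

static const struct configfs_group_operations foo_group_ops = {
        .make_item = foo_make_item,             /* illustrative callback */
};

static const struct config_item_type foo_type = {
        .ct_owner    = THIS_MODULE,
        .ct_group_ops = &foo_group_ops,         /* accepted now that the field is const */
};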

View File

@ -91,17 +91,23 @@ struct damon_region {
* @nr_regions: Number of monitoring target regions of this target.
* @regions_list: Head of the monitoring target regions of this target.
* @list: List head for siblings.
* @obsolete: Whether the commit destination target is obsolete.
*
* Each monitoring context could have multiple targets. For example, a context
* for virtual memory address spaces could have multiple target processes. The
* @pid should be set for appropriate &struct damon_operations including the
* virtual address spaces monitoring operations.
*
* @obsolete is used only for damon_commit_targets() source targets, to specify
* that the matching destination targets are obsolete. Read damon_commit_targets()
* to see how it is handled.
*/
struct damon_target {
struct pid *pid;
unsigned int nr_regions;
struct list_head regions_list;
struct list_head list;
bool obsolete;
};
/**
@ -147,6 +153,8 @@ enum damos_action {
* @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us.
* @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node.
* @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node.
* @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup.
* @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup.
* @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics.
*
* Metrics equal to or larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported.
@ -156,6 +164,8 @@ enum damos_quota_goal_metric {
DAMOS_QUOTA_SOME_MEM_PSI_US,
DAMOS_QUOTA_NODE_MEM_USED_BP,
DAMOS_QUOTA_NODE_MEM_FREE_BP,
DAMOS_QUOTA_NODE_MEMCG_USED_BP,
DAMOS_QUOTA_NODE_MEMCG_FREE_BP,
NR_DAMOS_QUOTA_GOAL_METRICS,
};
@ -166,6 +176,7 @@ enum damos_quota_goal_metric {
* @current_value: Current value of @metric.
* @last_psi_total: Last measured total PSI
* @nid: Node id.
* @memcg_id: Memcg id.
* @list: List head for siblings.
*
* Data structure for getting the current score of the quota tuning goal. The
@ -176,6 +187,12 @@ enum damos_quota_goal_metric {
* If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually
* entered by the user, probably inside the kdamond callbacks. Otherwise,
* DAMON sets @current_value with self-measured value of @metric.
*
* If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node
* id of the target node to account the used/free memory for.
*
* If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id
* represent the node id and the cgroup to account the used memory for.
*/
struct damos_quota_goal {
enum damos_quota_goal_metric metric;
@ -184,7 +201,10 @@ struct damos_quota_goal {
/* metric-dependent fields */
union {
u64 last_psi_total;
int nid;
struct {
int nid;
unsigned short memcg_id;
};
};
struct list_head list;
};
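To illustrate the new per-memcg node metric (the helper below is hypothetical and not part of this diff; only .metric, .nid and .memcg_id come from the structure above), a goal could be wired up roughly like this:

/* Illustrative only: point a quota goal at node @nid's memory that is
 * charged to @memcg.  mem_cgroup_id() is the existing memcg id lookup;
 * the target/current values are tuned elsewhere by DAMON. */
static void demo_set_memcg_node_goal(struct damos_quota_goal *goal,
				     struct mem_cgroup *memcg, int nid)
{
	goal->metric	= DAMOS_QUOTA_NODE_MEMCG_USED_BP;
	goal->nid	= nid;
	goal->memcg_id	= mem_cgroup_id(memcg);
}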
@ -472,7 +492,7 @@ struct damos_migrate_dests {
* @wmarks: Watermarks for automated (in)activation of this scheme.
* @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}".
* @target_nid: Destination node if @action is "migrate_{hot,cold}".
* @filters: Additional set of &struct damos_filter for &action.
* @core_filters: Additional set of &struct damos_filter for &action.
* @ops_filters: ops layer handling &struct damos_filter objects list.
* @last_applied: Last @action applied ops-managing entity.
* @stat: Statistics of this scheme.
@ -498,7 +518,7 @@ struct damos_migrate_dests {
*
* Before applying the &action to a memory region, &struct damon_operations
* implementation could check pages of the region and skip &action to respect
* &filters
* &core_filters
*
* The minimum entity that @action can be applied to depends on the underlying
* &struct damon_operations. Since it may not be aligned with the core layer
@ -542,7 +562,7 @@ struct damos {
struct damos_migrate_dests migrate_dests;
};
};
struct list_head filters;
struct list_head core_filters;
struct list_head ops_filters;
void *last_applied;
struct damos_stat stat;
@ -851,11 +871,11 @@ static inline unsigned long damon_sz_region(struct damon_region *r)
#define damos_for_each_quota_goal_safe(goal, next, quota) \
list_for_each_entry_safe(goal, next, &(quota)->goals, list)
#define damos_for_each_filter(f, scheme) \
list_for_each_entry(f, &(scheme)->filters, list)
#define damos_for_each_core_filter(f, scheme) \
list_for_each_entry(f, &(scheme)->core_filters, list)
#define damos_for_each_filter_safe(f, next, scheme) \
list_for_each_entry_safe(f, next, &(scheme)->filters, list)
#define damos_for_each_core_filter_safe(f, next, scheme) \
list_for_each_entry_safe(f, next, &(scheme)->core_filters, list)
#define damos_for_each_ops_filter(f, scheme) \
list_for_each_entry(f, &(scheme)->ops_filters, list)
@ -947,7 +967,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control);
int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control);
int damon_set_region_biggest_system_ram_default(struct damon_target *t,
unsigned long *start, unsigned long *end);
unsigned long *start, unsigned long *end,
unsigned long min_sz_region);
#endif /* CONFIG_DAMON */


@ -7,6 +7,7 @@
#include <linux/ftrace.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/slab.h>
struct fprobe;
@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
* @fp: The fprobe which owns this.
*/
struct fprobe_hlist_node {
struct hlist_node hlist;
struct rhlist_head hlist;
unsigned long addr;
struct fprobe *fp;
};


@ -2041,14 +2041,14 @@ static inline bool can_mmap_file(struct file *file)
return true;
}
int __compat_vma_mmap_prepare(const struct file_operations *f_op,
int __compat_vma_mmap(const struct file_operations *f_op,
struct file *file, struct vm_area_struct *vma);
int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma);
int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
{
if (file->f_op->mmap_prepare)
return compat_vma_mmap_prepare(file, vma);
return compat_vma_mmap(file, vma);
return file->f_op->mmap(file, vma);
}
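For context (not part of this diff), a driver opting into the newer hook sets .mmap_prepare instead of .mmap; the vm_area_desc usage below is only a sketch based on the mmap_prepare conversion work, so treat the field names as assumptions:

/* Sketch of a driver using ->mmap_prepare(); the desc field names are assumed. */
static int demo_mmap_prepare(struct vm_area_desc *desc)
{
	desc->vm_flags |= VM_DONTEXPAND;	/* adjust the mapping before the VMA exists */
	return 0;
}

static const struct file_operations demo_fops = {
	.mmap_prepare	= demo_mmap_prepare,
};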


@ -1167,17 +1167,14 @@ static inline void ftrace_init(void) { }
*/
struct ftrace_graph_ent {
unsigned long func; /* Current function */
int depth;
unsigned long depth;
} __packed;
/*
* Structure that defines an entry function trace with retaddr.
* It's already packed but the attribute "packed" is needed
* to remove extra padding at the end.
*/
struct fgraph_retaddr_ent {
unsigned long func; /* Current function */
int depth;
struct ftrace_graph_ent ent;
unsigned long retaddr; /* Return address */
} __packed;


@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order);
#define free_page(addr) free_pages((addr), 0)
void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);


@ -364,20 +364,35 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add
unsigned long len, unsigned long pgoff, unsigned long flags,
vm_flags_t vm_flags);
enum split_type {
SPLIT_TYPE_UNIFORM,
SPLIT_TYPE_NON_UNIFORM,
};
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
int folio_split_unmapped(struct folio *folio, unsigned int new_order);
int min_order_for_split(struct folio *folio);
int split_folio_to_list(struct folio *folio, struct list_head *list);
bool uniform_split_supported(struct folio *folio, unsigned int new_order,
bool warns);
bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
bool warns);
bool folio_split_supported(struct folio *folio, unsigned int new_order,
enum split_type split_type, bool warns);
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
/*
* try_folio_split_to_order - try to split a @folio at @page to @new_order using
* non uniform split.
static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order)
{
return __split_huge_page_to_list_to_order(page, list, new_order);
}
static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
{
return split_huge_page_to_list_to_order(page, NULL, new_order);
}
/**
* try_folio_split_to_order() - try to split a @folio at @page to @new_order
* using non uniform split.
* @folio: folio to be split
* @page: split to @new_order at the given page
* @new_order: the target split order
@ -387,14 +402,13 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
* folios are put back to LRU list. Use min_order_for_split() to get the lower
* bound of @new_order.
*
* Return: 0: split is successful, otherwise split failed.
* Return: 0 - split is successful, otherwise split failed.
*/
static inline int try_folio_split_to_order(struct folio *folio,
struct page *page, unsigned int new_order)
{
if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
return split_huge_page_to_list_to_order(&folio->page, NULL,
new_order);
if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false))
return split_huge_page_to_order(&folio->page, new_order);
return folio_split(folio, new_order, page, NULL);
}
static inline int split_huge_page(struct page *page)
@ -402,14 +416,43 @@ static inline int split_huge_page(struct page *page)
return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
#ifdef CONFIG_MEMCG
void reparent_deferred_split_queue(struct mem_cgroup *memcg);
#endif
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze);
/**
* pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry?
* @pmd: The PMD to check.
*
* A huge PMD entry is a non-empty entry which is present and marked huge or a
* software leaf entry. This check can be performed without the appropriate locks
* held, in which case the condition should be rechecked after they are
* acquired.
*
* Returns: true if this PMD is huge, false otherwise.
*/
static inline bool pmd_is_huge(pmd_t pmd)
{
if (pmd_present(pmd)) {
return pmd_trans_huge(pmd);
} else if (!pmd_none(pmd)) {
/*
* Non-present PMDs must be valid huge non-present entries. We
* cannot assert that here due to header dependency issues.
*/
return true;
}
return false;
}
#define split_huge_pmd(__vma, __pmd, __address) \
do { \
pmd_t *____pmd = (__pmd); \
if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \
if (pmd_is_huge(*____pmd)) \
__split_huge_pmd(__vma, __pmd, __address, \
false); \
} while (0)
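The lockless check followed by a recheck under the lock, as described in the pmd_is_huge() comment, looks roughly like this in a caller (illustrative function, not from this diff):

static void demo_handle_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd)
{
	spinlock_t *ptl;

	if (!pmd_is_huge(*pmd))			/* cheap, lockless pre-check */
		return;

	ptl = pmd_trans_huge_lock(pmd, vma);	/* rechecks under the PMD lock */
	if (!ptl)
		return;				/* raced: entry is no longer huge */
	/* ... operate on the huge or non-present (e.g. migration) entry ... */
	spin_unlock(ptl);
}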
@ -447,19 +490,14 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);
static inline int is_swap_pmd(pmd_t pmd)
{
return !pmd_none(pmd) && !pmd_present(pmd);
}
/* mmap_lock must be held on entry */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd))
if (pmd_is_huge(*pmd))
return __pmd_trans_huge_lock(pmd, vma);
else
return NULL;
return NULL;
}
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
struct vm_area_struct *vma)
@ -473,6 +511,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
/**
* folio_test_pmd_mappable - Can we map this folio with a PMD?
* @folio: The folio to test
*
* Return: true - @folio can be mapped, false - @folio cannot be mapped.
*/
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
@ -481,6 +521,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;
@ -524,6 +566,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze);
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio);
void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
struct vm_area_struct *vma, unsigned long haddr);
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
@ -576,6 +620,11 @@ split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
VM_WARN_ON_ONCE_PAGE(1, page);
return -EINVAL;
}
static inline int split_huge_page_to_order(struct page *page, unsigned int new_order)
{
VM_WARN_ON_ONCE_PAGE(1, page);
return -EINVAL;
}
static inline int split_huge_page(struct page *page)
{
VM_WARN_ON_ONCE_PAGE(1, page);
@ -602,6 +651,7 @@ static inline int try_folio_split_to_order(struct folio *folio,
}
static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {}
#define split_huge_pmd(__vma, __pmd, __address) \
do { } while (0)
@ -642,10 +692,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{
}
static inline int is_swap_pmd(pmd_t pmd)
{
return 0;
}
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
struct vm_area_struct *vma)
{
@ -662,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
{
return 0;
}
static inline bool is_huge_zero_folio(const struct folio *folio)
{
return false;
@ -682,12 +733,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
return;
}
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
return NULL;
}
static inline bool thp_migration_supported(void)
{
return false;
@ -720,6 +765,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void)
{
return NULL;
}
static inline bool pmd_is_huge(pmd_t pmd)
{
return false;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline int split_folio_to_list_to_order(struct folio *folio,


@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
long hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
struct vm_area_desc *desc, vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@ -172,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
extern int sysctl_hugetlb_shm_group;
extern int sysctl_hugetlb_shm_group __read_mostly;
extern struct list_head huge_boot_pages[MAX_NUMNODES];
void hugetlb_bootmem_alloc(void);
@ -275,11 +274,10 @@ void hugetlb_vma_lock_release(struct kref *kref);
long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot,
unsigned long cp_flags);
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void fixup_hugetlb_reservations(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
#else /* !CONFIG_HUGETLB_PAGE */
@ -466,6 +464,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
return 0;
}
#endif /* !CONFIG_HUGETLB_PAGE */
#ifndef pgd_write


@ -2,22 +2,27 @@
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H
#ifdef CONFIG_HUGETLB_PAGE
#include <linux/mm.h>
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
#ifdef CONFIG_HUGETLB_PAGE
static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
{
return !!(vma->vm_flags & VM_HUGETLB);
return !!(vm_flags & VM_HUGETLB);
}
#else
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
{
return false;
}
#endif
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
return is_vm_hugetlb_flags(vma->vm_flags);
}
#endif
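A minimal usage sketch of the flags-only variant (the caller below is hypothetical): it allows the hugetlb check before any vm_area_struct exists, which is_vm_hugetlb_page() cannot do:

/* Hypothetical caller that only has vm_flags, e.g. before a VMA is built. */
static bool demo_needs_hugetlb_setup(vm_flags_t vm_flags)
{
	return is_vm_hugetlb_flags(vm_flags);
}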


@ -1135,7 +1135,9 @@ struct iommu_sva {
struct iommu_mm_data {
u32 pasid;
struct mm_struct *mm;
struct list_head sva_domains;
struct list_head mm_list_elm;
};
int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@ -1616,6 +1618,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
struct mm_struct *mm);
void iommu_sva_unbind_device(struct iommu_sva *handle);
u32 iommu_sva_get_pasid(struct iommu_sva *handle);
void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
#else
static inline struct iommu_sva *
iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@ -1640,6 +1643,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
}
static inline void mm_pasid_drop(struct mm_struct *mm) {}
static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
#endif /* CONFIG_IOMMU_SVA */
#ifdef CONFIG_IOMMU_IOPF


@ -611,4 +611,16 @@ extern unsigned long nsecs_to_jiffies(u64 n);
#define TIMESTAMP_SIZE 30
struct ctl_table;
int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos);
int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos);
int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos);
int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
size_t *lenp, loff_t *ppos);
int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
void *buffer, size_t *lenp, loff_t *ppos);
#endif


@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { }
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
void kasan_release_vmalloc(unsigned long start, unsigned long end,
int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask);
static inline int kasan_populate_vmalloc(unsigned long addr,
unsigned long size, gfp_t gfp_mask)
{
if (kasan_enabled())
return __kasan_populate_vmalloc(addr, size, gfp_mask);
return 0;
}
void __kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end,
unsigned long flags);
static inline void kasan_release_vmalloc(unsigned long start, unsigned long end,
unsigned long free_region_start,
unsigned long free_region_end,
unsigned long flags)
{
if (kasan_enabled())
return __kasan_release_vmalloc(start, end, free_region_start,
free_region_end, flags);
}
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
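A caller-side sketch (hypothetical function, not from this diff) of why these became static inline wrappers: the kasan_enabled() test is resolved in the caller, so no out-of-line call is made when KASAN is compiled in but disabled:

static int demo_populate_shadow(unsigned long addr, unsigned long size)
{
	/* Returns 0 without calling __kasan_populate_vmalloc() when KASAN
	 * is disabled, keeping the fast path a single branch. */
	return kasan_populate_vmalloc(addr, size, GFP_KERNEL);
}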


@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr);
* @prot: page protection flags used for vmap.
* @pages: array of pages.
* @page_shift: page_shift passed to vmap_range_noflush().
* @gfp_mask: gfp_mask to use internally.
*
* KMSAN maps shadow and origin pages of @pages into contiguous ranges in
* vmalloc metadata address range. Returns 0 on success, callers must check
@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start,
unsigned long end,
pgprot_t prot,
struct page **pages,
unsigned int page_shift);
unsigned int page_shift,
gfp_t gfp_mask);
/**
* kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap.
@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr)
static inline int __must_check kmsan_vmap_pages_range_noflush(
unsigned long start, unsigned long end, pgprot_t prot,
struct page **pages, unsigned int page_shift)
struct page **pages, unsigned int page_shift, gfp_t gfp_mask)
{
return 0;
}


@ -17,7 +17,7 @@
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, vm_flags_t *vm_flags);
vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file,
vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file,
vm_flags_t vm_flags);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm);
#else /* !CONFIG_KSM */
static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm,
static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm,
const struct file *file, vm_flags_t vm_flags)
{
return vm_flags;

Some files were not shown because too many files have changed in this diff.