mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-11 17:10:13 +00:00

Compare commits


45 Commits

Author SHA1 Message Date
Linus Torvalds
24172e0d79 arm64 fixes for -rc6
- Avoid sleeping in atomic context when changing linear map permissions
   for DEBUG_PAGEALLOC or KFENCE.
 
 - Rework printing of Spectre mitigation status to avoid hardlockup when
   enabling per-task mitigations on the context-switch path.
 
 - Reject kernel modules when instruction patching fails either due to
   the DWARF-based SCS patching or because of an alternatives callback
   residing outside of the core kernel text.
 
 - Propagate error when updating kernel memory permissions in kprobes.
 
 - Drop pointless, incorrect message when enabling the ACPI SPCR console.
 
 - Use value-returning LSE instructions for per-cpu atomics to reduce
   latency in SRCU locking routines.

Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux

Pull arm64 fixes from Will Deacon:
 "There's more here than I would ideally like at this stage, but there's
  been a steady trickle of fixes and some of them took a few rounds of
  review.

  The bulk of the changes are fixing some fallout from the recent BBM
  level two support which allows the linear map to be split from block
  to page mappings at runtime, but inadvertently led to sleeping in
  atomic context on some paths where the linear map was already mapped
  with page granularity. The fix is simply to avoid splitting in those
  cases but the implementation of that is a little involved.

  The other interesting fix is addressing a catastrophic performance
  issue with our per-cpu atomics discovered by Paul in the SRCU locking
  code but which took some interactions with the hardware folks to
  resolve.

  Summary:

   - Avoid sleeping in atomic context when changing linear map
     permissions for DEBUG_PAGEALLOC or KFENCE

   - Rework printing of Spectre mitigation status to avoid hardlockup
     when enabling per-task mitigations on the context-switch path

   - Reject kernel modules when instruction patching fails either due to
     the DWARF-based SCS patching or because of an alternatives callback
     residing outside of the core kernel text

   - Propagate error when updating kernel memory permissions in kprobes

   - Drop pointless, incorrect message when enabling the ACPI SPCR
     console

   - Use value-returning LSE instructions for per-cpu atomics to reduce
     latency in SRCU locking routines"

* tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux:
  arm64: Reject modules with internal alternative callbacks
  arm64: Fail module loading if dynamic SCS patching fails
  arm64: proton-pack: Fix hard lockup due to print in scheduler context
  arm64: proton-pack: Drop print when !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY
  arm64: mm: Tidy up force_pte_mapping()
  arm64: mm: Optimize range_split_to_ptes()
  arm64: mm: Don't sleep in split_kernel_leaf_mapping() when in atomic context
  arm64: kprobes: check the return value of set_memory_rox()
  arm64: acpi: Drop message logging SPCR default console
  Revert "ACPI: Suppress misleading SPCR console message when SPCR table is absent"
  arm64: Use load LSE atomics for the non-return per-CPU atomic operations
2025-11-11 10:31:17 -08:00
Linus Torvalds
8341374f67 for-6.18-rc5-tag

Merge tag 'for-6.18-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix new inode name tracking in tree-log

 - fix conventional zone and stripe calculations in zoned mode

 - fix bio reference counts on error paths in relocation and scrub

* tag 'for-6.18-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: release root after error in data_reloc_print_warning_inode()
  btrfs: scrub: put bio after errors in scrub_raid56_parity_stripe()
  btrfs: do not update last_log_commit when logging inode due to a new name
  btrfs: zoned: fix stripe width calculation
  btrfs: zoned: fix conventional zone capacity calculation
2025-11-11 10:13:17 -08:00
Linus Torvalds
537d196186 26 hotfixes. 22(!) are cc:stable, 22 are MM.
- a three patch series from Pasha Tatashin which addresses some Kexec
   Handover issues
 
 - a two patch series from Kiryl Shutsemau which fixes handling of large
   folios which are mapped outside i_size
 
 - a two patch series from Quanmin Yan which fixes some DAMON time issues
   on 32-bit machines
 
 Plus the usual shower of singletons.

Merge tag 'mm-hotfixes-stable-2025-11-10-19-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "26 hotfixes.  22(!) are cc:stable, 22 are MM.

   - address some Kexec Handover issues (Pasha Tatashin)

   - fix handling of large folios which are mapped outside i_size (Kiryl
     Shutsemau)

   - fix some DAMON time issues on 32-bit machines (Quanmin Yan)

  Plus the usual shower of singletons"

* tag 'mm-hotfixes-stable-2025-11-10-19-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits)
  kho: warn and exit when unpreserved page wasn't preserved
  kho: fix unpreservation of higher-order vmalloc preservations
  kho: fix out-of-bounds access of vmalloc chunk
  MAINTAINERS: add Chris and Kairui as the swap maintainer
  mm/secretmem: fix use-after-free race in fault handler
  mm/huge_memory: initialise the tags of the huge zero folio
  nilfs2: avoid having an active sc_timer before freeing sci
  scripts/decode_stacktrace.sh: fix build ID and PC source parsing
  mm/damon/sysfs: change next_update_jiffies to a global variable
  mm/damon/stat: change last_refresh_jiffies to a global variable
  maple_tree: fix tracepoint string pointers
  codetag: debug: handle existing CODETAG_EMPTY in mark_objexts_empty for slabobj_ext
  mm/mremap: honour writable bit in mremap pte batching
  gcov: add support for GCC 15
  mm/mm_init: fix hash table order logging in alloc_large_system_hash()
  mm/truncate: unmap large folio on split failure
  mm/memory: do not populate page table entries beyond i_size
  fs/proc: fix uaf in proc_readdir_de()
  mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 order
  ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
  ...
2025-11-11 09:49:56 -08:00
Pratyush Yadav
b05addf6f0 kho: warn and exit when unpreserved page wasn't preserved
Calling __kho_unpreserve() on a pair of (pfn, end_pfn) that wasn't
preserved is a bug.  Currently, if that is done, the physxa or bits can be
NULL.  This results in a soft lockup, since a NULL physxa or bits causes
the loop to be redone without ever making any progress.

Return when physxa or bits are not found, but WARN first to loudly
indicate invalid behaviour.
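
The shape of the fix, as a sketch (the xarray lookups mirror the KHO
memory tracking code; names and surrounding context are illustrative,
not the literal patch):

  physxa = xa_load(&track->orders, order);
  if (WARN_ON_ONCE(!physxa))
          return;         /* was never preserved: bail out loudly */

  bits = xa_load(&physxa->phys_bits, quad);  /* 'quad' = bitmap index */
  if (WARN_ON_ONCE(!bits))
          return;         /* ditto, instead of redoing the loop forever */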

Link: https://lkml.kernel.org/r/20251103180235.71409-3-pratyush@kernel.org
Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation")
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:47 -08:00
Pratyush Yadav
7ecd2e439d kho: fix unpreservation of higher-order vmalloc preservations
kho_vmalloc_unpreserve_chunk() calls __kho_unpreserve() with end_pfn as
pfn + 1.  This happens to work for 0-order pages, but leaks higher order
pages.

For example, say order 2 pages back the allocation.  During preservation,
they get preserved in the order 2 bitmaps, but
kho_vmalloc_unpreserve_chunk() would try to unpreserve them from the order
0 bitmaps, which should not have these bits set anyway, leaving the order
2 bitmaps untouched.  This results in the pages being carried over to the
next kernel.  Nothing will free those pages in the next boot, leaking
them.

Fix this by taking the order into account when calculating the end PFN for
__kho_unpreserve().
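
As a sketch, the end-PFN calculation becomes (the order field name is
illustrative of the KHO vmalloc chunk metadata):

  unsigned long pfn = PHYS_PFN(phys);

  /* was: __kho_unpreserve(track, pfn, pfn + 1), which only covers an
   * order-0 page and leaks the rest of a higher-order preservation */
  __kho_unpreserve(track, pfn, pfn + (1 << order));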

Link: https://lkml.kernel.org/r/20251103180235.71409-2-pratyush@kernel.org
Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations")
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:47 -08:00
Pratyush Yadav
0b07092d09 kho: fix out-of-bounds access of vmalloc chunk
The list of pages in a vmalloc chunk is NULL-terminated.  So when looping
through the pages in a vmalloc chunk, both kho_restore_vmalloc() and
kho_vmalloc_unpreserve_chunk() rightly make sure to stop when encountering
a NULL page.  But when the chunk is full, the loops do not stop and go
past the bounds of chunk->phys, resulting in out-of-bounds memory access,
and possibly the restoration or unpreservation of an invalid page.

Fix this by making sure the processing of the chunk stops at the end of
the array.
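
A sketch of the bounded loop (ARRAY_SIZE() is the kernel helper; the
chunk layout is abridged):

  for (i = 0; i < ARRAY_SIZE(chunk->phys); i++) {
          phys_addr_t phys = chunk->phys[i];

          if (!phys)
                  break;  /* NULL terminator inside a partial chunk */
          /* restore or unpreserve the page at 'phys' */
  }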

Link: https://lkml.kernel.org/r/20251103110159.8399-1-pratyush@kernel.org
Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations")
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:47 -08:00
Chris Li
bba717bbc4 MAINTAINERS: add Chris and Kairui as the swap maintainer
We have been collaborating on a systematic effort to clean up and improve
the Linux swap system, and might as well take responsibility for it.

Link: https://lkml.kernel.org/r/20251102-swap-m-v1-1-582f275d5bce@kernel.org
Signed-off-by: Chris Li <chrisl@kernel.org>
Acked-by: Kairui Song <kasong@tencent.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: SeongJae Park <sj@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:46 -08:00
Lance Yang
6f86d0534f mm/secretmem: fix use-after-free race in fault handler
When a page fault occurs in a secret memory file created with
`memfd_secret(2)`, the kernel will allocate a new folio for it, mark the
underlying page as not-present in the direct map, and add it to the file
mapping.

If two tasks cause a fault in the same page concurrently, both could end
up allocating a folio and removing the page from the direct map, but only
one would succeed in adding the folio to the file mapping.  The task that
failed undoes the effects of its attempt by (a) freeing the folio again
and (b) putting the page back into the direct map.  However, by doing
these two operations in this order, the page becomes available to the
allocator again before it is placed back in the direct mapping.

If another task attempts to allocate the page between (a) and (b), and the
kernel tries to access it via the direct map, it would result in a
supervisor not-present page fault.

Fix the ordering to restore the direct map before the folio is freed.
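
A sketch of the corrected error path in the fault handler (simplified;
set_direct_map_default_noflush() and filemap_add_folio() are the real
kernel helpers, the surrounding control flow is abridged):

  err = filemap_add_folio(mapping, folio, offset, gfp);
  if (err) {
          /* (b) first: make the page reachable via the direct map again */
          set_direct_map_default_noflush(page);
          /* (a) only now may the allocator hand the page out again */
          folio_put(folio);
          /* retry / error handling abridged */
  }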

Link: https://lkml.kernel.org/r/20251031120955.92116-1-lance.yang@linux.dev
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Reported-by: Google Big Sleep <big-sleep-vuln-reports@google.com>
Closes: https://lore.kernel.org/linux-mm/CAEXGt5QeDpiHTu3K9tvjUTPqo+d-=wuCNYPa+6sWKrdQJ-ATdg@mail.gmail.com/
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:46 -08:00
Catalin Marinas
adfb6609c6 mm/huge_memory: initialise the tags of the huge zero folio
On arm64 with MTE enabled, a page mapped as Normal Tagged (PROT_MTE) in
user space will need to have its allocation tags initialised.  This is
normally done in the arm64 set_pte_at() after checking the memory
attributes.  Such a page is also marked with the PG_mte_tagged flag to avoid
subsequent clearing.  Since this relies on having a struct page,
pte_special() mappings are ignored.

Commit d82d09e48219 ("mm/huge_memory: mark PMD mappings of the huge zero
folio special") maps the huge zero folio as special, so the arm64
set_pmd_at() will no longer zero the tags.  There is no guarantee that the
tags are zero, especially if parts of this huge page have been previously
tagged.

It's fairly easy to detect this by regularly dropping the caches to
force the reallocation of the huge zero folio.

Allocate the huge zero folio with the __GFP_ZEROTAGS flag.  In addition,
do not warn in the arm64 __access_remote_tags() when reading tags from the
huge zero page.

I bundled the arm64 change in here as well since they are both related to
the commit mapping the huge zero folio as special.
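
A sketch of the allocation change (GFP details follow the mainline huge
zero folio allocation but should be treated as illustrative):

  /* __GFP_ZEROTAGS zeroes the MTE allocation tags along with the data,
   * so the huge zero folio can never carry stale tags */
  folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZEROTAGS) & ~__GFP_MOVABLE,
                      HPAGE_PMD_ORDER);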

[catalin.marinas@arm.com: handle arch mte_zero_clear_page_tags() code issuing MTE instructions]
  Link: https://lkml.kernel.org/r/aQi8dA_QpXM8XqrE@arm.com
Link: https://lkml.kernel.org/r/20251031170133.280742-1-catalin.marinas@arm.com
Fixes: d82d09e48219 ("mm/huge_memory: mark PMD mappings of the huge zero folio special")
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Tested-by: Beleswar Padhi <b-padhi@ti.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Aishwarya TCV <aishwarya.tcv@arm.com>
Cc: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:46 -08:00
Edward Adam Davis
9a6b60cb14 nilfs2: avoid having an active sc_timer before freeing sci
Because kthread_stop() did not stop sc_task properly and returned -EINTR,
the sc_timer was not properly closed, ultimately causing the problem [1]
reported by syzbot when sci is freed with the timer still active.

Because the thread's main function nilfs_segctor_thread() returns 0 on
success, a non-zero return value from kthread_stop() in
nilfs_segctor_destroy() means that the sc_timer was not properly closed.

Use timer_shutdown_sync() to synchronously wait for sc_timer to shut
down, and set sc_task to NULL under the protection of sc_state_lock, so
as to avoid the issues caused by the timer not being properly shut down.
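
A sketch of the teardown ordering (locking shown is illustrative;
timer_shutdown_sync() guarantees the timer cannot be re-armed
afterwards):

  timer_shutdown_sync(&sci->sc_timer);   /* wait out a running handler */

  spin_lock(&sci->sc_state_lock);
  sci->sc_task = NULL;                   /* publish the thread's exit */
  spin_unlock(&sci->sc_state_lock);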

[1]
ODEBUG: free active (active state 0) object: 00000000dacb411a object type: timer_list hint: nilfs_construction_timeout
Call trace:
 nilfs_segctor_destroy fs/nilfs2/segment.c:2811 [inline]
 nilfs_detach_log_writer+0x668/0x8cc fs/nilfs2/segment.c:2877
 nilfs_put_super+0x4c/0x12c fs/nilfs2/super.c:509

Link: https://lkml.kernel.org/r/20251029225226.16044-1-konishi.ryusuke@gmail.com
Fixes: 3f66cc261ccb ("nilfs2: use kthread_create and kthread_stop for the log writer thread")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: syzbot+24d8b70f039151f65590@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=24d8b70f039151f65590
Tested-by: syzbot+24d8b70f039151f65590@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Cc: <stable@vger.kernel.org>	[6.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:46 -08:00
Carlos Llamas
7d9f7d390f scripts/decode_stacktrace.sh: fix build ID and PC source parsing
Support for parsing PC source info in stacktraces (e.g.  '(P)') was added
in commit 2bff77c665ed ("scripts/decode_stacktrace.sh: fix decoding of
lines with an additional info").  However, this logic was placed after the
build ID processing.  This incorrect order fails to parse lines containing
both elements, e.g.:

  drm_gem_mmap_obj+0x114/0x200 [drm 03d0564e0529947d67bb2008c3548be77279fd27] (P)

This patch fixes the problem by extracting the PC source info first and
then processing the module build ID.  With this change, the line above is
now properly parsed as such:

  drm_gem_mmap_obj (./include/linux/mmap_lock.h:212 ./include/linux/mm.h:811 drivers/gpu/drm/drm_gem.c:1177) drm (P)

While here, also add a brief explanation to the build ID section.

Link: https://lkml.kernel.org/r/20251030010347.2731925-1-cmllamas@google.com
Fixes: 2bff77c665ed ("scripts/decode_stacktrace.sh: fix decoding of lines with an additional info")
Signed-off-by: Carlos Llamas <cmllamas@google.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Reviewed-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Cc: Breno Leitao <leitao@debian.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Matthieu Baerts <matttbe@kernel.org>
Cc: Miroslav Benes <mbenes@suse.cz>
Cc: Puranjay Mohan <puranjay@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:45 -08:00
Quanmin Yan
9fd7bb5083 mm/damon/sysfs: change next_update_jiffies to a global variable
In DAMON's damon_sysfs_repeat_call_fn(), time_before() is used to compare
the current jiffies with next_update_jiffies to determine whether to
update the sysfs files at this moment.

On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make
jiffies wrap bugs appear earlier. However, this causes time_before() in
damon_sysfs_repeat_call_fn() to unexpectedly return true during the first
5 minutes after boot on 32-bit systems (see [1], which fixed an earlier
jiffies-related issue, for more explanation). As a result, DAMON does not
update sysfs files during that period.

There is also an issue unrelated to the system's word size[2]: if the
user stops DAMON just after next_update_jiffies is updated and restarts
it after 'refresh_ms' or a longer delay, next_update_jiffies will retain
an older value, causing time_before() to return false and the update to
happen earlier than expected.

Fix these issues by making next_update_jiffies a global variable and
initializing it each time DAMON is started.
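
A sketch of the pattern (variable and file names follow the commit; the
callback shape is abridged):

  static unsigned long next_update_jiffies;

  /* on DAMON start: drop any stale deadline from a previous run */
  next_update_jiffies = jiffies + msecs_to_jiffies(refresh_ms);

  /* in damon_sysfs_repeat_call_fn(): */
  if (time_before(jiffies, next_update_jiffies))
          return 0;       /* not yet time to refresh the sysfs files */
  next_update_jiffies = jiffies + msecs_to_jiffies(refresh_ms);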

Link: https://lkml.kernel.org/r/20251030020746.967174-3-yanquanmin1@huawei.com
Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com [1]
Link: https://lore.kernel.org/all/20251029013038.66625-1-sj@kernel.org/ [2]
Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work")
Suggested-by: SeongJae Park <sj@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:45 -08:00
Quanmin Yan
2f6ce7e714 mm/damon/stat: change last_refresh_jiffies to a global variable
Patch series "mm/damon: fixes for the jiffies-related issues", v2.

On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make
jiffies wrap bugs appear earlier.  However, this may cause the
time_before() series of functions to return unexpected values, resulting
in DAMON not functioning as intended.  Meanwhile, similar issues exist in
some specific user operation scenarios.

This patchset addresses these issues.  The first patch is about the
DAMON_STAT module, and the second patch is about the core layer's sysfs.


This patch (of 2):

In DAMON_STAT's damon_stat_damon_call_fn(), time_before_eq() is used to
avoid unnecessarily frequent stat update.

On 32-bit systems, the kernel initializes jiffies to "-5 minutes" to make
jiffies wrap bugs appear earlier.  However, this causes time_before_eq()
in DAMON_STAT to unexpectedly return true during the first 5 minutes after
boot on 32-bit systems (see [1], which fixed an earlier jiffies-related
issue, for more explanation).  As a result, DAMON_STAT does not update
any monitoring results during that period, which becomes more confusing
when DAMON_STAT_ENABLED_DEFAULT is enabled.

There is also an issue unrelated to the system's word size[2]: if the user
stops DAMON_STAT just after last_refresh_jiffies is updated and restarts
it after 5 seconds or a longer delay, last_refresh_jiffies will retain an
older value, causing time_before_eq() to return false and the update to
happen earlier than expected.

Fix these issues by making last_refresh_jiffies a global variable and
initializing it each time DAMON_STAT is started.

Link: https://lkml.kernel.org/r/20251030020746.967174-2-yanquanmin1@huawei.com
Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com [1]
Link: https://lore.kernel.org/all/20251028143250.50144-1-sj@kernel.org/ [2]
Fixes: fabdd1e911da ("mm/damon/stat: calculate and expose estimated memory bandwidth")
Signed-off-by: Quanmin Yan <yanquanmin1@huawei.com>
Suggested-by: SeongJae Park <sj@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: ze zuo <zuoze1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:45 -08:00
Martin Kaiser
91a5409002 maple_tree: fix tracepoint string pointers
maple_tree tracepoints contain pointers to function names. Such a pointer
is saved when a tracepoint logs an event. There's no guarantee that it's
still valid when the event is parsed later and the pointer is dereferenced.

The kernel warns about these unsafe pointers.

	event 'ma_read' has unsafe pointer field 'fn'
	WARNING: kernel/trace/trace.c:3779 at ignore_event+0x1da/0x1e4

Mark the function names as tracepoint_string() to fix the events.

One case that doesn't work without my patch is using trace-cmd record to
save the binary ring buffer and trace-cmd report to parse it in
userspace.  The address of __func__ can't be dereferenced from userspace,
but tracepoint_string() will add an entry to
/sys/kernel/tracing/printk_formats.
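
A sketch of the change for one event (tracepoint_string() stores the
string's address in a dedicated section so it is exported via
printk_formats; whether the patch passes __func__ or a literal is
illustrative here):

  /* was: trace_ma_read(__func__, mas); */
  trace_ma_read(tracepoint_string(__func__), mas);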

Link: https://lkml.kernel.org/r/20251030155537.87972-1-martin@kaiser.cx
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:45 -08:00
Hao Ge
1abbdf3d57 codetag: debug: handle existing CODETAG_EMPTY in mark_objexts_empty for slabobj_ext
When alloc_slab_obj_exts() fails and then later succeeds in allocating a
slab extension vector, it calls handle_failed_objexts_alloc() to mark all
objects in the vector as empty.  As a result all objects in this slab
(slabA) will have their extensions set to CODETAG_EMPTY.

Later on if this slabA is used to allocate a slabobj_ext vector for
another slab (slabB), we end up with the slabB->obj_exts pointing to a
slabobj_ext vector that itself has a non-NULL slabobj_ext equal to
CODETAG_EMPTY.  When slabB gets freed, free_slab_obj_exts() is called to
free slabB->obj_exts vector.  

free_slab_obj_exts() calls mark_objexts_empty(slabB->obj_exts) which will
generate a warning because it expects slabobj_ext vectors to have a NULL
obj_ext, not CODETAG_EMPTY.

Modify mark_objexts_empty() to skip the warning, and to skip setting the
obj_ext value, when it is already set to CODETAG_EMPTY.
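
A sketch of the early exit inside mark_objexts_empty() (slot lookup
abridged; is_codetag_empty()/set_codetag_empty() are the existing
codetag helpers):

  if (is_codetag_empty(&slab_exts[offs].ref))
          return;        /* already CODETAG_EMPTY: nothing to warn about */

  WARN_ON(slab_exts[offs].ref.ct);        /* codetag should be NULL */
  set_codetag_empty(&slab_exts[offs].ref);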


To quickly detect this WARN, I modified the code from
WARN_ON(slab_exts[offs].ref.ct) to BUG_ON(slab_exts[offs].ref.ct == 1);

We then obtained this message:

[21630.898561] ------------[ cut here ]------------
[21630.898596] kernel BUG at mm/slub.c:2050!
[21630.898611] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[21630.900372] Modules linked in: squashfs isofs vfio_iommu_type1 vhost_vsock vfio vhost_net vmw_vsock_virtio_transport_common vhost tap vhost_iotlb iommufd vsock binfmt_misc nfsv3 nfs_acl nfs lockd grace netfs tls rds dns_resolver tun brd overlay ntfs3 exfat btrfs blake2b_generic xor xor_neon raid6_pq loop sctp ip6_udp_tunnel udp_tunnel nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables rfkill ip_set sunrpc vfat fat joydev sg sch_fq_codel nfnetlink virtio_gpu sr_mod cdrom drm_client_lib virtio_dma_buf drm_shmem_helper drm_kms_helper drm ghash_ce backlight virtio_net virtio_blk virtio_scsi net_failover virtio_console failover virtio_mmio dm_mirror dm_region_hash dm_log dm_multipath dm_mod fuse i2c_dev virtio_pci virtio_pci_legacy_dev virtio_pci_modern_dev virtio virtio_ring autofs4 aes_neon_bs aes_ce_blk [last unloaded: hwpoison_inject]
[21630.909177] CPU: 3 UID: 0 PID: 3787 Comm: kylin-process-m Kdump: loaded Tainted: G        W           6.18.0-rc1+ #74 PREEMPT(voluntary)
[21630.910495] Tainted: [W]=WARN
[21630.910867] Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022
[21630.911625] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[21630.912392] pc : __free_slab+0x228/0x250
[21630.912868] lr : __free_slab+0x18c/0x250
[21630.913334] sp : ffff8000a02f73e0
[21630.913830] x29: ffff8000a02f73e0 x28: fffffdffc43fc800 x27: ffff0000c0011c40
[21630.914677] x26: ffff0000c000cac0 x25: ffff00010fe5e5f0 x24: ffff000102199b40
[21630.915469] x23: 0000000000000003 x22: 0000000000000003 x21: ffff0000c0011c40
[21630.916259] x20: fffffdffc4086600 x19: fffffdffc43fc800 x18: 0000000000000000
[21630.917048] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[21630.917837] x14: 0000000000000000 x13: 0000000000000000 x12: ffff70001405ee66
[21630.918640] x11: 1ffff0001405ee65 x10: ffff70001405ee65 x9 : ffff800080a295dc
[21630.919442] x8 : ffff8000a02f7330 x7 : 0000000000000000 x6 : 0000000000003000
[21630.920232] x5 : 0000000024924925 x4 : 0000000000000001 x3 : 0000000000000007
[21630.921021] x2 : 0000000000001b40 x1 : 000000000000001f x0 : 0000000000000001
[21630.921810] Call trace:
[21630.922130]  __free_slab+0x228/0x250 (P)
[21630.922669]  free_slab+0x38/0x118
[21630.923079]  free_to_partial_list+0x1d4/0x340
[21630.923591]  __slab_free+0x24c/0x348
[21630.924024]  ___cache_free+0xf0/0x110
[21630.924468]  qlist_free_all+0x78/0x130
[21630.924922]  kasan_quarantine_reduce+0x114/0x148
[21630.925525]  __kasan_slab_alloc+0x7c/0xb0
[21630.926006]  kmem_cache_alloc_noprof+0x164/0x5c8
[21630.926699]  __alloc_object+0x44/0x1f8
[21630.927153]  __create_object+0x34/0xc8
[21630.927604]  kmemleak_alloc+0xb8/0xd8
[21630.928052]  kmem_cache_alloc_noprof+0x368/0x5c8
[21630.928606]  getname_flags.part.0+0xa4/0x610
[21630.929112]  getname_flags+0x80/0xd8
[21630.929557]  vfs_fstatat+0xc8/0xe0
[21630.929975]  __do_sys_newfstatat+0xa0/0x100
[21630.930469]  __arm64_sys_newfstatat+0x90/0xd8
[21630.931046]  invoke_syscall+0xd4/0x258
[21630.931685]  el0_svc_common.constprop.0+0xb4/0x240
[21630.932467]  do_el0_svc+0x48/0x68
[21630.932972]  el0_svc+0x40/0xe0
[21630.933472]  el0t_64_sync_handler+0xa0/0xe8
[21630.934151]  el0t_64_sync+0x1ac/0x1b0
[21630.934923] Code: aa1803e0 97ffef2b a9446bf9 17ffff9c (d4210000)
[21630.936461] SMP: stopping secondary CPUs
[21630.939550] Starting crashdump kernel...
[21630.940108] Bye!

Link: https://lkml.kernel.org/r/20251029014317.1533488-1-hao.ge@linux.dev
Fixes: 09c46563ff6d ("codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext allocations")
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Christoph Lameter (Ampere) <cl@gentwo.org>
Cc: David Rientjes <rientjes@google.com>
Cc: gehao <gehao@kylinos.cn>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:44 -08:00
Dev Jain
04d1c9d60c mm/mremap: honour writable bit in mremap pte batching
Currently, mremap folio PTE batching ignores the writable bit when
figuring out a set of similar PTEs mapping the same folio.  Suppose the
first PTE of the batch is writable while the others are not - set_ptes()
will end up setting the writable bit on the other PTEs, which is a
violation of mremap semantics.  Therefore, use FPB_RESPECT_WRITE to check
the writable bit while determining the PTE batch.
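
A sketch of the call site (the batching helper and its argument list
vary across kernel versions and are shown illustratively;
FPB_RESPECT_WRITE is the flag named above):

  /* PTEs that differ in the writable bit now end the batch instead of
   * being merged and later re-written with a single permission */
  nr = folio_pte_batch_flags(folio, vma, ptep, &pte, max_nr,
                             FPB_RESPECT_WRITE);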

Link: https://lkml.kernel.org/r/20251028063952.90313-1-dev.jain@arm.com
Signed-off-by: Dev Jain <dev.jain@arm.com>
Fixes: f822a9a81a31 ("mm: optimize mremap() by PTE batching")
Reported-by: David Hildenbrand <david@redhat.com>
Debugged-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>	[6.17+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:44 -08:00
Peter Oberparleiter
ec4d11fc4b gcov: add support for GCC 15
Using gcov on kernels compiled with GCC 15 results in truncated 16-byte
long .gcda files with no usable data.  To fix this, update GCOV_COUNTERS
to match the value defined by GCC 15.

Tested with GCC 14.3.0 and GCC 15.2.0.

Link: https://lkml.kernel.org/r/20251028115125.1319410-1-oberpar@linux.ibm.com
Signed-off-by: Peter Oberparleiter <oberpar@linux.ibm.com>
Reported-by: Matthieu Baerts <matttbe@kernel.org>
Closes: https://github.com/linux-test-project/lcov/issues/445
Tested-by: Matthieu Baerts <matttbe@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:44 -08:00
Isaac J. Manjarres
0d6c356dd6 mm/mm_init: fix hash table order logging in alloc_large_system_hash()
When emitting the order of the allocation for a hash table,
alloc_large_system_hash() unconditionally subtracts PAGE_SHIFT from log
base 2 of the allocation size.  This is not correct if the allocation size
is smaller than a page, and yields a negative value for the order as seen
below:

  TCP established hash table entries: 32 (order: -4, 256 bytes, linear)
  TCP bind hash table entries: 32 (order: -2, 1024 bytes, linear)

Use get_order() to compute the order when emitting the hash table
information to correctly handle cases where the allocation size is smaller
than a page:

  TCP established hash table entries: 32 (order: 0, 256 bytes, linear)
  TCP bind hash table entries: 32 (order: 0, 1024 bytes, linear)
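
The arithmetic for the 256-byte case, assuming 4 KiB pages (get_order()
is the kernel helper):

  /* ilog2(256) - PAGE_SHIFT = 8 - 12 = -4  -> the bogus "order: -4"
   * get_order(256)          = 0            -> one order-0 page
   * get_order() rounds sub-page sizes up to a whole page, so the
   * reported order can never go negative */
  order = get_order(size);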

Link: https://lkml.kernel.org/r/20251028191020.413002-1-isaacmanjarres@google.com
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:44 -08:00
Kiryl Shutsemau
fa04f5b60f mm/truncate: unmap large folio on split failure
Accesses within a VMA, but beyond i_size rounded up to PAGE_SIZE, are
supposed to generate SIGBUS.

This behavior might not be respected on truncation.

During truncation, the kernel splits a large folio in order to reclaim
memory.  As a side effect, it unmaps the folio and destroys PMD mappings
of the folio.  The folio will be refaulted as PTEs and SIGBUS semantics
are preserved.

However, if the split fails, PMD mappings are preserved and the user will
not receive SIGBUS on any accesses within the PMD.

Unmap the folio on split failure.  It will lead to refault as PTEs and
preserve SIGBUS semantics.

Make an exception for shmem/tmpfs, which has long been intentionally
mapped with PMDs across i_size.
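
A sketch of the fallback (split_failed stands in for the failed split
call; unmap_mapping_folio() and shmem_mapping() are the real helpers):

  /* split failed: the PMD mapping would otherwise survive truncation */
  if (split_failed && !shmem_mapping(folio->mapping))
          unmap_mapping_folio(folio);   /* refault as PTEs, keep SIGBUS */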

Link: https://lkml.kernel.org/r/20251027115636.82382-3-kirill@shutemov.name
Fixes: b9a8a4195c7d ("truncate,shmem: Handle truncates that split large folios")
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:43 -08:00
Kiryl Shutsemau
74207de2ba mm/memory: do not populate page table entries beyond i_size
Patch series "Fix SIGBUS semantics with large folios", v3.

Accessing memory within a VMA, but beyond i_size rounded up to the next
page size, is supposed to generate SIGBUS.

Darrick reported[1] an xfstests regression in v6.18-rc1.  generic/749
failed due to missing SIGBUS.  This was caused by my recent changes that
try to fault in the whole folio where possible:

        19773df031bc ("mm/fault: try to map the entire file folio in finish_fault()")
        357b92761d94 ("mm/filemap: map entire large folio faultaround")

These changes did not consider i_size when setting up PTEs, leading to
xfstest breakage.

However, the problem has been present in the kernel for a long time -
since huge tmpfs was introduced in 2016.  The kernel happily maps
PMD-sized folios as PMD without checking i_size.  And huge=always tmpfs
allocates PMD-size folios on any writes.

I considered this corner case when I implemented a large tmpfs, and my
conclusion was that no one in their right mind should rely on receiving a
SIGBUS signal when accessing beyond i_size.  I cannot imagine how it could
be useful for the workload.

But apparently filesystem folks care a lot about preserving strict SIGBUS
semantics.

Generic/749 was introduced last year with reference to POSIX, but no real
workloads were mentioned.  It also acknowledged the tmpfs deviation from
the test case.

POSIX indeed says[3]:

        References within the address range starting at pa and
        continuing for len bytes to whole pages following the end of an
        object shall result in delivery of a SIGBUS signal.

The patchset fixes the regression introduced by recent changes as well as
more subtle SIGBUS breakage due to split failure on truncation.


This patch (of 2):

Accesses within a VMA, but beyond i_size rounded up to PAGE_SIZE, are
supposed to generate SIGBUS.

Recent changes attempted to fault in full folio where possible.  They did
not respect i_size, which led to populating PTEs beyond i_size and
breaking SIGBUS semantics.

Darrick reported generic/749 breakage because of this.

However, the problem existed before the recent changes.  With huge=always
tmpfs, any write to a file leads to PMD-size allocation.  Following the
fault-in of the folio will install PMD mapping regardless of i_size.

Fix filemap_map_pages() and finish_fault() to not install:
  - PTEs beyond i_size;
  - PMD mappings across i_size;

Make an exception for shmem/tmpfs, which has long been intentionally
mapped with PMDs across i_size.
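
A sketch of the clamping in the fault path (index variables are
illustrative; i_size_read(), DIV_ROUND_UP() and folio_next_index() are
the real helpers):

  pgoff_t file_end = DIV_ROUND_UP(i_size_read(mapping->host),
                                  PAGE_SIZE) - 1;

  /* 1) never install PTEs for indices past i_size */
  if (end_pgoff > file_end)
          end_pgoff = file_end;

  /* 2) only map a PMD if the whole folio lies below i_size
   *    (shmem/tmpfs keeps its historical behaviour) */
  if (!shmem_mapping(mapping) && file_end < folio_next_index(folio) - 1)
          use_pmd = false;      /* illustrative: fall back to PTEs */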

Link: https://lkml.kernel.org/r/20251027115636.82382-1-kirill@shutemov.name
Link: https://lkml.kernel.org/r/20251027115636.82382-2-kirill@shutemov.name
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Fixes: 6795801366da ("xfs: Support large folios")
Reported-by: "Darrick J. Wong" <djwong@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:43 -08:00
Wei Yang
895b4c0c79 fs/proc: fix uaf in proc_readdir_de()
The pde is erased from the subdir rbtree through rb_erase(), but the node
is not set to EMPTY, which may result in UAF access.  We should use
RB_CLEAR_NODE() to set the erased node to EMPTY; then pde_subdir_next()
will return NULL, avoiding the UAF access.

We found a UAF issue while testing with stress-ng; the getdent and tun
test cases need to run at the same time.  The steps of the issue are as
follows:

1) use getdent to traverse dir /proc/pid/net/dev_snmp6/; the current
   pde is tun3;

2) in the [time window], unregister netdevices tun3 and tun2, and erase
   them from the rbtree: erase tun3 first, and then erase tun2.  The
   pde(tun2) will be released to slab;

3) continue the getdent process; pde_subdir_next() will then return the
   released pde(tun2), which causes the UAF access.

CPU 0                                      |    CPU 1
-------------------------------------------------------------------------
traverse dir /proc/pid/net/dev_snmp6/      |   unregister_netdevice(tun->dev)   //tun3 tun2
sys_getdents64()                           |
  iterate_dir()                            |
    proc_readdir()                         |
      proc_readdir_de()                    |     snmp6_unregister_dev()
        pde_get(de);                       |       proc_remove()
        read_unlock(&proc_subdir_lock);    |         remove_proc_subtree()
                                           |           write_lock(&proc_subdir_lock);
        [time window]                      |           rb_erase(&root->subdir_node, &parent->subdir);
                                           |           write_unlock(&proc_subdir_lock);
        read_lock(&proc_subdir_lock);      |
        next = pde_subdir_next(de);        |
        pde_put(de);                       |
        de = next;    //UAF                |

rbtree of dev_snmp6
                        |
                    pde(tun3)
                     /    \
                  NULL  pde(tun2)
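
A sketch of the fix (rb_erase()/RB_CLEAR_NODE() are the rbtree
primitives; the call site in proc's remove path is abridged):

  rb_erase(&de->subdir_node, &parent->subdir);
  /* mark the node EMPTY so a reader still holding this pde gets NULL
   * from pde_subdir_next() instead of walking into freed memory */
  RB_CLEAR_NODE(&de->subdir_node);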

Link: https://lkml.kernel.org/r/20251025024233.158363-1-albin_yang@163.com
Signed-off-by: Wei Yang <albinwyang@tencent.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: wangzijie <wangzijie1@honor.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:43 -08:00
Zi Yan
fa5a061700 mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 order
A folio split clears PG_has_hwpoisoned, but when a folio is split to >0
order, the flag should be preserved in after-split folios containing
pages with the PG_hwpoisoned flag.  Scan all pages in a to-be-split folio
to determine which after-split folios need the flag.

An alternative is to change PG_has_hwpoisoned to PG_maybe_hwpoisoned to
avoid the scan and set it on all after-split folios, but the resulting
false positives have an undesirable negative impact.  To remove the false
positives, callers of folio_test_has_hwpoisoned() and
folio_contain_hwpoisoned_page() would need to do the scan themselves.
That would be a hassle for current and future callers and more costly
than doing the scan in the split code.  More details are discussed in [1].

This issue can be exposed via:
1. splitting a has_hwpoisoned folio to >0 order from debugfs interface;
2. truncating part of a has_hwpoisoned folio in
   truncate_inode_partial_folio().

And later accesses to a hwpoisoned page could be possible due to the
missing has_hwpoisoned folio flag.  This will lead to MCE errors.
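
A sketch of the per-folio scan after the split (new_folio and the
iteration over the after-split folios are abridged and illustrative):

  /* after splitting 'folio', tag each after-split folio that still
   * contains a poisoned page */
  if (folio_test_has_hwpoisoned(folio)) {
          long i;

          for (i = 0; i < folio_nr_pages(new_folio); i++) {
                  if (PageHWPoison(folio_page(new_folio, i))) {
                          folio_set_has_hwpoisoned(new_folio);
                          break;
                  }
          }
  }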

Link: https://lore.kernel.org/all/CAHbLzkoOZm0PXxE9qwtF4gKR=cpRXrSrJ9V9Pm2DJexs985q4g@mail.gmail.com/ [1]
Link: https://lkml.kernel.org/r/20251023030521.473097-1-ziy@nvidia.com
Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages")
Signed-off-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <yang@os.amperecomputing.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: Pankaj Raghav <kernel@pankajraghav.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Luis Chamberalin <mcgrof@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:43 -08:00
Pedro Demarchi Gomes
f5548c318d ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Currently, scan_get_next_rmap_item() walks every page address in a VMA to
locate mergeable pages.  This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions, causing
ksmd to use a large amount of CPU without deduplicating many pages.

This patch replaces the per-address lookup with a range walk using
walk_page_range().  The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.  This problem was
previously discussed in [1].

Consider the following test program which creates a 32 TiB mapping in the
virtual address space but only populates a single page:

#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>

/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

int main() {
        char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);

        if (area == MAP_FAILED) {
                perror("mmap() failed\n");
                return -1;
        }

        /* Populate a single page such that we get an anon_vma. */
        *area = 0;

        /* Enable KSM. */
        madvise(area, size, MADV_MERGEABLE);
        pause();
        return 0;
}

$ ./ksm-sparse  &
$ echo 1 > /sys/kernel/mm/ksm/run 

Without this patch, ksmd uses 100% of the CPU for a long time (more than 1
hour on my test machine) scanning all of the 32 TiB virtual address space
that contains only one mapped page.  This leaves ksmd essentially
deadlocked, unable to deduplicate anything of value.  With this patch, ksmd
walks only the one mapped page and skips the rest of the 32 TiB virtual
address space, making the scan fast and using little CPU.
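
A sketch of the range-walk hookup (walk_page_range() and struct
mm_walk_ops are the real pagewalk API; ksm_pmd_entry and scan_private
are illustrative):

  static const struct mm_walk_ops ksm_scan_ops = {
          .pmd_entry = ksm_pmd_entry,   /* visits only populated ranges */
          .walk_lock = PGWALK_RDLOCK,
  };

  /* holes in the VMA are skipped wholesale instead of probed per page */
  walk_page_range(mm, start, end, &ksm_scan_ops, &scan_private);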

Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com
Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com
Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1]
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: xu xin <xu.xin16@zte.com.cn>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:42 -08:00
Aleksei Nikiforov
7e76b75e5a mm/kmsan: fix kmsan kmalloc hook when no stack depots are allocated yet
If no stack depot is allocated yet, KMSAN called from kmalloc cannot
allocate one, because the __GFP_RECLAIM flags are masked out.  KMSAN then
fails to record origins, which may result in it failing to report issues.

Reusing flags from kmalloc without modifying them should be safe for kmsan.
For example, such a chain of calls is possible:
test_uninit_kmalloc -> kmalloc -> __kmalloc_cache_noprof ->
slab_alloc_node -> slab_post_alloc_hook ->
kmsan_slab_alloc -> kmsan_internal_poison_memory.

Only when it is called in a context without flags present should
__GFP_RECLAIM flags be masked.

With this change all kmsan tests start working reliably.
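
A sketch of where the flags flow (kmsan_slab_alloc() is the slab hook
from the call chain above; the internal call is abridged and
illustrative):

  /* keep the caller's reclaim bits so stack depot can allocate its
   * pool; the masking (flags & ~__GFP_RECLAIM) is reserved for
   * contexts that have no usable GFP flags */
  kmsan_internal_poison_memory(object, s->object_size, flags,
                               KMSAN_POISON_CHECK);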

Eric reported:

: Yes, KMSAN seems to be at least partially broken currently.  Besides the
: fact that the kmsan KUnit test is currently failing (which I reported at
: https://lore.kernel.org/r/20250911175145.GA1376@sol), I've confirmed that
: the poly1305 KUnit test causes a KMSAN warning with Aleksei's patch
: applied but does not cause a warning without it.  The warning did get
: reached via syzbot somehow
: (https://lore.kernel.org/r/751b3d80293a6f599bb07770afcef24f623c7da0.1761026343.git.xiaopei01@kylinos.cn/),
: so KMSAN must still work in some cases.  But it didn't work for me.

Link: https://lkml.kernel.org/r/20250930115600.709776-2-aleksei.nikiforov@linux.ibm.com
Link: https://lkml.kernel.org/r/20251022030213.GA35717@sol
Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation")
Signed-off-by: Aleksei Nikiforov <aleksei.nikiforov@linux.ibm.com>
Reviewed-by: Alexander Potapenko <glider@google.com>
Tested-by: Eric Biggers <ebiggers@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Marco Elver <elver@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:42 -08:00
Kairui Song
fc745ff317 mm/shmem: fix THP allocation and fallback loop
The order check and fallback loop is updating the index value on every
loop.  This will cause the index to be wrongly aligned by a larger value
while the loop shrinks the order.

This may result in inserting and returning a folio of the wrong index and
cause data corruption with some userspace workloads [1].
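
A sketch of the loop with the temporary (shmem_alloc_folio() and
next_order() follow mm/shmem.c; the fallback iteration is abridged):

  while (orders) {
          /* align a copy, so 'index' itself is never re-rounded by a
           * larger order from an earlier iteration */
          pgoff_t aligned_index = round_down(index, 1 << order);

          folio = shmem_alloc_folio(gfp, order, info, aligned_index);
          if (folio)
                  return folio;
          order = next_order(&orders, order);
  }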

[kasong@tencent.com: introduce a temporary variable to improve code]
  Link: https://lkml.kernel.org/r/20251023065913.36925-1-ryncsn@gmail.com
  Link: https://lore.kernel.org/linux-mm/CAMgjq7DqgAmj25nDUwwu1U2cSGSn8n4-Hqpgottedy0S6YYeUw@mail.gmail.com/ [1]
Link: https://lkml.kernel.org/r/20251022105719.18321-1-ryncsn@gmail.com
Fixes: e7a2ab7b3bb5 ("mm: shmem: add mTHP support for anonymous shmem")
Closes: https://lore.kernel.org/linux-mm/CAMgjq7DqgAmj25nDUwwu1U2cSGSn8n4-Hqpgottedy0S6YYeUw@mail.gmail.com/
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:42 -08:00
Pasha Tatashin
fa759cd75b kho: allocate metadata directly from the buddy allocator
KHO allocates metadata for its preserved memory map using the slab
allocator via kzalloc().  This metadata is temporary and is used by the
next kernel during early boot to find preserved memory.

A problem arises when KFENCE is enabled.  kzalloc() calls can be randomly
intercepted by kfence_alloc(), which services the allocation from a
dedicated KFENCE memory pool.  This pool is allocated early in boot via
memblock.

When booting via KHO, the memblock allocator is restricted to a "scratch
area", forcing the KFENCE pool to be allocated within it.  This creates a
conflict, as the scratch area is expected to be ephemeral and
overwriteable by a subsequent kexec.  If KHO metadata is placed in this
KFENCE pool, it leads to memory corruption when the next kernel is loaded.

To fix this, modify KHO to allocate its metadata directly from the buddy
allocator instead of slab.
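
A sketch of the allocation change (get_zeroed_page()/free_page() are the
buddy-backed helpers; KFENCE only hooks the slab allocator, so these can
never land in its pool):

  /* was: kzalloc(PAGE_SIZE, GFP_KERNEL), which KFENCE may intercept */
  void *elm = (void *)get_zeroed_page(GFP_KERNEL);

  /* matching teardown */
  free_page((unsigned long)elm);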

Link: https://lkml.kernel.org/r/20251021000852.2924827-4-pasha.tatashin@soleen.com
Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation")
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: David Matlack <dmatlack@google.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:42 -08:00
Pasha Tatashin
a2fff99f92 kho: increase metadata bitmap size to PAGE_SIZE
KHO memory preservation metadata is preserved in 512-byte chunks, which
requires allocating them from the slab allocator.  Slabs are not safe to
use with KHO because of KFENCE, and because partial slabs may leak into
the next kernel.  Change the size to PAGE_SIZE.

KFENCE specifically may cause memory corruption: it randomly services
slab allocations with objects that can lie within the scratch area.  The
reason is that KFENCE allocates its objects before the KHO scratch area
is marked as a CMA region.

While this change could potentially increase metadata overhead on systems
with sparsely preserved memory, this is being mitigated by ongoing work to
reduce sparseness during preservation via 1G guest pages.  Furthermore,
this change aligns with future work on a stateless KHO, which will also
use page-sized bitmaps for its radix tree metadata.

Link: https://lkml.kernel.org/r/20251021000852.2924827-3-pasha.tatashin@soleen.com
Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation")
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Matlack <dmatlack@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:41 -08:00
Pasha Tatashin
e38f65d317 kho: warn and fail on metadata or preserved memory in scratch area
Patch series "KHO: kfence + KHO memory corruption fix", v3.

This series fixes a memory corruption bug in KHO that occurs when KFENCE
is enabled.

The root cause is that KHO metadata, allocated via kzalloc(), can be
randomly serviced by kfence_alloc().  When a kernel boots via KHO, the
early memblock allocator is restricted to a "scratch area".  This forces
the KFENCE pool to be allocated within this scratch area, creating a
conflict.  If KHO metadata is subsequently placed in this pool, it gets
corrupted during the next kexec operation.

Google is using KHO and has had obscure crashes due to this memory
corruption, with stacks all over the place.  I would prefer this fix to be
properly backported to stable so we can also consume it automatically once
we switch to the upstream KHO.

Patch 1/3 introduces a debug-only feature (CONFIG_KEXEC_HANDOVER_DEBUG)
that adds checks to detect and fail any operation that attempts to place
KHO metadata or preserved memory within the scratch area.  This serves as
a validation and diagnostic tool to confirm the problem without affecting
production builds.

Patch 2/3 Increases bitmap to PAGE_SIZE, so buddy allocator can be used.

Patch 3/3 Provides the fix by modifying KHO to allocate its metadata
directly from the buddy allocator instead of slab.  This bypasses the
KFENCE interception entirely.


This patch (of 3):

It is invalid for KHO metadata or preserved memory regions to be located
within the KHO scratch area, as this area is overwritten when the next
kernel is loaded, and used early in boot by the next kernel.  This can
lead to memory corruption.

Add checks to kho_preserve_* and KHO's internal metadata allocators
(xa_load_or_alloc, new_chunk) to verify that the physical address of the
memory does not overlap with any defined scratch region.  If an overlap is
detected, the operation will fail and a WARN_ON is triggered.  To avoid
performance overhead in production kernels, these checks are enabled only
when CONFIG_KEXEC_HANDOVER_DEBUG is selected.
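
A sketch of the check (kho_scratch[]/kho_scratch_cnt follow KHO's
scratch bookkeeping; the helper name and call site are illustrative):

  static bool kho_in_scratch(phys_addr_t phys, size_t size)
  {
          unsigned int i;

          for (i = 0; i < kho_scratch_cnt; i++) {
                  phys_addr_t start = kho_scratch[i].addr;
                  phys_addr_t end = start + kho_scratch[i].size;

                  if (phys < end && phys + size > start)
                          return true;    /* overlaps a scratch region */
          }
          return false;
  }

  /* in kho_preserve_*() and the metadata allocators, debug builds: */
  if (WARN_ON_ONCE(kho_in_scratch(phys, size)))
          return -EINVAL;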

[rppt@kernel.org: fix KEXEC_HANDOVER_DEBUG Kconfig dependency]
  Link: https://lkml.kernel.org/r/aQHUyyFtiNZhx8jo@kernel.org
[pasha.tatashin@soleen.com: build fix]
  Link: https://lkml.kernel.org/r/CA+CK2bBnorfsTymKtv4rKvqGBHs=y=MjEMMRg_tE-RME6n-zUw@mail.gmail.com
Link: https://lkml.kernel.org/r/20251021000852.2924827-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20251021000852.2924827-2-pasha.tatashin@soleen.com
Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation")
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Mike Rapoport <rppt@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Matlack <dmatlack@google.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:41 -08:00
Zi Yan
77008e1b2e mm/huge_memory: do not change split_huge_page*() target order silently
Page cache folios from a file system that support large block size (LBS)
can have minimal folio order greater than 0, thus a high order folio might
not be able to be split down to order-0.  Commit e220917fa507 ("mm: split
a folio in minimum folio order chunks") bumps the target order of
split_huge_page*() to the minimum allowed order when splitting a LBS
folio.  This causes confusion for some split_huge_page*() callers like
memory failure handling code, since they expect all after-split folios to
have order-0 when the split succeeds, but in reality they get folios of
min_order_for_split() order and emit warnings.

Fix it by failing a split if the folio cannot be split to the target
order.  Rename try_folio_split() to try_folio_split_to_order() to reflect
the added new_order parameter.  Remove its unused list parameter.

[The test poisons LBS folios, which cannot be split to order-0 folios, and
also tries to poison all memory.  The non-split LBS folios take more
memory than the test anticipated, leading to OOM.  The patch fixed the
kernel warning; the test needs some changes to avoid the OOM.]
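
With the renamed helper, a caller might now look roughly like this (a
sketch only; min_order_for_split() supplies the lower bound for the
target order, and the error handling here is illustrative):

  int order = min_order_for_split(folio);

  if (order < 0)
          return order;
  /* Fails outright if the folio cannot reach the target order. */
  if (try_folio_split_to_order(folio, page, order))
          return -EBUSY;  /* split failed */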

Link: https://lkml.kernel.org/r/20251017013630.139907-1-ziy@nvidia.com
Fixes: e220917fa507 ("mm: split a folio in minimum folio order chunks")
Signed-off-by: Zi Yan <ziy@nvidia.com>
Reported-by: syzbot+e6367ea2fdab6ed46056@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68d2c943.a70a0220.1b52b.02b3.GAE@google.com/
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Barry Song <baohua@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Mariano Pache <npache@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-09 21:19:41 -08:00
Adrian Barnaś
8e8ae78896 arm64: Reject modules with internal alternative callbacks
During module loading, check whether a callback function used by the
alternatives specified in the '.altinstructions' ELF section (if present)
is located in core kernel .text.  If not, fail module loading before the
callback is called.
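
The core of the check, as it lands in __apply_alternatives() (see the
diff below), rejects module-local callbacks before they can run:

  if (ALT_HAS_CB(alt)) {
          alt_cb = ALT_REPL_PTR(alt);
          if (is_module && !core_kernel_text((unsigned long)alt_cb))
                  return -ENOEXEC;
  }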

Reported-by: Fanqin Cui <cuifq1@chinatelecom.cn>
Closes: https://lore.kernel.org/all/20250807072700.348514-1-fanqincui@163.com/
Signed-off-by: Adrian Barnaś <abarnas@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
[will: Folded in 'noinstr' tweak from Mark]
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 15:00:14 +00:00
Adrian Barnaś
6d4a0fbd34 arm64: Fail module loading if dynamic SCS patching fails
Disallow a module from loading if dynamic SCS patching fails for its
code.  For module loading, instead of running a dry run to check for
patching errors, attempt the patching directly and propagate any errors so
that module loading fails.
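
Condensed from the module_finalize() diff below, patching now runs
directly on the module's .init.eh_frame section and any error rejects
the module:

  ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true /* skip_dry_run */);
  if (ret)
          return -ENOEXEC;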

Signed-off-by: Adrian Barnaś <abarnas@google.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 15:00:09 +00:00
shechenglong
7f16357378 arm64: proton-pack: Fix hard lockup due to print in scheduler context
Relocate the printk() calls from spectre_v4_mitigations_off() and
spectre_v2_mitigations_off() into the setup_system_capabilities()
function, preventing hard lockups caused by printk() calls in scheduler
context:

  | _raw_spin_lock_nested+168
  | ttwu_queue+180 (rq_lock(rq, &rf); 2nd acquiring the rq->__lock)
  | try_to_wake_up+548
  | wake_up_process+32
  | __up+88
  | up+100
  | __up_console_sem+96
  | console_unlock+696
  | vprintk_emit+428
  | vprintk_default+64
  | vprintk_func+220
  | printk+104
  | spectre_v4_enable_task_mitigation+344
  | __switch_to+100
  | __schedule+1028 (rq_lock(rq, &rf); 1st acquiring the rq->__lock)
  | schedule_idle+48
  | do_idle+388
  | cpu_startup_entry+44
  | secondary_start_kernel+352

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: shechenglong <shechenglong@xfusion.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:49:12 +00:00
shechenglong
62e72463ca arm64: proton-pack: Drop print when !CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY
Following the pattern established with other Spectre mitigations,
do not print a message when the CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY
Kconfig option is disabled.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: shechenglong <shechenglong@xfusion.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:49:09 +00:00
Ryan Roberts
53357f14f9 arm64: mm: Tidy up force_pte_mapping()
Tidy up the implementation of force_pte_mapping() to make it easier to
read and introduce the split_leaf_mapping_possible() helper to reduce
code duplication in split_kernel_leaf_mapping() and
arch_kfence_init_pool().

Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Yang Shi <yang@os.amperecomputing.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:43:15 +00:00
Ryan Roberts
40a292f701 arm64: mm: Optimize range_split_to_ptes()
Enter lazy_mmu mode while splitting a range of memory to pte mappings.
This causes barriers, which would otherwise be emitted after every pte
(and pmd/pud) write, to be deferred until exiting lazy_mmu mode.

For large systems, this is expected to significantly speed up fallback
to pte-mapping the linear map for the case where the boot CPU has
BBML2_NOABORT, but secondary CPUs do not. I haven't directly measured
it, but this is equivalent to commit 1fcb7cea8a5f ("arm64: mm: Batch dsb
and isb when populating pgtables").

Note that for the path from arch_kfence_init_pool(), we may sleep while
allocating memory inside the lazy_mmu mode. Sleeping is not allowed by
generic code inside lazy_mmu, but we know that the arm64 implementation
is sleep-safe. So this is ok and follows the same pattern already used
by split_kernel_leaf_mapping().

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Yang Shi <yang@os.amperecomputing.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:43:15 +00:00
Ryan Roberts
ce2b3a50ad arm64: mm: Don't sleep in split_kernel_leaf_mapping() when in atomic context
It has been reported that split_kernel_leaf_mapping() is trying to sleep
in a non-sleepable context. It does this when acquiring the
pgtable_split_lock mutex if either CONFIG_DEBUG_PAGEALLOC or
CONFIG_KFENCE is enabled, both of which change linear map permissions in
softirq context during memory allocation and/or freeing. All other paths
into this function are called from sleepable context and so are safe.

But it turns out that the memory whose permissions these two features may
attempt to modify is always pte-mapped, so there is no need to attempt to
split the mapping. So let's exit early in these cases and avoid taking
the mutex.

There is one wrinkle to this approach: late-initialized kfence allocates
its pool from the buddy allocator, which may be block-mapped. So we must
hook that allocation and convert it to pte-mappings up front. Previously
this was done as a side-effect of kfence protecting all the individual
pages in its pool at init-time, but this no longer works due to the added
early exit path in split_kernel_leaf_mapping().

So instead, do this via the existing arch_kfence_init_pool() arch hook,
and reuse the existing linear_map_split_to_ptes() infrastructure.
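
Condensed from the diff below, the hook takes the split mutex and
pte-maps the pool once, up front:

  bool arch_kfence_init_pool(void)
  {
          unsigned long start = (unsigned long)__kfence_pool;
          int ret;

          /* Already pte-mapped (or unsplittable): nothing to do. */
          if (!split_leaf_mapping_possible() || kfence_early_init)
                  return true;

          mutex_lock(&pgtable_split_lock);
          ret = range_split_to_ptes(start, start + KFENCE_POOL_SIZE,
                                    GFP_PGTABLE_KERNEL);
          mutex_unlock(&pgtable_split_lock);
          return !ret;
  }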

Closes: https://lore.kernel.org/all/f24b9032-0ec9-47b1-8b95-c0eeac7a31c5@roeck-us.net/
Fixes: a166563e7ec3 ("arm64: mm: support large block mapping when rodata=full")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <groeck@google.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: David Hildenbrand (Red Hat) <david@kernel.org>
Reviewed-by: Yang Shi <yang@os.amperecomputing.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:43:15 +00:00
Yang Shi
0ec364c0c9 arm64: kprobes: check the return value of set_memory_rox()
Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
rodata=full"), __change_memory_common() is more likely to fail due to
memory allocation failure when splitting the page table. So check the
return value of set_memory_rox() and bail out if it fails; otherwise we
may end up with an RW memory mapping for the kprobes insn page.
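
Condensed from the alloc_insn_page() diff below, the error path now
frees the page instead of handing out a mapping that may still be
writable:

  if (set_memory_rox((unsigned long)addr, 1)) {
          execmem_free(addr);
          return NULL;
  }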

Fixes: 195a1b7d8388 ("arm64: kprobes: call set_memory_rox() for kprobe page")
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:30:22 +00:00
Punit Agrawal
7991fda619 arm64: acpi: Drop message logging SPCR default console
Commit f5a4af3c7527 ("ACPI: Add acpi=nospcr to disable ACPI SPCR as
default console on ARM64") introduced a command line parameter to
prevent using the SPCR-provided console as the default. It also
introduced a message to log this choice.

Drop the message as it is not particularly useful and can be incorrect
in situations where no SPCR is provided by the firmware.

Link: https://lore.kernel.org/all/aQN0YWUYaPYWpgJM@willie-the-truck/
Signed-off-by: Punit Agrawal <punit.agrawal@oss.qualcomm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:23:33 +00:00
Punit Agrawal
eeb8c19896 Revert "ACPI: Suppress misleading SPCR console message when SPCR table is absent"
This reverts commit bad3fa2fb9206f4dcec6ddef094ec2fbf6e8dcb2.

Commit bad3fa2fb920 ("ACPI: Suppress misleading SPCR console message
when SPCR table is absent") mistakenly assumes that acpi_parse_spcr()
returning 0 indicates a failure to parse SPCR. While addressing the
resulting incorrect logging, it was deemed better to drop the message
altogether, as it is not particularly useful.

Roll back the commit introducing the bug as a step towards dropping
the log message.

Link: https://lore.kernel.org/all/aQN0YWUYaPYWpgJM@willie-the-truck/
Signed-off-by: Punit Agrawal <punit.agrawal@oss.qualcomm.com>
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:23:33 +00:00
Catalin Marinas
535fdfc5a2 arm64: Use load LSE atomics for the non-return per-CPU atomic operations
The non-return per-CPU this_cpu_*() atomic operations are implemented as
STADD/STCLR/STSET when FEAT_LSE is available. On many microarchitecture
implementations, these instructions tend to be executed "far" in the
interconnect or memory subsystem (unless the data is already in the L1
cache). This is in general more efficient when there is contention as it
avoids bouncing cache lines between CPUs. The load atomics (e.g. LDADD
without XZR as destination), OTOH, tend to be executed "near" with the
data loaded into the L1 cache.

STADD instructions executed back to back, as in srcu_read_{lock,unlock}*(),
incur additional overhead due to the default posting behaviour on several
CPU implementations. Since the per-CPU atomics are unlikely to be used
concurrently on the same memory location, encourage the hardware to
execute them "near" by issuing load atomics - LDADD/LDCLR/LDSET - with
the destination register unused (but not XZR).
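
In instruction terms the change looks like this (register names are
illustrative; w2 stands for the otherwise-unused scratch destination):

  stadd w1, [x0]      // store form: tends to execute "far"
  ldadd w1, w2, [x0]  // load form: tends to execute "near", result discarded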

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Palmer Dabbelt <palmer@dabbelt.com>
[will: Add comment and link to the discussion thread]
Signed-off-by: Will Deacon <will@kernel.org>
2025-11-07 14:20:07 +00:00
Zilin Guan
c367af440e btrfs: release root after error in data_reloc_print_warning_inode()
data_reloc_print_warning_inode() calls btrfs_get_fs_root() to obtain
local_root, but fails to release its reference when paths_from_inode()
returns an error. This causes a potential memory leak.

Add a missing btrfs_put_root() call in the error path to properly
decrease the reference count of local_root.

Fixes: b9a9a85059cde ("btrfs: output affected files when relocation fails")
CC: stable@vger.kernel.org # 6.6+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Zilin Guan <zilin@seu.edu.cn>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-05 20:01:12 +01:00
Zilin Guan
5fea61aa1c btrfs: scrub: put bio after errors in scrub_raid56_parity_stripe()
scrub_raid56_parity_stripe() allocates a bio with bio_alloc(), but
fails to release it on some error paths, leading to a potential
memory leak.

Add the missing bio_put() calls to properly drop the bio reference
in those error cases.

Fixes: 1009254bf22a3 ("btrfs: scrub: use scrub_stripe to implement RAID56 P/Q scrub")
CC: stable@vger.kernel.org # 6.6+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Zilin Guan <zilin@seu.edu.cn>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-05 20:01:12 +01:00
Filipe Manana
bfe3d755ef btrfs: do not update last_log_commit when logging inode due to a new name
When logging that a new name exists, we skip updating the inode's
last_log_commit field to prevent a later explicit fsync against the inode
from doing nothing (as updating last_log_commit makes btrfs_inode_in_log()
return true). We are detecting, at btrfs_log_inode(), that logging a new
name is happening by checking that the logging mode is not
LOG_INODE_EXISTS, but that is not enough because we may log parent
directories when logging a new name of a file in LOG_INODE_ALL mode - we
need to check the logging_new_name field of the log context too.

An example scenario where this results in an explicit fsync against a
directory not persisting changes to the directory is the following:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt

  $ touch /mnt/foo

  $ sync

  $ mkdir /mnt/dir

  # Write some data to our file and fsync it.
  $ xfs_io -c "pwrite -S 0xab 0 64K" -c "fsync" /mnt/foo

  # Add a new link to our file. Since the file was logged before, we
  # update it in the log tree by calling btrfs_log_new_name().
  $ ln /mnt/foo /mnt/dir/bar

  # fsync the root directory - we expect it to persist the dentry for
  # the new directory "dir".
  $ xfs_io -c "fsync" /mnt

  <power fail>

After mounting the fs the entry for directory "dir" does not exist,
despite the explicit fsync on the root directory.

Here's why this happens:

1) When we fsync the file we log the inode, so that it's present in the
   log tree;

2) When adding the new link we enter btrfs_log_new_name(), and since the
   inode is in the log tree we proceed to updating the inode in the log
   tree;

3) We first set the inode's last_unlink_trans to the current transaction
   (early in btrfs_log_new_name());

4) We then eventually enter btrfs_log_inode_parent(), and after logging
   the file's inode, we call btrfs_log_all_parents() because the inode's
   last_unlink_trans matches the current transaction's ID (updated in the
   previous step);

5) So btrfs_log_all_parents() logs the root directory by calling
   btrfs_log_inode() for the root's inode with a log mode of LOG_INODE_ALL
   so that new dentries are logged;

6) At btrfs_log_inode(), because the log mode is LOG_INODE_ALL, we
   update root inode's last_log_commit to the last transaction that
   changed the inode (->last_sub_trans field of the inode), which
   corresponds to the current transaction's ID;

7) Then later when user space explicitly calls fsync against the root
   directory, we enter btrfs_sync_file(), which calls skip_inode_logging()
   and that returns true, since its call to btrfs_inode_in_log() returns
   true and there are no ordered extents (it's a directory, never has
   ordered extents). This results in btrfs_sync_file() returning without
   syncing the log or committing the current transaction, so all the
   updates we did when logging the new name, including logging the root
   directory, are not persisted.

So fix this by only updating the inode's last_log_commit if we are sure
we are not logging a new name (that is, if ctx->logging_new_name is false).

A test case for fstests will follow soon.

Reported-by: Vyacheslav Kovalevsky <slava.kovalevskiy.2014@gmail.com>
Link: https://lore.kernel.org/linux-btrfs/03c5d7ec-5b3d-49d1-95bc-8970a7f82d87@gmail.com/
Fixes: 130341be7ffa ("btrfs: always update the logged transaction when logging new names")
CC: stable@vger.kernel.org # 6.1+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-05 20:01:01 +01:00
Naohiro Aota
6a1ab50135 btrfs: zoned: fix stripe width calculation
The stripe offset calculation in the zoned code for raid0 and raid10
wrongly uses map->stripe_size. In fact, map->stripe_size is the size of
the device extent composing the block group, which is always the
zone_size on a zoned setup.

Fix it by using BTRFS_STRIPE_LEN and BTRFS_STRIPE_LEN_SHIFT. Also, optimize
the calculation a bit by doing the common calculation only once.
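
For raid0, the common part now happens once before the per-stripe loop
(see the diff below), with BTRFS_STRIPE_LEN_SHIFT/MASK replacing the
erroneous map->stripe_size arithmetic:

  if (last_alloc) {
          u32 factor = map->num_stripes;

          stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
          stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
          stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
  }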

Fixes: c0d90a79e8e6 ("btrfs: zoned: fix alloc_offset calculation for partly conventional block groups")
CC: stable@vger.kernel.org # 6.17+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-05 20:00:08 +01:00
Naohiro Aota
94f54924b9 btrfs: zoned: fix conventional zone capacity calculation
When a block group contains both conventional and sequential zones, the
capacity of the block group is wrongly set to the block group's full
length. The capacity should be calculated in btrfs_load_block_group_*
using the last allocation offset.

Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree")
CC: stable@vger.kernel.org # v6.12+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2025-11-05 20:00:06 +01:00
49 changed files with 623 additions and 268 deletions

View File

@ -16498,12 +16498,12 @@ F: mm/secretmem.c
MEMORY MANAGEMENT - SWAP
M: Andrew Morton <akpm@linux-foundation.org>
M: Chris Li <chrisl@kernel.org>
M: Kairui Song <kasong@tencent.com>
R: Kemeng Shi <shikemeng@huaweicloud.com>
R: Kairui Song <kasong@tencent.com>
R: Nhat Pham <nphamcs@gmail.com>
R: Baoquan He <bhe@redhat.com>
R: Barry Song <baohua@kernel.org>
R: Chris Li <chrisl@kernel.org>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/mm/swap-table.rst

View File

@ -26,9 +26,12 @@ void __init apply_alternatives_all(void);
bool alternative_is_applied(u16 cpucap);
#ifdef CONFIG_MODULES
void apply_alternatives_module(void *start, size_t length);
int apply_alternatives_module(void *start, size_t length);
#else
static inline void apply_alternatives_module(void *start, size_t length) { }
static inline int apply_alternatives_module(void *start, size_t length)
{
return 0;
}
#endif
void alt_cb_patch_nops(struct alt_instr *alt, __le32 *origptr,

View File

@ -10,8 +10,6 @@
#include <asm/set_memory.h>
static inline bool arch_kfence_init_pool(void) { return true; }
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
set_memory_valid(addr, 1, !protect);
@ -25,6 +23,7 @@ static inline bool arm64_kfence_can_set_direct_map(void)
{
return !kfence_early_init;
}
bool arch_kfence_init_pool(void);
#else /* CONFIG_KFENCE */
static inline bool arm64_kfence_can_set_direct_map(void) { return false; }
#endif /* CONFIG_KFENCE */

View File

@ -77,7 +77,7 @@ __percpu_##name##_case_##sz(void *ptr, unsigned long val) \
" stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n" \
" cbnz %w[loop], 1b", \
/* LSE atomics */ \
#op_lse "\t%" #w "[val], %[ptr]\n" \
#op_lse "\t%" #w "[val], %" #w "[tmp], %[ptr]\n" \
__nops(3)) \
: [loop] "=&r" (loop), [tmp] "=&r" (tmp), \
[ptr] "+Q"(*(u##sz *)ptr) \
@ -124,9 +124,16 @@ PERCPU_RW_OPS(8)
PERCPU_RW_OPS(16)
PERCPU_RW_OPS(32)
PERCPU_RW_OPS(64)
PERCPU_OP(add, add, stadd)
PERCPU_OP(andnot, bic, stclr)
PERCPU_OP(or, orr, stset)
/*
* Use value-returning atomics for CPU-local ops as they are more likely
* to execute "near" to the CPU (e.g. in L1$).
*
* https://lore.kernel.org/r/e7d539ed-ced0-4b96-8ecd-048a5b803b85@paulmck-laptop
*/
PERCPU_OP(add, add, ldadd)
PERCPU_OP(andnot, bic, ldclr)
PERCPU_OP(or, orr, ldset)
PERCPU_RET_OP(add, add, ldadd)
#undef PERCPU_RW_OPS

View File

@ -53,7 +53,7 @@ enum {
EDYNSCS_INVALID_CFA_OPCODE = 4,
};
int __pi_scs_patch(const u8 eh_frame[], int size);
int __pi_scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
#endif /* __ASSEMBLY __ */

View File

@ -117,6 +117,7 @@ void spectre_bhb_patch_wa3(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void spectre_bhb_patch_clearbhb(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
void spectre_print_disabled_mitigations(void);
#endif /* __ASSEMBLY__ */
#endif /* __ASM_SPECTRE_H */

View File

@ -197,8 +197,6 @@ out:
*/
void __init acpi_boot_table_init(void)
{
int ret;
/*
* Enable ACPI instead of device tree unless
* - ACPI has been disabled explicitly (acpi=off), or
@ -252,12 +250,8 @@ done:
* behaviour, use acpi=nospcr to disable console in ACPI SPCR
* table as default serial console.
*/
ret = acpi_parse_spcr(earlycon_acpi_spcr_enable,
acpi_parse_spcr(earlycon_acpi_spcr_enable,
!param_acpi_nospcr);
if (!ret || param_acpi_nospcr || !IS_ENABLED(CONFIG_ACPI_SPCR_TABLE))
pr_info("Use ACPI SPCR as default console: No\n");
else
pr_info("Use ACPI SPCR as default console: Yes\n");
if (IS_ENABLED(CONFIG_ACPI_BGRT))
acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);

View File

@ -139,9 +139,9 @@ static noinstr void clean_dcache_range_nopatch(u64 start, u64 end)
} while (cur += d_size, cur < end);
}
static void __apply_alternatives(const struct alt_region *region,
bool is_module,
unsigned long *cpucap_mask)
static int __apply_alternatives(const struct alt_region *region,
bool is_module,
unsigned long *cpucap_mask)
{
struct alt_instr *alt;
__le32 *origptr, *updptr;
@ -166,10 +166,13 @@ static void __apply_alternatives(const struct alt_region *region,
updptr = is_module ? origptr : lm_alias(origptr);
nr_inst = alt->orig_len / AARCH64_INSN_SIZE;
if (ALT_HAS_CB(alt))
if (ALT_HAS_CB(alt)) {
alt_cb = ALT_REPL_PTR(alt);
else
if (is_module && !core_kernel_text((unsigned long)alt_cb))
return -ENOEXEC;
} else {
alt_cb = patch_alternative;
}
alt_cb(alt, origptr, updptr, nr_inst);
@ -193,6 +196,8 @@ static void __apply_alternatives(const struct alt_region *region,
bitmap_and(applied_alternatives, applied_alternatives,
system_cpucaps, ARM64_NCAPS);
}
return 0;
}
static void __init apply_alternatives_vdso(void)
@ -277,7 +282,7 @@ void __init apply_boot_alternatives(void)
}
#ifdef CONFIG_MODULES
void apply_alternatives_module(void *start, size_t length)
int apply_alternatives_module(void *start, size_t length)
{
struct alt_region region = {
.begin = start,
@ -287,7 +292,7 @@ void apply_alternatives_module(void *start, size_t length)
bitmap_fill(all_capabilities, ARM64_NCAPS);
__apply_alternatives(&region, true, &all_capabilities[0]);
return __apply_alternatives(&region, true, &all_capabilities[0]);
}
#endif

View File

@ -95,6 +95,7 @@
#include <asm/vectors.h>
#include <asm/virt.h>
#include <asm/spectre.h>
/* Kernel representation of AT_HWCAP and AT_HWCAP2 */
static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;
@ -3875,6 +3876,11 @@ static void __init setup_system_capabilities(void)
*/
if (system_uses_ttbr0_pan())
pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
/*
* Report Spectre mitigations status.
*/
spectre_print_disabled_mitigations();
}
void __init setup_system_features(void)

View File

@ -489,16 +489,29 @@ int module_finalize(const Elf_Ehdr *hdr,
int ret;
s = find_section(hdr, sechdrs, ".altinstructions");
if (s)
apply_alternatives_module((void *)s->sh_addr, s->sh_size);
if (s) {
ret = apply_alternatives_module((void *)s->sh_addr, s->sh_size);
if (ret < 0) {
pr_err("module %s: error occurred when applying alternatives\n", me->name);
return ret;
}
}
if (scs_is_dynamic()) {
s = find_section(hdr, sechdrs, ".init.eh_frame");
if (s) {
ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size);
if (ret)
/*
* Because we can reject modules that are malformed
* so SCS patching fails, skip dry run and try to patch
* it in place. If patching fails, the module would not
* be loaded anyway.
*/
ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size, true);
if (ret) {
pr_err("module %s: error occurred during dynamic SCS patching (%d)\n",
me->name, ret);
return -ENOEXEC;
}
}
}

View File

@ -476,7 +476,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
folio = page_folio(page);
if (folio_test_hugetlb(folio))
WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio));
WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) &&
!is_huge_zero_folio(folio));
else
WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page));

View File

@ -104,7 +104,7 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
if (enable_scs) {
scs_patch(__eh_frame_start + va_offset,
__eh_frame_end - __eh_frame_start);
__eh_frame_end - __eh_frame_start, false);
asm("ic ialluis");
dynamic_scs_is_enabled = true;

View File

@ -225,7 +225,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame,
return 0;
}
int scs_patch(const u8 eh_frame[], int size)
int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run)
{
int code_alignment_factor = 1;
bool fde_use_sdata8 = false;
@ -277,11 +277,13 @@ int scs_patch(const u8 eh_frame[], int size)
}
} else {
ret = scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, true);
fde_use_sdata8, !skip_dry_run);
if (ret)
return ret;
scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, false);
if (!skip_dry_run)
scs_handle_fde_frame(frame, code_alignment_factor,
fde_use_sdata8, false);
}
p += sizeof(frame->size) + frame->size;

View File

@ -27,7 +27,7 @@ extern pgd_t init_pg_dir[], init_pg_end[];
void init_feature_override(u64 boot_status, const void *fdt, int chosen);
u64 kaslr_early_init(void *fdt, int chosen);
void relocate_kernel(u64 offset);
int scs_patch(const u8 eh_frame[], int size);
int scs_patch(const u8 eh_frame[], int size, bool skip_dry_run);
void map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,

View File

@ -49,7 +49,10 @@ void *alloc_insn_page(void)
addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!addr)
return NULL;
set_memory_rox((unsigned long)addr, 1);
if (set_memory_rox((unsigned long)addr, 1)) {
execmem_free(addr);
return NULL;
}
return addr;
}

View File

@ -91,12 +91,7 @@ early_param("nospectre_v2", parse_spectre_v2_param);
static bool spectre_v2_mitigations_off(void)
{
bool ret = __nospectre_v2 || cpu_mitigations_off();
if (ret)
pr_info_once("spectre-v2 mitigation disabled by command line option\n");
return ret;
return __nospectre_v2 || cpu_mitigations_off();
}
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
@ -421,13 +416,8 @@ early_param("ssbd", parse_spectre_v4_param);
*/
static bool spectre_v4_mitigations_off(void)
{
bool ret = cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
if (ret)
pr_info_once("spectre-v4 mitigation disabled by command-line option\n");
return ret;
return cpu_mitigations_off() ||
__spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;
}
/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
@ -1042,10 +1032,6 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
/* No point mitigating Spectre-BHB alone. */
} else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
} else if (cpu_mitigations_off() || __nospectre_bhb) {
pr_info_once("spectre-bhb mitigation disabled by command line option\n");
} else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
state = SPECTRE_MITIGATED;
set_bit(BHB_HW, &system_bhb_mitigations);
@ -1199,3 +1185,18 @@ void unpriv_ebpf_notify(int new_state)
pr_err("WARNING: %s", EBPF_WARN);
}
#endif
void spectre_print_disabled_mitigations(void)
{
/* Keep a single copy of the common message suffix to avoid duplication. */
const char *spectre_disabled_suffix = "mitigation disabled by command-line option\n";
if (spectre_v2_mitigations_off())
pr_info("spectre-v2 %s", spectre_disabled_suffix);
if (spectre_v4_mitigations_off())
pr_info("spectre-v4 %s", spectre_disabled_suffix);
if (__nospectre_bhb || cpu_mitigations_off())
pr_info("spectre-bhb %s", spectre_disabled_suffix);
}

View File

@ -969,6 +969,16 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
void tag_clear_highpage(struct page *page)
{
/*
* Check if MTE is supported and fall back to clear_highpage().
* get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and
* post_alloc_hook() will invoke tag_clear_highpage().
*/
if (!system_supports_mte()) {
clear_highpage(page);
return;
}
/* Newly allocated page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(page));
mte_zero_clear_page_tags(page_address(page));

View File

@ -708,6 +708,30 @@ out:
return ret;
}
static inline bool force_pte_mapping(void)
{
const bool bbml2 = system_capabilities_finalized() ?
system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
if (debug_pagealloc_enabled())
return true;
if (bbml2)
return false;
return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
}
static inline bool split_leaf_mapping_possible(void)
{
/*
* !BBML2_NOABORT systems should never run into scenarios where we would
* have to split. So exit early and let calling code detect it and raise
* a warning.
*/
if (!system_supports_bbml2_noabort())
return false;
return !force_pte_mapping();
}
static DEFINE_MUTEX(pgtable_split_lock);
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
@ -715,12 +739,11 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
int ret;
/*
* !BBML2_NOABORT systems should not be trying to change permissions on
* anything that is not pte-mapped in the first place. Just return early
* and let the permission change code raise a warning if not already
* pte-mapped.
* Exit early if the region is within a pte-mapped area or if we can't
* split. For the latter case, the permission change code will raise a
* warning if not already pte-mapped.
*/
if (!system_supports_bbml2_noabort())
if (!split_leaf_mapping_possible() || is_kfence_address((void *)start))
return 0;
/*
@ -758,30 +781,30 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
return ret;
}
static int __init split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
gfp_t gfp = *(gfp_t *)walk->private;
pud_t pud = pudp_get(pudp);
int ret = 0;
if (pud_leaf(pud))
ret = split_pud(pudp, pud, GFP_ATOMIC, false);
ret = split_pud(pudp, pud, gfp, false);
return ret;
}
static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
gfp_t gfp = *(gfp_t *)walk->private;
pmd_t pmd = pmdp_get(pmdp);
int ret = 0;
if (pmd_leaf(pmd)) {
if (pmd_cont(pmd))
split_contpmd(pmdp);
ret = split_pmd(pmdp, pmd, GFP_ATOMIC, false);
ret = split_pmd(pmdp, pmd, gfp, false);
/*
* We have split the pmd directly to ptes so there is no need to
@ -793,9 +816,8 @@ static int __init split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
return ret;
}
static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
unsigned long next,
struct mm_walk *walk)
static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
pte_t pte = __ptep_get(ptep);
@ -805,12 +827,24 @@ static int __init split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
return 0;
}
static const struct mm_walk_ops split_to_ptes_ops __initconst = {
static const struct mm_walk_ops split_to_ptes_ops = {
.pud_entry = split_to_ptes_pud_entry,
.pmd_entry = split_to_ptes_pmd_entry,
.pte_entry = split_to_ptes_pte_entry,
};
static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
{
int ret;
arch_enter_lazy_mmu_mode();
ret = walk_kernel_page_table_range_lockless(start, end,
&split_to_ptes_ops, NULL, &gfp);
arch_leave_lazy_mmu_mode();
return ret;
}
static bool linear_map_requires_bbml2 __initdata;
u32 idmap_kpti_bbml2_flag;
@ -847,11 +881,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
* PTE. The kernel alias remains static throughout runtime so
* can continue to be safely mapped with large mappings.
*/
ret = walk_kernel_page_table_range_lockless(lstart, kstart,
&split_to_ptes_ops, NULL, NULL);
ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
if (!ret)
ret = walk_kernel_page_table_range_lockless(kend, lend,
&split_to_ptes_ops, NULL, NULL);
ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
if (ret)
panic("Failed to split linear map\n");
flush_tlb_kernel_range(lstart, lend);
@ -1002,6 +1034,33 @@ static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
__kfence_pool = phys_to_virt(kfence_pool);
}
bool arch_kfence_init_pool(void)
{
unsigned long start = (unsigned long)__kfence_pool;
unsigned long end = start + KFENCE_POOL_SIZE;
int ret;
/* Exit early if we know the linear map is already pte-mapped. */
if (!split_leaf_mapping_possible())
return true;
/* Kfence pool is already pte-mapped for the early init case. */
if (kfence_early_init)
return true;
mutex_lock(&pgtable_split_lock);
ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
mutex_unlock(&pgtable_split_lock);
/*
* Since the system supports bbml2_noabort, tlb invalidation is not
* required here; the pgtable mappings have been split to pte but larger
* entries may safely linger in the TLB.
*/
return !ret;
}
#else /* CONFIG_KFENCE */
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
@ -1009,16 +1068,6 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
static inline bool force_pte_mapping(void)
{
bool bbml2 = system_capabilities_finalized() ?
system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();
return (!bbml2 && (rodata_full || arm64_kfence_can_set_direct_map() ||
is_realm_world())) ||
debug_pagealloc_enabled();
}
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);

View File

@ -177,8 +177,10 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return ret;
}
ret = paths_from_inode(inum, ipath);
if (ret < 0)
if (ret < 0) {
btrfs_put_root(local_root);
goto err;
}
/*
* We deliberately ignore the bit ipath might have been too small to

View File

@ -2203,6 +2203,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start,
&length, &bioc, NULL, NULL);
if (ret < 0) {
bio_put(bio);
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
goto out;
@ -2212,6 +2213,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
btrfs_put_bioc(bioc);
if (!rbio) {
ret = -ENOMEM;
bio_put(bio);
btrfs_bio_counter_dec(fs_info);
goto out;
}

View File

@ -7122,7 +7122,7 @@ log_extents:
* a power failure unless the log was synced as part of an fsync
* against any other unrelated inode.
*/
if (inode_only != LOG_INODE_EXISTS)
if (!ctx->logging_new_name && inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);

View File

@ -1317,6 +1317,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
if (!btrfs_dev_is_sequential(device, info->physical)) {
up_read(&dev_replace->rwsem);
info->alloc_offset = WP_CONVENTIONAL;
info->capacity = device->zone_info->zone_size;
return 0;
}
@ -1522,6 +1523,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
u64 stripe_nr = 0, stripe_offset = 0;
u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@ -1529,28 +1532,26 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
return -EINVAL;
}
if (last_alloc) {
u32 factor = map->num_stripes;
stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
}
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
u64 stripe_nr, full_stripe_nr;
u64 stripe_offset;
int stripe_index;
stripe_nr = div64_u64(last_alloc, map->stripe_size);
stripe_offset = stripe_nr * map->stripe_size;
full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
zone_info[i].alloc_offset =
full_stripe_nr * map->stripe_size;
zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > i)
zone_info[i].alloc_offset += map->stripe_size;
zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == i)
zone_info[i].alloc_offset +=
(last_alloc - stripe_offset);
zone_info[i].alloc_offset += stripe_offset;
}
if (test_bit(0, active) != test_bit(i, active)) {
@ -1574,6 +1575,8 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
u64 stripe_nr = 0, stripe_offset = 0;
u32 stripe_index = 0;
if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@ -1581,6 +1584,14 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
return -EINVAL;
}
if (last_alloc) {
u32 factor = map->num_stripes / map->sub_stripes;
stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
}
for (int i = 0; i < map->num_stripes; i++) {
if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
@ -1594,26 +1605,12 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
u64 stripe_nr, full_stripe_nr;
u64 stripe_offset;
int stripe_index;
stripe_nr = div64_u64(last_alloc, map->stripe_size);
stripe_offset = stripe_nr * map->stripe_size;
full_stripe_nr = div_u64(stripe_nr,
map->num_stripes / map->sub_stripes);
div_u64_rem(stripe_nr,
(map->num_stripes / map->sub_stripes),
&stripe_index);
zone_info[i].alloc_offset =
full_stripe_nr * map->stripe_size;
zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
if (stripe_index > (i / map->sub_stripes))
zone_info[i].alloc_offset += map->stripe_size;
zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
else if (stripe_index == (i / map->sub_stripes))
zone_info[i].alloc_offset +=
(last_alloc - stripe_offset);
zone_info[i].alloc_offset += stripe_offset;
}
if ((i % map->sub_stripes) == 0) {
@ -1683,8 +1680,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
if (num_conventional > 0) {
/* Zone capacity is always zone size in emulation */
cache->zone_capacity = cache->length;
ret = calculate_alloc_pointer(cache, &last_alloc, new);
if (ret) {
btrfs_err(fs_info,
@ -1693,6 +1688,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
} else if (map->num_stripes == num_conventional) {
cache->alloc_offset = last_alloc;
cache->zone_capacity = cache->length;
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
goto out;
}

View File

@ -2768,7 +2768,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (sci->sc_task) {
wake_up(&sci->sc_wait_daemon);
kthread_stop(sci->sc_task);
if (kthread_stop(sci->sc_task)) {
spin_lock(&sci->sc_state_lock);
sci->sc_task = NULL;
timer_shutdown_sync(&sci->sc_timer);
spin_unlock(&sci->sc_state_lock);
}
}
spin_lock(&sci->sc_state_lock);

View File

@ -698,6 +698,12 @@ void pde_put(struct proc_dir_entry *pde)
}
}
static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent)
{
rb_erase(&pde->subdir_node, &parent->subdir);
RB_CLEAR_NODE(&pde->subdir_node);
}
/*
* Remove a /proc entry and free it if it's not currently in use.
*/
@ -720,7 +726,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
WARN(1, "removing permanent /proc entry '%s'", de->name);
de = NULL;
} else {
rb_erase(&de->subdir_node, &parent->subdir);
pde_erase(de, parent);
if (S_ISDIR(de->mode))
parent->nlink--;
}
@ -764,7 +770,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
root->parent->name, root->name);
return -EINVAL;
}
rb_erase(&root->subdir_node, &parent->subdir);
pde_erase(root, parent);
de = root;
while (1) {
@ -776,7 +782,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
next->parent->name, next->name);
return -EINVAL;
}
rb_erase(&next->subdir_node, &de->subdir);
pde_erase(next, de);
de = next;
continue;
}

View File

@ -7,6 +7,7 @@
#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
#include <linux/cleanup.h>
#include <linux/sched.h>
struct vm_area_struct;
@ -463,4 +464,6 @@ static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
/* This should be paired with folio_put() rather than free_contig_range(). */
#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))
DEFINE_FREE(free_page, void *, free_page((unsigned long)_T))
#endif /* __LINUX_GFP_H */

View File

@ -376,45 +376,30 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
struct list_head *list);
/*
* try_folio_split - try to split a @folio at @page using non uniform split.
* try_folio_split_to_order - try to split a @folio at @page to @new_order using
* non uniform split.
* @folio: folio to be split
* @page: split to order-0 at the given page
* @list: store the after-split folios
* @page: split to @new_order at the given page
* @new_order: the target split order
*
* Try to split a @folio at @page using non uniform split to order-0, if
* non uniform split is not supported, fall back to uniform split.
* Try to split a @folio at @page using non uniform split to @new_order, if
* non uniform split is not supported, fall back to uniform split. After-split
* folios are put back to LRU list. Use min_order_for_split() to get the lower
* bound of @new_order.
*
* Return: 0: split is successful, otherwise split failed.
*/
static inline int try_folio_split(struct folio *folio, struct page *page,
struct list_head *list)
static inline int try_folio_split_to_order(struct folio *folio,
struct page *page, unsigned int new_order)
{
int ret = min_order_for_split(folio);
if (ret < 0)
return ret;
if (!non_uniform_split_supported(folio, 0, false))
return split_huge_page_to_list_to_order(&folio->page, list,
ret);
return folio_split(folio, ret, page, list);
if (!non_uniform_split_supported(folio, new_order, /* warns= */ false))
return split_huge_page_to_list_to_order(&folio->page, NULL,
new_order);
return folio_split(folio, new_order, page, NULL);
}
static inline int split_huge_page(struct page *page)
{
struct folio *folio = page_folio(page);
int ret = min_order_for_split(folio);
if (ret < 0)
return ret;
/*
* split_huge_page() locks the page before splitting and
* expects the same page that has been split to be locked when
* returned. split_folio(page_folio(page)) cannot be used here
* because it converts the page to folio and passes the head
* page to be split.
*/
return split_huge_page_to_list_to_order(page, NULL, ret);
return split_huge_page_to_list_to_order(page, NULL, 0);
}
void deferred_split_folio(struct folio *folio, bool partially_mapped);
@ -597,14 +582,20 @@ static inline int split_huge_page(struct page *page)
return -EINVAL;
}
static inline int min_order_for_split(struct folio *folio)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
}
static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;
}
static inline int try_folio_split(struct folio *folio, struct page *page,
struct list_head *list)
static inline int try_folio_split_to_order(struct folio *folio,
struct page *page, unsigned int new_order)
{
VM_WARN_ON_ONCE_FOLIO(1, folio);
return -EINVAL;

View File

@ -109,6 +109,15 @@ config KEXEC_HANDOVER
to keep data or state alive across the kexec. For this to work,
both source and target kernels need to have this option enabled.
config KEXEC_HANDOVER_DEBUG
bool "Enable Kexec Handover debug checks"
depends on KEXEC_HANDOVER
help
This option enables extra sanity checks for the Kexec Handover
subsystem. Since KHO performance is crucial in live update
scenarios and the extra code might add overhead, it is only
optionally enabled.
config CRASH_DUMP
bool "kernel crash dumps"
default ARCH_DEFAULT_CRASH_DUMP

View File

@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o
obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/

View File

@ -18,7 +18,9 @@
#include <linux/mm.h>
#include "gcov.h"
#if (__GNUC__ >= 14)
#if (__GNUC__ >= 15)
#define GCOV_COUNTERS 10
#elif (__GNUC__ >= 14)
#define GCOV_COUNTERS 9
#elif (__GNUC__ >= 10)
#define GCOV_COUNTERS 8

View File

@ -8,6 +8,7 @@
#define pr_fmt(fmt) "KHO: " fmt
#include <linux/cleanup.h>
#include <linux/cma.h>
#include <linux/count_zeros.h>
#include <linux/debugfs.h>
@ -22,6 +23,7 @@
#include <asm/early_ioremap.h>
#include "kexec_handover_internal.h"
/*
* KHO is tightly coupled with mm init and needs access to some of mm
* internal APIs.
@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable);
* Keep track of memory that is to be preserved across KHO.
*
* The serializing side uses two levels of xarrays to manage chunks of per-order
* 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a
* 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations
* each bitmap will cover 16M of address space. Thus, for 16G of memory at most
* 512K of bitmap memory will be needed for order 0.
* PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order
* of a 8TB system would fit inside a single 4096 byte bitmap. For order 0
* allocations each bitmap will cover 128M of address space. Thus, for 16G of
* memory at most 512K of bitmap memory will be needed for order 0.
*
* This approach is fully incremental, as the serialization progresses folios
* can continue be aggregated to the tracker. The final step, immediately prior
@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable);
* successor kernel to parse.
*/
#define PRESERVE_BITS (512 * 8)
#define PRESERVE_BITS (PAGE_SIZE * 8)
struct kho_mem_phys_bits {
DECLARE_BITMAP(preserve, PRESERVE_BITS);
};
static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE);
struct kho_mem_phys {
/*
* Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
@ -131,28 +135,28 @@ static struct kho_out kho_out = {
.finalized = false,
};
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
static void *xa_load_or_alloc(struct xarray *xa, unsigned long index)
{
void *elm, *res;
void *res = xa_load(xa, index);
elm = xa_load(xa, index);
if (elm)
return elm;
if (res)
return res;
void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);
elm = kzalloc(sz, GFP_KERNEL);
if (!elm)
return ERR_PTR(-ENOMEM);
if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE)))
return ERR_PTR(-EINVAL);
res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
if (xa_is_err(res))
res = ERR_PTR(xa_err(res));
if (res) {
kfree(elm);
return ERR_PTR(xa_err(res));
else if (res)
return res;
}
return elm;
return no_free_ptr(elm);
}
static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn,
const unsigned long pfn_high = pfn >> order;
physxa = xa_load(&track->orders, order);
if (!physxa)
continue;
if (WARN_ON_ONCE(!physxa))
return;
bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
if (!bits)
continue;
if (WARN_ON_ONCE(!bits))
return;
clear_bit(pfn_high % PRESERVE_BITS, bits->preserve);
@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
}
}
bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
sizeof(*bits));
bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS);
if (IS_ERR(bits))
return PTR_ERR(bits);
@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
unsigned long order)
{
struct khoser_mem_chunk *chunk;
struct khoser_mem_chunk *chunk __free(free_page) = NULL;
chunk = kzalloc(PAGE_SIZE, GFP_KERNEL);
chunk = (void *)get_zeroed_page(GFP_KERNEL);
if (!chunk)
return NULL;
return ERR_PTR(-ENOMEM);
if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE)))
return ERR_PTR(-EINVAL);
chunk->hdr.order = order;
if (cur_chunk)
KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
return chunk;
return no_free_ptr(chunk);
}
static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser)
struct khoser_mem_chunk *chunk = NULL;
struct kho_mem_phys *physxa;
unsigned long order;
int err = -ENOMEM;
xa_for_each(&ser->track.orders, order, physxa) {
struct kho_mem_phys_bits *bits;
unsigned long phys;
chunk = new_chunk(chunk, order);
if (!chunk)
if (IS_ERR(chunk)) {
err = PTR_ERR(chunk);
goto err_free;
}
if (!first_chunk)
first_chunk = chunk;
@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser)
if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
chunk = new_chunk(chunk, order);
if (!chunk)
if (IS_ERR(chunk)) {
err = PTR_ERR(chunk);
goto err_free;
}
}
elm = &chunk->bitmaps[chunk->hdr.num_elms];
@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser)
err_free:
kho_mem_ser_free(first_chunk);
return -ENOMEM;
return err;
}
static void __init deserialize_bitmap(unsigned int order,
@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt)
* area for early allocations that happen before page allocator is
* initialized.
*/
static struct kho_scratch *kho_scratch;
static unsigned int kho_scratch_cnt;
struct kho_scratch *kho_scratch;
unsigned int kho_scratch_cnt;
/*
* The scratch areas are scaled by default as percent of memory allocated from
@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio)
const unsigned int order = folio_order(folio);
struct kho_mem_track *track = &kho_out.ser.track;
if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order)))
return -EINVAL;
return __kho_preserve_order(track, pfn, order);
}
EXPORT_SYMBOL_GPL(kho_preserve_folio);
@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages)
unsigned long failed_pfn = 0;
int err = 0;
if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT,
nr_pages << PAGE_SHIFT))) {
return -EINVAL;
}
while (pfn < end_pfn) {
const unsigned int order =
min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn));
@ -862,16 +882,17 @@ err_free:
return NULL;
}
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk)
static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk,
unsigned short order)
{
struct kho_mem_track *track = &kho_out.ser.track;
unsigned long pfn = PHYS_PFN(virt_to_phys(chunk));
__kho_unpreserve(track, pfn, pfn + 1);
for (int i = 0; chunk->phys[i]; i++) {
for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
pfn = PHYS_PFN(chunk->phys[i]);
__kho_unpreserve(track, pfn, pfn + 1);
__kho_unpreserve(track, pfn, pfn + (1 << order));
}
}
@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc)
while (chunk) {
struct kho_vmalloc_chunk *tmp = chunk;
kho_vmalloc_unpreserve_chunk(chunk);
kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order);
chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
free_page((unsigned long)tmp);
@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation)
while (chunk) {
struct page *page;
for (int i = 0; chunk->phys[i]; i++) {
for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) {
phys_addr_t phys = chunk->phys[i];
if (idx + contig_pages > total_pages)

View File

@ -0,0 +1,25 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec_handover_debug.c - kexec handover optional debug functionality
* Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com>
*/
#define pr_fmt(fmt) "KHO: " fmt
#include "kexec_handover_internal.h"
bool kho_scratch_overlap(phys_addr_t phys, size_t size)
{
phys_addr_t scratch_start, scratch_end;
unsigned int i;
for (i = 0; i < kho_scratch_cnt; i++) {
scratch_start = kho_scratch[i].addr;
scratch_end = kho_scratch[i].addr + kho_scratch[i].size;
if (phys < scratch_end && (phys + size) > scratch_start)
return true;
}
return false;
}

View File

@ -0,0 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H
#define LINUX_KEXEC_HANDOVER_INTERNAL_H
#include <linux/kexec_handover.h>
#include <linux/types.h>
extern struct kho_scratch *kho_scratch;
extern unsigned int kho_scratch_cnt;
#ifdef CONFIG_KEXEC_HANDOVER_DEBUG
bool kho_scratch_overlap(phys_addr_t phys, size_t size);
#else
static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size)
{
return false;
}
#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */
#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */

View File

@ -64,6 +64,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>
#define TP_FCT tracepoint_string(__func__)
/*
* Kernel pointer hashing renders much of the maple tree dump useless as tagged
* pointers get hashed to arbitrary values.
@ -2756,7 +2758,7 @@ static inline void mas_rebalance(struct ma_state *mas,
MA_STATE(l_mas, mas->tree, mas->index, mas->last);
MA_STATE(r_mas, mas->tree, mas->index, mas->last);
trace_ma_op(__func__, mas);
trace_ma_op(TP_FCT, mas);
/*
* Rebalancing occurs if a node is insufficient. Data is rebalanced
@ -2997,7 +2999,7 @@ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node)
MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);
trace_ma_op(__func__, mas);
trace_ma_op(TP_FCT, mas);
mast.l = &l_mas;
mast.r = &r_mas;
@ -3172,7 +3174,7 @@ static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
return false;
}
trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry);
return true;
}
@ -3416,7 +3418,7 @@ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
* of data may happen.
*/
mas = wr_mas->mas;
trace_ma_op(__func__, mas);
trace_ma_op(TP_FCT, mas);
if (unlikely(!mas->index && mas->last == ULONG_MAX))
return mas_new_root(mas, wr_mas->entry);
@ -3552,7 +3554,7 @@ done:
} else {
memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
}
trace_ma_write(__func__, mas, 0, wr_mas->entry);
trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
mas_update_gap(mas);
mas->end = new_end;
return;
@ -3596,7 +3598,7 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
mas->offset++; /* Keep mas accurate. */
}
trace_ma_write(__func__, mas, 0, wr_mas->entry);
trace_ma_write(TP_FCT, mas, 0, wr_mas->entry);
/*
* Only update gap when the new entry is empty or there is an empty
* entry in the original two ranges.
@ -3717,7 +3719,7 @@ static inline void mas_wr_append(struct ma_wr_state *wr_mas,
mas_update_gap(mas);
mas->end = new_end;
trace_ma_write(__func__, mas, new_end, wr_mas->entry);
trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry);
return;
}
@ -3731,7 +3733,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
struct maple_big_node b_node;
trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry);
memset(&b_node, 0, sizeof(struct maple_big_node));
mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
mas_commit_b_node(wr_mas, &b_node);
@ -5062,7 +5064,7 @@ void *mas_store(struct ma_state *mas, void *entry)
{
MA_WR_STATE(wr_mas, mas, entry);
trace_ma_write(__func__, mas, 0, entry);
trace_ma_write(TP_FCT, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
if (MAS_WARN_ON(mas, mas->index > mas->last))
pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
@ -5163,7 +5165,7 @@ void mas_store_prealloc(struct ma_state *mas, void *entry)
}
store:
trace_ma_write(__func__, mas, 0, entry);
trace_ma_write(TP_FCT, mas, 0, entry);
mas_wr_store_entry(&wr_mas);
MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
mas_destroy(mas);
@ -5882,7 +5884,7 @@ void *mtree_load(struct maple_tree *mt, unsigned long index)
MA_STATE(mas, mt, index, index);
void *entry;
trace_ma_read(__func__, &mas);
trace_ma_read(TP_FCT, &mas);
rcu_read_lock();
retry:
entry = mas_start(&mas);
@ -5925,7 +5927,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index,
MA_STATE(mas, mt, index, last);
int ret = 0;
trace_ma_write(__func__, &mas, 0, entry);
trace_ma_write(TP_FCT, &mas, 0, entry);
if (WARN_ON_ONCE(xa_is_advanced(entry)))
return -EINVAL;
@ -6148,7 +6150,7 @@ void *mtree_erase(struct maple_tree *mt, unsigned long index)
void *entry = NULL;
MA_STATE(mas, mt, index, index);
trace_ma_op(__func__, &mas);
trace_ma_op(TP_FCT, &mas);
mtree_lock(mt);
entry = mas_erase(&mas);
@ -6485,7 +6487,7 @@ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
unsigned long copy = *index;
#endif
trace_ma_read(__func__, &mas);
trace_ma_read(TP_FCT, &mas);
if ((*index) > max)
return NULL;
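
The recurring substitution in this file replaces __func__ with TP_FCT, i.e. tracepoint_string(__func__): tracepoint_string() places the literal in the __tracepoint_str section so the pointer recorded in the ring buffer can be resolved back to text by tracing tooling, whereas a bare __func__ pointer is not exported that way. A minimal sketch of the idiom (the caller is hypothetical; the define mirrors the one added above):

#define TP_FCT tracepoint_string(__func__)

/* Hypothetical caller: the event records a resolvable string pointer. */
static void my_mas_op(struct ma_state *mas)
{
        trace_ma_op(TP_FCT, mas);
}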


@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us,
static struct damon_ctx *damon_stat_context;
static unsigned long damon_stat_last_refresh_jiffies;
static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
{
struct damon_target *t;
@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
static int damon_stat_damon_call_fn(void *data)
{
struct damon_ctx *c = data;
static unsigned long last_refresh_jiffies;
/* avoid unnecessarily frequent stat updates */
if (time_before_eq(jiffies, last_refresh_jiffies +
if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies +
msecs_to_jiffies(5 * MSEC_PER_SEC)))
return 0;
last_refresh_jiffies = jiffies;
damon_stat_last_refresh_jiffies = jiffies;
aggr_interval_us = c->attrs.aggr_interval;
damon_stat_set_estimated_memory_bandwidth(c);
@ -210,6 +211,8 @@ static int damon_stat_start(void)
err = damon_start(&damon_stat_context, 1, true);
if (err)
return err;
damon_stat_last_refresh_jiffies = jiffies;
call_control.data = damon_stat_context;
return damon_call(damon_stat_context, &call_control);
}
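
Both DAMON fixes in this series follow the same shape: the refresh timestamp moves from a function-local static to file scope so it can be initialized when monitoring starts, rather than first being compared while still holding a stale value from a previous run. The throttle idiom itself, sketched with hypothetical names:

/* Hypothetical sketch of the 5s refresh throttle used above. */
static unsigned long my_last_refresh_jiffies;   /* set to jiffies on start */

static bool my_should_refresh(void)
{
        if (time_before_eq(jiffies, my_last_refresh_jiffies +
                           msecs_to_jiffies(5 * MSEC_PER_SEC)))
                return false;   /* refreshed less than 5s ago */
        my_last_refresh_jiffies = jiffies;
        return true;
}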


@ -1552,16 +1552,17 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ctx;
}
static unsigned long damon_sysfs_next_update_jiffies;
static int damon_sysfs_repeat_call_fn(void *data)
{
struct damon_sysfs_kdamond *sysfs_kdamond = data;
static unsigned long next_update_jiffies;
if (!sysfs_kdamond->refresh_ms)
return 0;
if (time_before(jiffies, next_update_jiffies))
if (time_before(jiffies, damon_sysfs_next_update_jiffies))
return 0;
next_update_jiffies = jiffies +
damon_sysfs_next_update_jiffies = jiffies +
msecs_to_jiffies(sysfs_kdamond->refresh_ms);
if (!mutex_trylock(&damon_sysfs_lock))
@ -1607,6 +1608,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
}
kdamond->damon_ctx = ctx;
damon_sysfs_next_update_jiffies =
jiffies + msecs_to_jiffies(kdamond->refresh_ms);
repeat_call_control->fn = damon_sysfs_repeat_call_fn;
repeat_call_control->data = kdamond;
repeat_call_control->repeat = true;


@ -3681,7 +3681,8 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
unsigned long *rss, unsigned short *mmap_miss)
unsigned long *rss, unsigned short *mmap_miss,
bool can_map_large)
{
unsigned int ref_from_caller = 1;
vm_fault_t ret = 0;
@ -3696,7 +3697,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
* The folio must not cross a VMA or page table boundary.
*/
addr0 = addr - start * PAGE_SIZE;
if (folio_within_vma(folio, vmf->vma) &&
if (can_map_large && folio_within_vma(folio, vmf->vma) &&
(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
vmf->pte -= start;
page -= start;
@ -3811,13 +3812,27 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
unsigned long rss = 0;
unsigned int nr_pages = 0, folio_type;
unsigned short mmap_miss = 0, mmap_miss_saved;
bool can_map_large;
rcu_read_lock();
folio = next_uptodate_folio(&xas, mapping, end_pgoff);
if (!folio)
goto out;
if (filemap_map_pmd(vmf, folio, start_pgoff)) {
file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
end_pgoff = min(end_pgoff, file_end);
/*
* Do not allow mapping with PTEs beyond i_size, or with a PMD
* across i_size, to preserve SIGBUS semantics.
*
* Make an exception for shmem/tmpfs, which has long been
* intentionally mapped with PMDs across i_size.
*/
can_map_large = shmem_mapping(mapping) ||
file_end >= folio_next_index(folio);
if (can_map_large && filemap_map_pmd(vmf, folio, start_pgoff)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@ -3830,10 +3845,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
if (end_pgoff > file_end)
end_pgoff = file_end;
folio_type = mm_counter_file(folio);
do {
unsigned long end;
@ -3850,7 +3861,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
else
ret |= filemap_map_folio_range(vmf, folio,
xas.xa_index - folio->index, addr,
nr_pages, &rss, &mmap_miss);
nr_pages, &rss, &mmap_miss,
can_map_large);
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
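
A worked example makes the new predicate concrete; the same check reappears, inverted as needs_fallback, in the fault-path hunk further below. Assuming 4 KiB pages and an i_size just short of five pages:

/* Hypothetical numbers: i_size = 5 * PAGE_SIZE - 100
 *   file_end = DIV_ROUND_UP(i_size, PAGE_SIZE) - 1 = 4 (last valid index)
 * For an order-4 folio at index 0, folio_next_index() == 16, so
 *   file_end >= folio_next_index(folio) is false
 * and a non-shmem mapping must take the per-PTE path: mapping a PMD
 * across i_size would swallow the SIGBUS for faults past EOF.
 */
bool can_map_large = shmem_mapping(mapping) ||
                     file_end >= folio_next_index(folio);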


@ -214,7 +214,8 @@ retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
return true;
zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) &
~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_folio) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
@ -3263,6 +3264,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
caller_pins;
}
static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
{
for (; nr_pages; page++, nr_pages--)
if (PageHWPoison(page))
return true;
return false;
}
/*
* It splits @folio into @new_order folios and copies the @folio metadata to
* all the resulting folios.
@ -3270,17 +3279,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
static void __split_folio_to_order(struct folio *folio, int old_order,
int new_order)
{
/* Scan for poisoned pages when splitting a poisoned folio into large folios */
const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
long new_nr_pages = 1 << new_order;
long nr_pages = 1 << old_order;
long i;
folio_clear_has_hwpoisoned(folio);
/* Check the first new_nr_pages pages, since the loop below skips them */
if (handle_hwpoison &&
page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
folio_set_has_hwpoisoned(folio);
/*
* Skip the first new_nr_pages, since the new folio made from them
* already has all the flags from the original folio.
*/
for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
struct page *new_head = &folio->page + i;
/*
* Careful: new_folio is not a "real" folio before we cleared PageTail.
* Don't pass it around before clear_compound_head().
@ -3322,6 +3338,10 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
if (handle_hwpoison &&
page_range_has_hwpoisoned(new_head, new_nr_pages))
folio_set_has_hwpoisoned(new_folio);
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
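
The arithmetic behind the two scans is worth spelling out; a worked example under assumed orders:

/* Worked example: old_order = 9, new_order = 3
 *   nr_pages     = 1 << 9 = 512   (pages in the folio being split)
 *   new_nr_pages = 1 << 3 = 8     (pages per resulting folio)
 * The split yields 512 / 8 = 64 order-3 folios; each 8-page range is
 * scanned so PG_has_hwpoisoned is kept only on the new folios that
 * actually contain a poisoned page, instead of being cleared wholesale.
 */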
@ -3422,8 +3442,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
if (folio_test_anon(folio))
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
folio_clear_has_hwpoisoned(folio);
/*
* split to new_order one order at a time. For uniform split,
* folio is split to new_order directly.
@ -3653,8 +3671,6 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
min_order = mapping_min_folio_order(folio->mapping);
if (new_order < min_order) {
VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
min_order);
ret = -EINVAL;
goto out;
}
@ -3986,12 +4002,7 @@ int min_order_for_split(struct folio *folio)
int split_folio_to_list(struct folio *folio, struct list_head *list)
{
int ret = min_order_for_split(folio);
if (ret < 0)
return ret;
return split_huge_page_to_list_to_order(&folio->page, list, ret);
return split_huge_page_to_list_to_order(&folio->page, list, 0);
}
/*


@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
/* Don't sleep. */
flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
handle = stack_depot_save(entries, nr_entries, flags);
return stack_depot_set_extra_bits(handle, extra);
}


@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object)
if (s->ctor)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
kmsan_internal_poison_memory(object, s->object_size,
GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
kmsan_internal_poison_memory((void *)ptr, page_size(page),
GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}


@ -208,7 +208,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(page_address(page), page_size(page),
GFP_KERNEL,
GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
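
All three kmsan call sites now strip the reclaim bits from GFP_KERNEL before poisoning, matching the mask that the removed lines in the first hunk used to build by hand; __GFP_RECLAIM is the kernel's shorthand for both reclaim flags, so the resulting mask cannot sleep. A sketch of the equivalence (helper name hypothetical):

/* Sketch: both spellings yield the same non-sleeping allocation mask. */
static gfp_t my_nosleep_gfp(void)
{
        BUILD_BUG_ON(__GFP_RECLAIM !=
                     (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM));
        return GFP_KERNEL & ~(__GFP_RECLAIM);
}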

mm/ksm.c

@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
struct ksm_next_page_arg {
struct folio *folio;
struct page *page;
unsigned long addr;
};
static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct ksm_next_page_arg *private = walk->private;
struct vm_area_struct *vma = walk->vma;
pte_t *start_ptep = NULL, *ptep, pte;
struct mm_struct *mm = walk->mm;
struct folio *folio;
struct page *page;
spinlock_t *ptl;
pmd_t pmd;
if (ksm_test_exit(mm))
return 0;
cond_resched();
pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
ptl = pmd_lock(mm, pmdp);
pmd = pmdp_get(pmdp);
if (!pmd_present(pmd)) {
goto not_found_unlock;
} else if (pmd_leaf(pmd)) {
page = vm_normal_page_pmd(vma, addr, pmd);
if (!page)
goto not_found_unlock;
folio = page_folio(page);
if (folio_is_zone_device(folio) || !folio_test_anon(folio))
goto not_found_unlock;
page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
goto found_unlock;
}
spin_unlock(ptl);
}
start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
if (!start_ptep)
return 0;
for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
pte = ptep_get(ptep);
if (!pte_present(pte))
continue;
page = vm_normal_page(vma, addr, pte);
if (!page)
continue;
folio = page_folio(page);
if (folio_is_zone_device(folio) || !folio_test_anon(folio))
continue;
goto found_unlock;
}
not_found_unlock:
spin_unlock(ptl);
if (start_ptep)
pte_unmap(start_ptep);
return 0;
found_unlock:
folio_get(folio);
spin_unlock(ptl);
if (start_ptep)
pte_unmap(start_ptep);
private->page = page;
private->folio = folio;
private->addr = addr;
return 1;
}
static struct mm_walk_ops ksm_next_page_ops = {
.pmd_entry = ksm_next_page_pmd_entry,
.walk_lock = PGWALK_RDLOCK,
};
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@ -2542,21 +2631,27 @@ next_mm:
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
struct ksm_next_page_arg ksm_next_page_arg;
struct page *tmp_page = NULL;
struct folio_walk fw;
struct folio *folio;
if (ksm_test_exit(mm))
break;
folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
if (folio) {
if (!folio_is_zone_device(folio) &&
folio_test_anon(folio)) {
folio_get(folio);
tmp_page = fw.page;
}
folio_walk_end(&fw, vma);
int found;
found = walk_page_range_vma(vma, ksm_scan.address,
vma->vm_end,
&ksm_next_page_ops,
&ksm_next_page_arg);
if (found > 0) {
folio = ksm_next_page_arg.folio;
tmp_page = ksm_next_page_arg.page;
ksm_scan.address = ksm_next_page_arg.addr;
} else {
VM_WARN_ON_ONCE(found < 0);
ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
if (tmp_page) {
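
The new walker leans on a pagewalk convention that is easy to miss: a positive return from a ->pmd_entry callback stops the walk, and walk_page_range_vma() propagates that value to the caller, which is how found > 0 is told apart from the not-found and error cases above. A minimal sketch under that assumption (names hypothetical):

/* Hypothetical walker: report the first address with a present PMD. */
static int first_present_pmd(pmd_t *pmdp, unsigned long addr,
                             unsigned long end, struct mm_walk *walk)
{
        unsigned long *out = walk->private;

        if (pmd_present(pmdp_get_lockless(pmdp))) {
                *out = addr;
                return 1;       /* found: stop walking */
        }
        return 0;               /* keep walking */
}

static const struct mm_walk_ops first_present_ops = {
        .pmd_entry = first_present_pmd,
        .walk_lock = PGWALK_RDLOCK,
};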


@ -65,6 +65,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
@ -5501,8 +5502,25 @@ fallback:
return ret;
}
if (!needs_fallback && vma->vm_file) {
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t file_end;
file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
/*
* Do not allow mapping with PTEs beyond i_size, or with a PMD
* across i_size, to preserve SIGBUS semantics.
*
* Make an exception for shmem/tmpfs, which has long been
* intentionally mapped with PMDs across i_size.
*/
needs_fallback = !shmem_mapping(mapping) &&
file_end < folio_next_index(folio);
}
if (pmd_none(*vmf->pmd)) {
if (folio_test_pmd_mappable(folio)) {
if (!needs_fallback && folio_test_pmd_mappable(folio)) {
ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;


@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename,
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
tablename, 1UL << log2qty, get_order(size), size,
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
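
The one-liner matters for non-power-of-two sizes: ilog2() rounds down, while get_order() rounds the page count up to what the allocator actually returns. A worked example, assuming 4 KiB pages:

/* size = 3 MiB, PAGE_SHIFT = 12:
 *   ilog2(3 << 20) - PAGE_SHIFT = 21 - 12 = 9    (order 9 = 2 MiB, low)
 *   get_order(3 << 20)          = 10             (order 10 = 4 MiB)
 * get_order() matches the order the hash table was allocated with.
 */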


@ -187,7 +187,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
if (!folio || !folio_test_large(folio))
return 1;
return folio_pte_batch(folio, ptep, pte, max_nr);
return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
}
static int move_ptes(struct pagetable_move_control *pmc,
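
folio_pte_batch_flags() is the flag-taking variant of folio_pte_batch(); FPB_RESPECT_WRITE ends a batch as soon as the writable bit changes between neighbouring PTEs, so moving PTEs in bulk cannot smear one entry's write permission across a partially write-protected range. Sketched with hypothetical PTE states:

/* Sketch: three PTEs of one large folio, the last one write-protected
 * (for example still COW-shared). With FPB_RESPECT_WRITE the batch is
 * cut at the permission change:
 *
 *   pte[0] RW  \__ batched and moved together
 *   pte[1] RW  /
 *   pte[2] RO  -- separate batch, keeps its read-only bit
 */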


@ -82,13 +82,13 @@ retry:
__folio_mark_uptodate(folio);
err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
folio_put(folio);
/*
* If a split of the large page was required, it already
* happened when we marked the page invalid, which
* guarantees that this call won't fail
*/
set_direct_map_default_noflush(folio_page(folio, 0));
folio_put(folio);
if (err == -EEXIST)
goto retry;


@ -1882,6 +1882,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
pgoff_t aligned_index;
long pages;
int error, order;
@ -1895,10 +1896,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
order = highest_order(suitable_orders);
while (suitable_orders) {
pages = 1UL << order;
index = round_down(index, pages);
folio = shmem_alloc_folio(gfp, order, info, index);
if (folio)
aligned_index = round_down(index, pages);
folio = shmem_alloc_folio(gfp, order, info, aligned_index);
if (folio) {
index = aligned_index;
goto allocated;
}
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
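
The shmem fix addresses a classic pattern: the caller's index was rounded down before the speculative large-order allocation was known to succeed, so a failed attempt left a mutated index behind for the next, smaller-order retry. A hedged sketch of the corrected loop shape, reusing the surrounding function's variables (try_alloc() is a hypothetical stand-in for shmem_alloc_folio()):

/* Sketch: commit the rounded-down index only once the order succeeds. */
while (suitable_orders) {
        pages = 1UL << order;
        aligned_index = round_down(index, pages);
        if (try_alloc(order, aligned_index)) {  /* hypothetical helper */
                index = aligned_index;          /* commit on success */
                break;
        }
        /* failure: 'index' is untouched for the smaller-order retry */
        order = next_order(&suitable_orders, order);
}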


@ -2046,7 +2046,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
if (slab_exts) {
unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
obj_exts_slab, obj_exts);
/* codetag should be NULL */
if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
return;
/* codetag should be NULL here */
WARN_ON(slab_exts[offs].ref.ct);
set_codetag_empty(&slab_exts[offs].ref);
}


@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
return 0;
}
static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at,
unsigned long min_order)
{
enum ttu_flags ttu_flags =
TTU_SYNC |
TTU_SPLIT_HUGE_PMD |
TTU_IGNORE_MLOCK;
int ret;
ret = try_folio_split_to_order(folio, split_at, min_order);
/*
* If the split fails, unmap the folio, so it will be refaulted
* with PTEs to respect SIGBUS semantics.
*
* Make an exception for shmem/tmpfs, which has long been
* intentionally mapped with PMDs across i_size.
*/
if (ret && !shmem_mapping(folio->mapping)) {
try_to_unmap(folio, ttu_flags);
WARN_ON(folio_mapped(folio));
}
return ret;
}
/*
* Handle partial folios. The folio may be entirely within the
* range if a split has raced with us. If not, we zero the part of the
@ -194,6 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
unsigned int min_order;
if (pos < start)
offset = start - pos;
@ -223,8 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_test_large(folio))
return true;
min_order = mapping_min_folio_order(folio->mapping);
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
if (!try_folio_split(folio, split_at, NULL)) {
if (!try_folio_split_or_unmap(folio, split_at, min_order)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
@ -248,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_trylock(folio2))
goto out;
/*
* make sure folio2 is large and does not change its mapping.
* Its split result does not matter here.
*/
/* make sure folio2 is large and does not change its mapping */
if (folio_test_large(folio2) &&
folio2->mapping == folio->mapping)
try_folio_split(folio2, split_at2, NULL);
try_folio_split_or_unmap(folio2, split_at2, min_order);
folio_unlock(folio2);
out:


@ -277,12 +277,6 @@ handle_line() {
fi
done
if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then
words[$last-1]="${words[$last-1]} ${words[$last]}"
unset words[$last] spaces[$last]
last=$(( $last - 1 ))
fi
# Extract info after the symbol if present. E.g.:
# func_name+0x54/0x80 (P)
# ^^^
@ -295,6 +289,14 @@ handle_line() {
last=$(( $last - 1 ))
fi
# Join module name with its build id if present, as these were
# split during tokenization (e.g. "[module" and "modbuildid]").
if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then
words[$last-1]="${words[$last-1]} ${words[$last]}"
unset words[$last] spaces[$last]
last=$(( $last - 1 ))
fi
if [[ ${words[$last]} =~ \[([^]]+)\] ]]; then
module=${words[$last]}
# some traces are formatted as "(%pS)", which looks like "(foo+0x0/0x1 [bar])"