1
0
mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git synced 2026-01-11 17:10:13 +00:00

Compare commits

...

425 Commits

Author SHA1 Message Date
Linus Torvalds
c06c303832 ocfs2: fix xattr array entry __counted_by error
Commit 2f26f58df041 ("ocfs2: annotate flexible array members with
__counted_by_le()") started annotating the flexible arrays used by
ocfs2, and now gcc complains about ocfs2_reflink_xattr_header():

  In function ‘fortify_memset_chk’,
      inlined from ‘ocfs2_reflink_xattr_header’ at fs/ocfs2/xattr.c:6365:5:
  include/linux/fortify-string.h:480:25: error: call to ‘__write_overflow_field’ declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]

and it looks like the complaint is valid - even if the actual error
message is somewhat confusing.

The 'last' pointer points past the end of the counted flex array, but
is used as an actual 'last' entry rather than a 'one-past-last'.

It looks like the code copied and cleared an extra entry (which is
likely harmless in practice), but I don't know ocfs2 at all.  Because
it's also possible that the counted-by annotations are off-by-one, and
so this needs checking by somebody who actually knows ocfs2.

But in the meantime this fixes the build error, and certainly _looks_
sane.

Cc: Dmitry Antipov <dmantipov@yandex.ru>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Heming Zhao <heming.zhao@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-06 15:28:11 -08:00
Linus Torvalds
509d3f4584 Significant patch series in this pull request:
- The 6 patch series "panic: sys_info: Refactor and fix a potential
   issue" from Andy Shevchenko fixes a build issue and does some cleanup in
   lib/sys_info.c.
 
 - The 9 patch series "Implement mul_u64_u64_div_u64_roundup()" from
   David Laight enhances the 64-bit math code on behalf of a PWM driver and
   beefs up the test module for these library functions.
 
 - The 2 patch series "scripts/gdb/symbols: make BPF debug info available
   to GDB" from Ilya Leoshkevich makes BPF symbol names, sizes, and line
   numbers available to the GDB debugger.
 
 - The 4 patch series "Enable hung_task and lockup cases to dump system
   info on demand" from Feng Tang adds a sysctl which can be used to cause
   additional info dumping when the hung-task and lockup detectors fire.
 
 - The 6 patch series "lib/base64: add generic encoder/decoder, migrate
   users" from Kuan-Wei Chiu adds a general base64 encoder/decoder to lib/
   and migrates several users away from their private implementations.
 
 - The 2 patch series "rbtree: inline rb_first() and rb_last()" from Eric
   Dumazet makes TCP a little faster.
 
 - The 9 patch series "liveupdate: Rework KHO for in-kernel users" from
   Pasha Tatashin reworks the KEXEC Handover interfaces in preparation for
   Live Update Orchestrator (LUO), and possibly for other future clients.
 
 - The 13 patch series "kho: simplify state machine and enable dynamic
   updates" from Pasha Tatashin increases the flexibility of KEXEC
   Handover.  Also preparation for LUO.
 
 - The 18 patch series "Live Update Orchestrator" from Pasha Tatashin is
   a major new feature targeted at cloud environments.  Quoting the [0/N]:
 
     This series introduces the Live Update Orchestrator, a kernel subsystem
     designed to facilitate live kernel updates using a kexec-based reboot.
     This capability is critical for cloud environments, allowing hypervisors
     to be updated with minimal downtime for running virtual machines.  LUO
     achieves this by preserving the state of selected resources, such as
     memory, devices and their dependencies, across the kernel transition.
 
     As a key feature, this series includes support for preserving memfd file
     descriptors, which allows critical in-memory data, such as guest RAM or
     any other large memory region, to be maintained in RAM across the kexec
     reboot.
 
   Mike Rapoport merits a mention here, for his extensive review and
   testing work.
 
 - The 3 patch series "kexec: reorganize kexec and kdump sysfs" from
   Sourabh Jain moves the kexec and kdump sysfs entries from /sys/kernel/
   to /sys/kernel/kexec/ and adds back-compatibility symlinks which can
   hopefully be removed one day.
 
 - The 2 patch series "kho: fixes for vmalloc restoration" from Mike
   Rapoport fixes a BUG which was being hit during KHO restoration of
   vmalloc() regions.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCaTSAkQAKCRDdBJ7gKXxA
 jrkiAP9QKfsRv46XZaM5raScjY1ayjP+gqb2rgt6BQ/gZvb2+wD/cPAYOR6BiX52
 n0pVpQmG5P/KyOmpLztn96ejL4heKwQ=
 =JY96
 -----END PGP SIGNATURE-----

Merge tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull non-MM updates from Andrew Morton:

 - "panic: sys_info: Refactor and fix a potential issue" (Andy Shevchenko)
   fixes a build issue and does some cleanup in lib/sys_info.c

 - "Implement mul_u64_u64_div_u64_roundup()" (David Laight)
   enhances the 64-bit math code on behalf of a PWM driver and beefs up
   the test module for these library functions

 - "scripts/gdb/symbols: make BPF debug info available to GDB" (Ilya Leoshkevich)
   makes BPF symbol names, sizes, and line numbers available to the GDB
   debugger

 - "Enable hung_task and lockup cases to dump system info on demand" (Feng Tang)
   adds a sysctl which can be used to cause additional info dumping when
   the hung-task and lockup detectors fire

 - "lib/base64: add generic encoder/decoder, migrate users" (Kuan-Wei Chiu)
   adds a general base64 encoder/decoder to lib/ and migrates several
   users away from their private implementations

 - "rbtree: inline rb_first() and rb_last()" (Eric Dumazet)
   makes TCP a little faster

 - "liveupdate: Rework KHO for in-kernel users" (Pasha Tatashin)
   reworks the KEXEC Handover interfaces in preparation for Live Update
   Orchestrator (LUO), and possibly for other future clients

 - "kho: simplify state machine and enable dynamic updates" (Pasha Tatashin)
   increases the flexibility of KEXEC Handover. Also preparation for LUO

 - "Live Update Orchestrator" (Pasha Tatashin)
   is a major new feature targeted at cloud environments. Quoting the
   cover letter:

      This series introduces the Live Update Orchestrator, a kernel
      subsystem designed to facilitate live kernel updates using a
      kexec-based reboot. This capability is critical for cloud
      environments, allowing hypervisors to be updated with minimal
      downtime for running virtual machines. LUO achieves this by
      preserving the state of selected resources, such as memory,
      devices and their dependencies, across the kernel transition.

      As a key feature, this series includes support for preserving
      memfd file descriptors, which allows critical in-memory data, such
      as guest RAM or any other large memory region, to be maintained in
      RAM across the kexec reboot.

   Mike Rapoport merits a mention here, for his extensive review and
   testing work.

 - "kexec: reorganize kexec and kdump sysfs" (Sourabh Jain)
   moves the kexec and kdump sysfs entries from /sys/kernel/ to
   /sys/kernel/kexec/ and adds back-compatibility symlinks which can
   hopefully be removed one day

 - "kho: fixes for vmalloc restoration" (Mike Rapoport)
   fixes a BUG which was being hit during KHO restoration of vmalloc()
   regions

* tag 'mm-nonmm-stable-2025-12-06-11-14' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (139 commits)
  calibrate: update header inclusion
  Reinstate "resource: avoid unnecessary lookups in find_next_iomem_res()"
  vmcoreinfo: track and log recoverable hardware errors
  kho: fix restoring of contiguous ranges of order-0 pages
  kho: kho_restore_vmalloc: fix initialization of pages array
  MAINTAINERS: TPM DEVICE DRIVER: update the W-tag
  init: replace simple_strtoul with kstrtoul to improve lpj_setup
  KHO: fix boot failure due to kmemleak access to non-PRESENT pages
  Documentation/ABI: new kexec and kdump sysfs interface
  Documentation/ABI: mark old kexec sysfs deprecated
  kexec: move sysfs entries to /sys/kernel/kexec
  test_kho: always print restore status
  kho: free chunks using free_page() instead of kfree()
  selftests/liveupdate: add kexec test for multiple and empty sessions
  selftests/liveupdate: add simple kexec-based selftest for LUO
  selftests/liveupdate: add userspace API selftests
  docs: add documentation for memfd preservation via LUO
  mm: memfd_luo: allow preserving memfd
  liveupdate: luo_file: add private argument to store runtime state
  mm: shmem: export some functions to internal.h
  ...
2025-12-06 14:01:20 -08:00
Linus Torvalds
09670b8c38 tracing fixes for v6.19:
- Fix accounting of stop_count in file release
 
   On opening the trace file, if "pause-on-trace" option is set, it will
   increment the stop_count. On file release, it checks if stop_count is set,
   and if so it decrements it. Since this code was originally written, the
   stop_count can be incremented by other use cases. This makes just checking
   the stop_count not enough to know if it should be decremented.
 
   Add a new iterator flag called "PAUSE" and have it set if the open
   disables tracing and only decrement the stop_count if that flag is set on
   close.
 
 - Remove length field in trace_seq_printf() of print_synth_event()
 
   When printing the synthetic event that has a static length array field,
   the vsprintf() of the trace_seq_printf() triggered a "(efault)" in the
   output. That's because the print_fmt replaced the "%.*s" with "%s" causing
   the arguments to be off.
 
 - Fix a bunch of typos
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQRRSw7ePDh/lE+zeZMp5XQQmuv6qgUCaTRsYBQccm9zdGVkdEBn
 b29kbWlzLm9yZwAKCRAp5XQQmuv6qrYpAQC3Qc5QMOlPjqGHXls/4IR4SBEAvsUi
 VZx3PdknfYCe3AD9HoYGOtrDDhSJ1tQbsWP5ud2jatHwL0zGAl3legNp7ww=
 =jlNP
 -----END PGP SIGNATURE-----

Merge tag 'trace-v6.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace

Pull tracing fixes from Steven Rostedt:

 - Fix accounting of stop_count in file release

   On opening the trace file, if "pause-on-trace" option is set, it will
   increment the stop_count. On file release, it checks if stop_count is
   set, and if so it decrements it. Since this code was originally
   written, the stop_count can be incremented by other use cases. This
   makes just checking the stop_count not enough to know if it should be
   decremented.

   Add a new iterator flag called "PAUSE" and have it set if the open
   disables tracing and only decrement the stop_count if that flag is
   set on close.

 - Remove length field in trace_seq_printf() of print_synth_event()

   When printing the synthetic event that has a static length array
   field, the vsprintf() of the trace_seq_printf() triggered a
   "(efault)" in the output. That's because the print_fmt replaced the
   "%.*s" with "%s" causing the arguments to be off.

 - Fix a bunch of typos

* tag 'trace-v6.19-2' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace:
  tracing: Fix typo in trace_seq.c
  tracing: Fix typo in trace_probe.c
  tracing: Fix multiple typos in trace_osnoise.c
  tracing: Fix multiple typos in trace_events_user.c
  tracing: Fix typo in trace_events_trigger.c
  tracing: Fix typo in trace_events_hist.c
  tracing: Fix typo in trace_events_filter.c
  tracing: Fix multiple typos in trace_events.c
  tracing: Fix multiple typos in trace.c
  tracing: Fix typo in ring_buffer_benchmark.c
  tracing: Fix multiple typos in ring_buffer.c
  tracing: Fix typo in fprobe.c
  tracing: Fix typo in fpgraph.c
  tracing: Fix fixed array of synthetic event
  tracing: Fix enabling of tracing on file release
2025-12-06 13:49:40 -08:00
Linus Torvalds
cc3ee4ba57 Miscellaneous documentation fixes.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmk0F6ERHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1hj3w/+N9TghrusYXp4Uoo7krob+SzUIf7cOBEa
 S/akrU1cNgHalPhZFBX5k8oXDcs8HFk5iyXvkr1uVepsdEfxT4BFtvARRRhY/58S
 Z5pFwfMSBDrUx5Xr6Z+jZrRkiSthQ+Dk//4HDhC1/3+344KAL5OvN3wMVrdBAQvx
 +rNS2+NpYpgvFUq7IgtUeRw7pMo4o7rnC+5R6pW6bneCsDYILNmyux9qS9VtnsIC
 +9qNtHlwm+LHEePsDsaQ567D/HFPS+yye/QI0HY2AIyCdO5COIPC73Z4BY3auHr4
 V6709nkQo70HQeY0LVi1oR72WMIW+2WETaL5Vx/Yp8HEAbiY9VKdKg89I8/KHqCI
 4OpjpDD1p/+33Ua8Otf3DAiF9DCfUeVWxAhB+wl17nAiFyUrYtRghNOKh8n+SovM
 EK+EUWX+xMyRVvYAPao7b505nxLqsJn5P6W4EFpN9RWXOKUNDYpo4DSUI32br1HB
 EN2bFQb0AkeRE/AD0K+DTRoorWYe+ZIdwcKXod0GhYoBiSbH36nlhFQ/8zn4p1y/
 hp4DZVcyImk3h/LIqf54PdfZS7s6hoU3VuzOzPVrUHdR+kX6XUs8dEqD+vf+xfCH
 yAwA1+bxpYOo3KNikqGkHBLFTO2K/Hb5gJ5ARUosa1LBu1SvSGbFuhcDzJRntvzK
 G22YpUHWbek=
 =tB8Y
 -----END PGP SIGNATURE-----

Merge tag 'x86-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
 "Miscellaneous documentation fixes"

* tag 'x86-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/boot/Documentation: Prefix hexadecimal literals with 0x
  x86/boot/Documentation: Spell 'ID' consistently
  x86/platform: Fix and extend kernel-doc comments in <asm/x86_init.h>
2025-12-06 12:33:26 -08:00
Linus Torvalds
09bcd5ef66 Miscellaneous scheduler fixes/cleanups:
- Fix psi_dequeue() for Proxy Execution
   - Fix hrtick() vs. scheduling context bug
   - Fix unfairness caused by stalled tg_load_avg_contrib when the last task migrates out
   - Fix whitespace noise in headers
   - Remove a preempt-disable section in rt_mutex_setprio()
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmk0FhoRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1holg/8DQWTB5UcOSK8r4VR6xsDkxmnEA4RNJYg
 YWMAUCJbeAP1qciX9QbPH6T8aKAtrx5d7aLqGfDj7u0sRWTtkz32FcqWiny56vox
 Rc7UIHCL9n5r/ZMwy5sNN7Dxfdr2Eqmxyg5yaAT3VU3AkBQWzVfj8wQbzYL0FPy3
 aw4kwaRB8NMTANdcyVi3hTIDXbLeNb8WCvUKmH0YDfEpeKBikzLBn+yvlkGB/9Wb
 LZ3CzPUtLVdKyJ/W1VdJBokZ1DSUhMkLFFWXxFqt/5TgaXu5wYyCpUr0mvHSQDEd
 PITEMjhZ+NVMQLSSe3od1qa3vIpNe1W/t0FvlXPHXmZ/lmqC3UXwZuO1gzzCz9Jk
 7NluFcoFdSNPLlDEjzA1qls6YAaKHZ8FZfrWRjr2LnZZKZj1r6pfeVJjCWl6Lw+U
 7aOH+Z4TZexgdjZo9qC+S7VJUvt5+P6SipVE/F9a4xd85kEWxGUhKhJuDD7b0Ksq
 pe0dQcva7mW7/FpAZYdSycYTJ98fU4SiUeKp7Er4xX0mRLO7KVFPja9M5ZHvPGH3
 m+i0ONknEDtavgFmg0Z2L2+bpad+cv18hwR94Jhuw7q5ZciwV9vaO/4hbGb4AlAV
 /eJ4rHqPWhOTfe4iHDl82nRYeOxT2V5tX0Uu5oDAKhpAk9xNFkByQsl53gqdJT+Q
 3zVRbx2LIA4=
 =kr0U
 -----END PGP SIGNATURE-----

Merge tag 'sched-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Miscellaneous scheduler fixes/cleanups:

   - Fix psi_dequeue() for Proxy Execution

   - Fix hrtick() vs. scheduling context bug

   - Fix unfairness caused by stalled tg_load_avg_contrib when the last
     task migrates out

   - Fix whitespace noise in headers

   - Remove a preempt-disable section in rt_mutex_setprio()"

* tag 'sched-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/core: Fix psi_dequeue() for Proxy Execution
  sched/fair: Fix unfairness caused by stalled tg_load_avg_contrib when the last task migrates out
  sched/rt: Remove a preempt-disable section in rt_mutex_setprio()
  sched/hrtick: Fix hrtick() vs. scheduling context
  sched/headers: Remove whitespace noise from kernel/sched/sched.h
2025-12-06 12:31:21 -08:00
Linus Torvalds
08b8ddac1f Address various objtool scalability bugs/inefficiencies exposed by
allmodconfig builds, plus improve the quality of alternatives
 instructions generated code and disassembly.
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmk0FVoRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1iClg//dHY58dvrsp5Fzo10XgU99/kwzNEgl2b5
 SMrSEbliTrehdpG4vBvig9tAMZurxOVIf6yDBEtV45XfD6w3tw6EFYpO1was9wTE
 R/80Ze6BAEeao782xN3sCpakU1Ogwbxhe4jYFZKE/WVbP9ZaeCI8qeBj3RAuOQ9y
 PCJzjD5fl9c2cAGDqCJEswxIptpP7eXoBo/V3Txf46M8/ffFcXdJbHN3HRBlszVs
 5I9Wb2/vFmwJ4Yi4EO8H7KfzwaXA8wW/MJSDcM24P2/+o5iTqSLNd+rADFMW3XF2
 /8b3uAy/6A6tT3ek1teNoM7qB9hRpM1pmpFwgjjTkjl8yamEp6P/W99qUN+UmfV+
 NTiW9sz7ShhVTMCdALIljyjmji318crKYQBDulAHuEACpodcBg/GUGfuUcrjSRB/
 C7PLatOpfMCODPRGPH4+8Wg8nnBGvOEjjODZBjAq2yU5aJnBeLPmbK2mtcaJtKi+
 R0T2LIsNgmnEa4wRZbH8i4jXsgcbe6gD45Tx3qZpss7D4d9IyRWPO8v6GegFUpvh
 dw8qBqhgi1FzryZ/5uwh5IzkVq+iXHqkPBsV9w7CVSFF1Kc5w1/l7MXsEjkc7Xe3
 qMjc43qsN0H/7ngoIA7yp4m7q87gqJMzReIfeIF4pGVtoULGQ+drN0jjQE/SHiKS
 /EM8IAAk0pU=
 =2DKc
 -----END PGP SIGNATURE-----

Merge tag 'objtool-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull objtool fixes from Ingo Molnar:
 "Address various objtool scalability bugs/inefficiencies exposed by
  allmodconfig builds, plus improve the quality of alternatives
  instructions generated code and disassembly"

* tag 'objtool-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  objtool: Simplify .annotate_insn code generation output some more
  objtool: Add more robust signal error handling, detect and warn about stack overflows
  objtool: Remove newlines and tabs from annotation macros
  objtool: Consolidate annotation macros
  x86/asm: Remove ANNOTATE_DATA_SPECIAL usage
  x86/alternative: Remove ANNOTATE_DATA_SPECIAL usage
  objtool: Fix stack overflow in validate_branch()
2025-12-06 11:56:51 -08:00
Linus Torvalds
92fc1f16e2 Two fixes related to recent introduction of scoped_seqlock_read():
- Fix compiler build failures when a particular .config and
    compiler build options variant doesn't result in the
    expected removal of unused, catch-bugs portions of
    scoped_seqlock_read() by the inliner at build time,
    and cause a linker fail even in correct code.
 
  - Match read-locking order in do_task_stat() and
    do_io_accounting(). The inconsistency here was harmless
    but unnecessary.
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmk0E/wRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1j10A/9GFzapX2e4SjyzTUeScdEbVM9cp7cb4lo
 PdPXtmq+Nsh+aw/I+hFBj1bGOnV+xRfoU/wc2a3CfFa1CwekRuzTcbCMU60wfJJr
 CYTf1e390J+6XG588JasjykN9TePDY+TZPIv9rO1Eczsgf6AlXQ8m3Q2tAax8CoN
 8AwMBLZFDmP6WjykeqADXOw4MWU/qS/6vESy65LN11n4jzp7dU4VCTF/Cb8ISEvj
 DMPByPwl/Kv9ioKwKf8xEMHvwboa3wwaE8OI8hxvcdB/VjUk0aSTSN/+6EFefX09
 XM2+2EmDr0E2lLutdsBccD7Yy9PK9vlekWSTDqGY5UcTWYRc0mo20nnUrHyu9QF5
 Yk+mr3jyYnPxKgUYu2d2mfeHXDidW0Yo60fvsO7Y1yfxBPQLMyr+MdYiq86IUvcc
 6pTJhw33N5pd+1zM2tRJclUAyiXSD89/XftBrF/USKB0SAS8dLz/fYJ3ApjEJzfz
 xdbIOz8jOMGum1wF8ss1wqjIhWvw3ICZhdMo+6032eR5Gr5SNGpM1lPu7cQQJ4pI
 tRvATuu5lXsxPEM4l/gb1BqlZImUGCaG4wgiQ1xcwMrsyqLoqWUmumdF3A84+wBa
 J6N+3NFALuce6vxABUOyx/oqfaBITndbnWQlC4uGgtNzlqjGABsyqBQL+svp2j2m
 O2jhqGfW4J8=
 =+H27
 -----END PGP SIGNATURE-----

Merge tag 'locking-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Ingo Molnar:
 "Two fixes related to recent introduction of scoped_seqlock_read():

   - Fix compiler build failures when a particular .config and compiler
     build options variant doesn't result in the expected removal of
     unused, catch-bugs portions of scoped_seqlock_read() by the inliner
     at build time, and cause a linker fail even in correct code

   - Match read-locking order in do_task_stat() and do_io_accounting().
     The inconsistency here was harmless but unnecessary"

* tag 'locking-urgent-2025-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  seqlock: Cure some more scoped_seqlock() optimization fails
  seqlock, procfs: Match scoped_seqlock_read() critical section vs. RCU ordering in do_task_stat() to do_io_accounting()
2025-12-06 11:31:49 -08:00
Linus Torvalds
5e5ea7f616 iommu/amd: fix SEV-TIO support reporting
Commit eeb934137deb ("iommu/amd: Report SEV-TIO support") was confused
about the config options that expose amd_iommu_sev_tio_supported(), and
made the declaration (and alternative dummy function) conditional on the
CONFIG_AMD_IOMMU config option.

But the code is actually dependent on CONFIG_KVM_AMD_SEV, resulting in

   ERROR: modpost: "amd_iommu_sev_tio_supported" [drivers/crypto/ccp/ccp.ko] undefined!
   make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1

if you have the AMD iommu enabled, but don't enable KVM_AMD_SEV support.

Fix it by moving the declaration into the right #ifdef section in the
header file.

Fixes: eeb934137deb ("iommu/amd: Report SEV-TIO support")
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: Vasant Hegde <vasant.hegde@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-06 11:13:50 -08:00
Linus Torvalds
b0319c4642 NFSD 6.19 Release Notes
Mike Snitzer's mechanism for disabling I/O caching introduced in
 v6.18 is extended to include using direct I/O. The goal is to
 further reduce the memory footprint consumed by NFS clients
 accessing large data sets via NFSD.
 
 The NFSD community adopted a maintainer entry profile during this
 cycle. See
 
   Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst
 
 Work continues on hardening NFSD's implementation of the pNFS block
 layout type. This type enables pNFS clients to directly access the
 underlying block devices that contain an exported file system,
 reducing server overhead and increasing data throughput.
 
 The remaining patches in this pull request are clean-ups and minor
 optimizations. Many thanks to the contributors, reviewers, testers,
 and bug reporters who participated during the v6.19 NFSD development
 cycle.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKLLlsBKG3yQ88j7+M2qzM29mf5cFAmk0SkQACgkQM2qzM29m
 f5cI3RAAoqg53ctyWC8B76mC2kT/bugWWXNwVAVd58UGy3yptMlJG2TBBAHbV6rs
 NgMzP1gg470eccYonntS6Pk259hJimi/REifk+2bFuvjqym7OruKXJBxn2FZrCCw
 mBu/ptfj9SFoTezAHh/wNuHTEy68gLn6V2LB3jdMcxeqUaC39kFnB0sZP4+xiFcx
 PfL6uiXj8JtGpYXf8AKf7HniZCBrtkia1ByRrFHrcPX5A6S9dL85rDQbm/O8L3AA
 hS3cp2UQUSwvFUED9N2QXPpRQ3nytNSG08f/wOaXhIXAmXq/sZFGjlNqjpX8i5jV
 jsSQIeYy1BwLsdxo25wbShrqmYsFH3zWRELKoCs0g9lXbpJWyK0O93zETXtYRRae
 4O9iJ5VcQMyMpg4/Gh1jsRU9RUCc88T15F2HQVYTIgHrLKnz1sWMKI//1G3kYrf3
 L7hYk2ZU6QYqSNOFkrL0Jf7geqf8FR4nUi/nv+A/2gFqm+WWIhNTKQt2hsQlqD66
 HaYOhFX5SOj8YnjsbpGM8W65n8ELaON0PWhq+SBpRCm7i2llod0HTIpyv00NTwVL
 xmr5MfmO7989kVVy7DA/h/xg/dfOpUCNjImrXz+QDbsmCMdLjT/6YGD47FLkxzEO
 O+PuRIORuez8KosEaIRVjdXcNFPSeGfDdBAveHH9IqRX7ZvIZQY=
 =E0oM
 -----END PGP SIGNATURE-----

Merge tag 'nfsd-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux

Pull nfsd updates from Chuck Lever:

 - Mike Snitzer's mechanism for disabling I/O caching introduced in
   v6.18 is extended to include using direct I/O. The goal is to further
   reduce the memory footprint consumed by NFS clients accessing large
   data sets via NFSD.

 - The NFSD community adopted a maintainer entry profile during this
   cycle. See

      Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst

 - Work continues on hardening NFSD's implementation of the pNFS block
   layout type. This type enables pNFS clients to directly access the
   underlying block devices that contain an exported file system,
   reducing server overhead and increasing data throughput.

 - The remaining patches are clean-ups and minor optimizations. Many
   thanks to the contributors, reviewers, testers, and bug reporters who
   participated during the v6.19 NFSD development cycle.

* tag 'nfsd-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux: (38 commits)
  NFSD: nfsd-io-modes: Separate lists
  NFSD: nfsd-io-modes: Wrap shell snippets in literal code blocks
  NFSD: Add toctree entry for NFSD IO modes docs
  NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst
  NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
  NFSD: Make FILE_SYNC WRITEs comply with spec
  NFSD: Add trace point for SCSI fencing operation.
  NFSD: use correct reservation type in nfsd4_scsi_fence_client
  xdrgen: Don't generate unnecessary semicolon
  xdrgen: Fix union declarations
  NFSD: don't start nfsd if sv_permsocks is empty
  xdrgen: handle _XdrString in union encoder/decoder
  xdrgen: Fix the variable-length opaque field decoder template
  xdrgen: Make the xdrgen script location-independent
  xdrgen: Generalize/harden pathname construction
  lockd: don't allow locking on reexported NFSv2/3
  MAINTAINERS: add a nfsd blocklayout reviewer
  nfsd: Use MD5 library instead of crypto_shash
  nfsd: stop pretending that we cache the SEQUENCE reply.
  NFS: nfsd-maintainer-entry-profile: Inline function name prefixes
  ...
2025-12-06 10:57:02 -08:00
Linus Torvalds
1a68aefc71 xen: branch for v6.19-rc1
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRTLbB6QfY48x44uB6AXGG7T9hjvgUCaTPM8wAKCRCAXGG7T9hj
 vhGqAP9rd3B4jk2ATeij3Qm+IS4VnglsOX4elLdT/Tff410jiAD+NFvQbXuE/ujZ
 6J/tejkqwDcxWbYbXvILzc3VniK+Dgc=
 =dFWc
 -----END PGP SIGNATURE-----

Merge tag 'for-linus-6.19-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross:
 "This round it contains only three small cleanup patches"

* tag 'for-linus-6.19-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  drivers/xen: use min() instead of min_t()
  drivers/xen/xenbus: Replace deprecated strcpy in xenbus_transaction_end
  drivers/xen/xenbus: Simplify return statement in join()
2025-12-06 10:49:19 -08:00
Linus Torvalds
249872f53d tsm for 6.19
- Introduce the PCI/TSM core for the coordination of device
   authentication, link encryption and establishment (IDE), and later
   management of the device security operational states (TDISP). Notify
   the new TSM core layer of PCI device arrival and departure.
 
 - Add a low level TSM driver for the link encryption establishment
   capabilities of the AMD SEV-TIO architecture.
 
 - Add a library of helpers for TSM drivers to use for IDE establishment and
   the DOE transport.
 
 - Add skeleton support for 'bind' and 'guest_request' operations in
   support of TDISP.
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQSbo+XnGs+rwLz9XGXfioYZHlFsZwUCaTOdAwAKCRDfioYZHlFs
 Z/fWAQDS5mwS/8rn0UdH/SijTm/oKVxdiyIQbTstrjk8AySITgEA5ki9w2iKa0WG
 x1ACZKlo9gS9emyx4wuJpCBIMtR50Qc=
 =B4oG
 -----END PGP SIGNATURE-----

Merge tag 'tsm-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/devsec/tsm

Pull PCIe Link Encryption and Device Authentication from Dan Williams:
 "New PCI infrastructure and one architecture implementation for PCIe
  link encryption establishment via platform firmware services.

  This work is the result of multiple vendors coming to consensus on
  some core infrastructure (thanks Alexey, Yilun, and Aneesh!), and
  three vendor implementations, although only one is included in this
  pull. The PCI core changes have an ack from Bjorn, the crypto/ccp/
  changes have an ack from Tom, and the iommu/amd/ changes have an ack
  from Joerg.

  PCIe link encryption is made possible by the soup of acronyms
  mentioned in the shortlog below. Link Integrity and Data Encryption
  (IDE) is a protocol for installing keys in the transmitter and
  receiver at each end of a link. That protocol is transported over Data
  Object Exchange (DOE) mailboxes using PCI configuration requests.

  The aspect that makes this a "platform firmware service" is that the
  key provisioning and protocol is coordinated through a Trusted
  Execution Environment (TEE) Security Manager (TSM). That is either
  firmware running in a coprocessor (AMD SEV-TIO), or quasi-hypervisor
  software (Intel TDX Connect / ARM CCA) running in a protected CPU
  mode.

  Now, the only reason to ask a TSM to run this protocol and install the
  keys rather than have a Linux driver do the same is so that later, a
  confidential VM can ask the TSM directly "can you certify this
  device?".

  That precludes host Linux from provisioning its own keys, because host
  Linux is outside the trust domain for the VM. It also turns out that
  all architectures, save for one, do not publish a mechanism for an OS
  to establish keys in the root port. So "TSM-established link
  encryption" is the only cross-architecture path for this capability
  for the foreseeable future.

  This unblocks the other arch implementations to follow in v6.20/v7.0,
  once they clear some other dependencies, and it unblocks the next
  phase of work to implement the end-to-end flow of confidential device
  assignment. The PCIe specification calls this end-to-end flow Trusted
  Execution Environment (TEE) Device Interface Security Protocol
  (TDISP).

  In the meantime, Linux gets a link encryption facility which has
  practical benefits along the same lines as memory encryption. It
  authenticates devices via certificates and may protect against
  interposer attacks trying to capture clear-text PCIe traffic.

  Summary:

   - Introduce the PCI/TSM core for the coordination of device
     authentication, link encryption and establishment (IDE), and later
     management of the device security operational states (TDISP).
     Notify the new TSM core layer of PCI device arrival and departure

   - Add a low level TSM driver for the link encryption establishment
     capabilities of the AMD SEV-TIO architecture

   - Add a library of helpers for TSM drivers to use for IDE establishment
     and the DOE transport

   - Add skeleton support for 'bind' and 'guest_request' operations in
     support of TDISP"

* tag 'tsm-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/devsec/tsm: (23 commits)
  crypto/ccp: Fix CONFIG_PCI=n build
  virt: Fix Kconfig warning when selecting TSM without VIRT_DRIVERS
  crypto/ccp: Implement SEV-TIO PCIe IDE (phase1)
  iommu/amd: Report SEV-TIO support
  psp-sev: Assign numbers to all status codes and add new
  ccp: Make snp_reclaim_pages and __sev_do_cmd_locked public
  PCI/TSM: Add 'dsm' and 'bound' attributes for dependent functions
  PCI/TSM: Add pci_tsm_guest_req() for managing TDIs
  PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs
  PCI/IDE: Initialize an ID for all IDE streams
  PCI/IDE: Add Address Association Register setup for downstream MMIO
  resource: Introduce resource_assigned() for discerning active resources
  PCI/TSM: Drop stub for pci_tsm_doe_transfer()
  drivers/virt: Drop VIRT_DRIVERS build dependency
  PCI/TSM: Report active IDE streams
  PCI/IDE: Report available IDE streams
  PCI/IDE: Add IDE establishment helpers
  PCI: Establish document for PCI host bridge sysfs attributes
  PCI: Add PCIe Device 3 Extended Capability enumeration
  PCI/TSM: Establish Secure Sessions and Link Encryption
  ...
2025-12-06 10:15:41 -08:00
Linus Torvalds
fbff949679 linux-watchdog 6.19-rc1 tag
-----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2.0.14 (GNU/Linux)
 
 iEYEABECAAYFAmk0BaIACgkQ+iyteGJfRsry9QCfRHZ011GLHsNDseZ4pYC2ZJec
 rXwAoNrYZiVHOmWc3rNK2ZzXygNPOjhI
 =VKgp
 -----END PGP SIGNATURE-----

Merge tag 'linux-watchdog-6.19-rc1' of git://www.linux-watchdog.org/linux-watchdog

Pull watchdog updates from Wim Van Sebroeck:

 - Add watchdog support for:
     - Renesas WWDT
     - AST2700 platform
     - MediaTek MT8189 SoC
     - Loongson-2k0300 watchdog
     - Qualcomm Kaanapali watchdog
     - RK3506 compatible
     - Airoha AN7583 SoC

 - DT Schema conversions:
     - lantiq,wdt
     - TI OMAP
     - marvell,orion-wdt

 - Several other fixes and improvements

* tag 'linux-watchdog-6.19-rc1' of git://www.linux-watchdog.org/linux-watchdog: (30 commits)
  watchdog: starfive: Fix resource leak in probe error path
  dt-bindings: watchdog: airoha: Add support for Airoha AN7583 SoC
  dt-bindings: watchdog: lantiq,wdt: convert bindings to dtschema
  dt-bindings: watchdog: Add RK3506 compatible
  dt-bindings: watchdog: Document Qualcomm Kaanapali watchdog
  watchdog: wdat_wdt: Fix ACPI table leak in probe function
  watchdog: loongson1: Add Loongson-2k0300 watchdog support
  dt-bindings: watchdog: loongson,ls1x-wdt: Add ls2k0300-wdt compatible
  watchdog: loongson1: Drop CONFIG_OF
  watchdog: loongson1: Simplify ls1x_wdt_probe code
  watchdog: loongson1: Add missing MODULE_PARM_DESC
  watchdog/diag288: Fix module comment typos
  dt-bindings: watchdog: Support MediaTek MT8189 wdt
  dt-bindings: watchdog: mediatek,mtk-wdt: Add compatible for MT8189 SoC
  dt-bindings: mfd: rohm,bd96801-pmic: Correct timeout-sec length and reference watchdog schema
  dt-bindings: watchdog: Allow node names named 'pmic'
  dt-bindings: watchdog: Restrict timeout-sec to one number
  watchdog: renesas_wwdt: add driver
  dt-bindings: watchdog: Add Renesas WWDT
  dt-bindings: watchdog: Convert marvell,orion-wdt to DT schema
  ...
2025-12-06 10:00:49 -08:00
Linus Torvalds
973ec55764 rpmsg updates for v6.19
Reduce code duplication related to channel removal, and invoke the
 removal in one case previously missing, in the Glink driver.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEBd4DzF816k8JZtUlCx85Pw2ZrcUFAmkzOyMACgkQCx85Pw2Z
 rcWoRA//eHFfVuYGB593ySrAC6XG8Pq0ePZmH2JlNo7vsj88aboN0Vzjlr/m5EII
 FWAVlMa8GN2gBUR89axNZEVu4fZ+cP7qMgT/aP5TdZDSoKro/tKXY8qbrIZFqWFz
 dyv3W5poeuAk5zH7IruR7HrvzQIoTxeCYaOkxe0kiwxfYVz+CFQzXkfXUBBWqWFS
 C5iSEMlXE2c6rIOIle/bcasSJFbag8OsjHAPwkwd12KVyrgsGl9xOnz5Ro3zA4Lq
 yxLG9ZUYRZy8evrCjH7+IdvE2MQdBkIWmuVvE1VitMfflAPG57kyG4R1XdUtLNoW
 KWPUOu5iVKxPyGeuWcLITM7UAQAc5HkzQSb43G3K0L0Pk2rGuS9VIWe4EBixPE4T
 usf77ROZNU3N6SGnk4YMmVQq3FD/4hBvbQIAX3T89IIOfLUeEhQQVUUIo7KqK8g0
 7SCAGokz60L4XhqUm1a5bERGXS7D1HKDDw+SbIctAENseAi0M3I3piLriDAqIfOo
 F0A1s2Wt5BxwQmCdqmuadhoUudZ3fCGEED8BdRVGgrkW1oqyShHgu65QzoLgvOe0
 1u/NCvnApY7amYnbEEodJQ5irFYxvHI5UwOVpRFOSuFZRxaYjJvm4Wg0bnNaO3ne
 VXJHokegnPt0seboLB1mvno4s6cN3VmMV6m95MxbqyFsj3/tf3Q=
 =BL9l
 -----END PGP SIGNATURE-----

Merge tag 'rpmsg-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux

Pull rpmsg updates from Bjorn Andersson:

 - Reduce code duplication related to channel removal, and invoke the
   removal in one case previously missing, both in the Glink driver

* tag 'rpmsg-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux:
  rpmsg: glink: remove duplicate code for rpmsg device remove
  rpmsg: glink: fix rpmsg device leak
2025-12-06 09:58:02 -08:00
Linus Torvalds
e637b37a52 remoteproc updates for v6.19
Add support for the compute DSP in the Qualcomm SDM660 platform, and
 finally fix up the way MSM8974 audio DSP remoteproc driver manages its
 power rails.
 
 Replace the usage of of_reserved_mem_lookup() with
 of_reserved_mem_region_to_resource() to clean things up across most of
 the drivers.
 
 Perform a variety of housekeeping and cleanup work across iMX, Mediatek,
 and TI remoteproc drivers.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEBd4DzF816k8JZtUlCx85Pw2ZrcUFAmkzOh0ACgkQCx85Pw2Z
 rcXl+hAA4BBJ2e9mq1eWh0d7LSMHg0F5idNZF2dswdSha6qJpJFZphAawhjASiGU
 LQTYMvJP3bSYaPSVr+ZO21399a2/xw1tR+nfYMeHrhdgKSYDEiJervt7Aoi5TyUI
 60OIHcs2DjvhaFPVD42UJ8gHwNilfLinV4dQZCm+aC1wdDg20tSkdIiolSZH5RH1
 S6o08c3jZTnH65ROTd3uzp5F8qG0qoPyyEuMpa1Vrk/C/T8is9yliEDFCrQtX9OS
 g+ARXMqFXYXLwwI6mMV6L6tY/VXuM0my5Uc02K3d6Zu8+B6m4uIvsVRribILPrdA
 6ME+RCPqrgTtR3gWqJD8aCudWXncwmqnRTGbi7HcqR5A5TR3Ke7iolf9WF291vsY
 qE8fjKscQIhWlRLlMDhv1zM3s+YVZE+oHJbC3hQN8g3m49I2eq3caYduKXcvjjoa
 RH3ZytUwx1cQ6ZJ0vnRxV5yPcddxZtL6tG7f4x8wn+orx2uMh8utpaHCJQbw286l
 XLV5CEN9bNgIwdsSY5POCGb6zoSHNyuXdzSKRtGUDRa4VC+Qc3LxeeVj46NJCc8I
 Q1tDguCl+2MY2wew5o+b77gPOc7xETbJ2oRtoAVShlnenmvhKrgKnRehR/GOhCqU
 gjOY2rW/oS7Ac8D9L4K6XU9WD0TReoeakZY0l+S5W9HXJQ2TK7o=
 =t4y4
 -----END PGP SIGNATURE-----

Merge tag 'rproc-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux

Pull remoteproc updates from Bjorn Andersson:

 - Add support for the compute DSP in the Qualcomm SDM660 platform, and
   finally fix up the way MSM8974 audio DSP remoteproc driver manages
   its power rails

 - Replace the usage of of_reserved_mem_lookup() with
   of_reserved_mem_region_to_resource() to clean things up across most
   of the drivers

 - Perform a variety of housekeeping and cleanup work across iMX,
   Mediatek, and TI remoteproc drivers

* tag 'rproc-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux: (45 commits)
  remoteproc: qcom_q6v5_wcss: use optional reset for wcss_q6_bcr_reset
  remoteproc: qcom_q6v5_wcss: fix parsing of qcom,halt-regs
  remoteproc: qcom_wcnss: Fix NULL vs IS_ERR() bug in wcnss_alloc_memory_region()
  remoteproc: qcom: q6v5: Fix NULL vs IS_ERR() bug in q6v5_alloc_memory_region()
  remoteproc: qcom: pas: Fix a couple NULL vs IS_ERR() bugs
  remoteproc: qcom_q6v5_adsp: Fix a NULL vs IS_ERR() check in adsp_alloc_memory_region()
  remoteproc: imx_dsp_rproc: Fix NULL vs IS_ERR() bug in imx_dsp_rproc_add_carveout()
  remoteproc: st: Fix indexing of memory-regions
  remoteproc: qcom: pas: Add support for SDM660 CDSP
  dt-bindings: remoteproc: qcom: adsp: Add SDM660 CDSP compatible
  dt-bindings: remoteproc: qcom: adsp: Add missing constrains for SDM660 ADSP
  dt-bindings: remoteproc: qcom,sc8280xp-pas: Fix CDSP power desc
  remoteproc: omap: Remove redundant pm_runtime_mark_last_busy() calls
  remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"
  remoteproc: qcom_q6v5_pas: Use resource with CX PD for MSM8974
  dt-bindings: remoteproc: qcom,adsp: Make msm8974 use CX as power domain
  remoteproc: Use of_reserved_mem_region_* functions for "memory-region"
  remoteproc: imx_dsp_rproc: Simplify start/stop error handling
  remoteproc: imx_rproc: Remove enum imx_rproc_method
  remoteproc: imx_dsp_rproc: Simplify IMX_RPROC_RESET_CONTROLLER switch case
  ...
2025-12-06 09:55:38 -08:00
Linus Torvalds
eee654ca9a Landlock update for v6.19-rc1
-----BEGIN PGP SIGNATURE-----
 
 iIYEABYKAC4WIQSVyBthFV4iTW/VU1/l49DojIL20gUCaTMgExAcbWljQGRpZ2lr
 b2QubmV0AAoJEOXj0OiMgvbSN0kBALPG/cpioGMk0j3DagnUtV6fPvGuux9YTmbe
 KpIWdsoCAQC5gO9nzHYIqBOL0CjMKjovljbN+W/AOiirJew95ocyAA==
 =msQS
 -----END PGP SIGNATURE-----

Merge tag 'landlock-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux

Pull landlock updates from Mickaël Salaün:
 "This mainly fixes handling of disconnected directories and adds new
  tests"

* tag 'landlock-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux:
  selftests/landlock: Add disconnected leafs and branch test suites
  selftests/landlock: Add tests for access through disconnected paths
  landlock: Improve variable scope
  landlock: Fix handling of disconnected directories
  selftests/landlock: Fix makefile header list
  landlock: Make docs in cred.h and domain.h visible
  landlock: Minor comments improvements
2025-12-06 09:52:41 -08:00
Linus Torvalds
10003ff8ce turbostat-v2025.12.02
Since turbostat-v2025.09.09:
 
 Add LLC statistics columns:
 	LLCkRPS = Last Level Cache Thousands of References Per Second
 	LLC%hit = Last Level Cache Hit %
 Recognize Wildcat Lake and Nova Lake platforms
 Add MSR check for Android
 Add APERF check for VMWARE
 Add RAPL check for AWS
 Minor fixes to turbostat (and x86_energy_perf_policy)
 -----BEGIN PGP SIGNATURE-----
 
 iQJIBAABCgAyFiEE67dNfPFP+XUaA73mB9BFOha3NhcFAmkvWPwUHGxlbi5icm93
 bkBpbnRlbC5jb20ACgkQB9BFOha3NhdjJRAAngCZNP1JONy7zqx8HsGJQ8+6WhEV
 E4F2mOHgeJd8Ws4iJUJHkEhNKK64J//RpKvyv0DWY/Zxb8i//+zv0FNd4LZTpBPk
 kKp/1AvJM1alfShQoxaHdCgrhsx4GiY9qPpGlGiNqJ3WNY0ikg+ayEQ+Iidot+5d
 YG57LdbAYe4eAnQzLIDrPG7sSNtvWdyYps41JhacoExPXjHfNMU3ZFGwXA0RExMy
 iyRW8t/Xllsvfnv2GDknUEfLJCpql43ukI+fvUAYuA104Rw7fOJsZp2/+AbWZPF8
 xqy2XWmuEeSwl7ed1hJhxQEV1w77FTs61tIQorFkLC8VkZhFWe4M6SK0TdE92SvC
 xpeSluw22YzGmWHsoclzL7Xf23i1L4cqMcaVbyxSiU5xzApolE1/khTbrJ2dlKxZ
 cWLSFUuvzRAGE0zicygaXPr6mMlPlbrbTu82slZMiN3uvd0+2sqD3MoUzZuei5Lp
 NJLKNAXbAHMtECHYKqed7p69xc+2Rob0dn27Dl8JOtqAzUeneytSg4tRVn+1YXxI
 NsJpgEFZ9FFQYuFW2vHuFJtpCnfVVKTFxml1q3Iy4T9Bh9VzVEwuRRpLpOhfD15I
 f/xR99AFXijJtzdLf3kWI198FP6RI5I04FJIl80RUYNmQRcQ2PSKq5xpkUP3f/dO
 LlCWw7HxP6It95g=
 =iMfD
 -----END PGP SIGNATURE-----

Merge tag 'turbostat-v2025.12.02' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux

Pull turbostat updates from Len Brown:

 - Add LLC statistics columns:
	LLCkRPS = Last Level Cache Thousands of References Per Second
	LLC%hit = Last Level Cache Hit %

 - Recognize Wildcat Lake and Nova Lake platforms

 - Add MSR check for Android

 - Add APERF check for VMWARE

 - Add RAPL check for AWS

 - Minor fixes to turbostat (and x86_energy_perf_policy)

* tag 'turbostat-v2025.12.02' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux: (21 commits)
  tools/power turbostat: version 2025.12.02
  tools/power turbostat: Print wide names only for RAW 64-bit columns
  tools/power turbostat: Print percentages in 8-columns
  tools/power turbostat: Print "nan" for out of range percentages
  tools/power turbostat: Validate APERF access for VMWARE
  tools/power turbostat: Enhance perf probe
  tools/power turbostat: Validate RAPL MSRs for AWS Nitro Hypervisor
  tools/power x86_energy_perf_policy: Fix potential NULL pointer dereference
  tools/power x86_energy_perf_policy: Fix format string in error message
  tools/power x86_energy_perf_policy: Simplify Android MSR probe
  tools/power x86_energy_perf_policy: Add Android MSR device support
  tools/power turbostat: Add run-time MSR driver probe
  tools/power turbostat: Set per_cpu_msr_sum to NULL after free
  tools/power turbostat: Add LLC stats
  tools/power turbostat: Remove dead code
  tools/power turbostat: Refactor floating point printout code
  tools/power turbostat.8: Update example
  tools/power turbostat: Refactor added-counter value printing code
  tools/power turbostat: Refactor added column header printing
  tools/power turbostat: Add Wildcat Lake and Nova Lake support
  ...
2025-12-06 09:35:00 -08:00
Linus Torvalds
56a1a04dc9 NVDIMM changes for 6.19
* nvdimm: Prevent integer overflow in ramdax_get_config_data()
 	* Documentation: btt: Unwrap bit 31-30 nested table
 	* nvdimm: replace use of system_wq with system_percpu_wq
 	* tools/testing/nvdimm: Use per-DIMM device handle
 	* nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices
 -----BEGIN PGP SIGNATURE-----
 
 iIoEABYKADIWIQSgX9xt+GwmrJEQ+euebuN7TNx1MQUCaTMAfRQcaXJhLndlaW55
 QGludGVsLmNvbQAKCRCebuN7TNx1MWGXAP4lGiQL2fMOpP0PnFVf6aiiNyaTzdaa
 47haKGmsvM6R1AEAqIPelP1tZ+zh6au2G27SJXzJAJk3Nxg1tpxQnZshmg0=
 =LhgA
 -----END PGP SIGNATURE-----

Merge tag 'libnvdimm-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull nvdimm updates from Ira Weiny:
 "These are mainly bug fixes and code updates.

  There is a new feature to divide up memmap= carve outs and a fix
  caught in linux-next for that patch. Managing memmap memory on the fly
  for multiple VMs was proving difficult and Mike provided a driver
  which allows for the memory to be better managed.

  Summary:
   - Allow exposing RAM carveouts as NVDIMM DIMM devices
   - Prevent integer overflow in ramdax_get_config_data()
   - Replace use of system_wq with system_percpu_wq
   - Documentation: btt: Unwrap bit 31-30 nested table
   - tools/testing/nvdimm: Use per-DIMM device handle"

* tag 'libnvdimm-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  nvdimm: Prevent integer overflow in ramdax_get_config_data()
  Documentation: btt: Unwrap bit 31-30 nested table
  nvdimm: replace use of system_wq with system_percpu_wq
  tools/testing/nvdimm: Use per-DIMM device handle
  nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices
2025-12-06 09:32:25 -08:00
Linus Torvalds
a7405aa92f dma-mapping updates for Linux 6.19:
- next part of DMA mapping API refactoring to physical addresses as the primary
 interface instead of page+offset parameters; this time dma_map_ops callbacks
 are converted to physical addresses, which in turn also results in some
 simplification of architecture specific code (Leon Romanovsky and Jason
 Gunthorpe)
 - clarify that dma_map_benchmark is not a kernel self-test, but standalone
 tool (Qinxin Xia)
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQSrngzkoBtlA8uaaJ+Jp1EFxbsSRAUCaTLpeQAKCRCJp1EFxbsS
 RKlAAQCo/gVslheoJ+h4Hk5oUjLfVQaiamJOlzxw12EVHVAs3AEA7GILIAL1GbGn
 EpkgwjIuz/J/4aGhNbYO6J+C1qMAHww=
 =aM6P
 -----END PGP SIGNATURE-----

Merge tag 'dma-mapping-6.19-2025-12-05' of git://git.kernel.org/pub/scm/linux/kernel/git/mszyprowski/linux

Pull dma-mapping updates from Marek Szyprowski:

 - More DMA mapping API refactoring to physical addresses as the primary
   interface instead of page+offset parameters.

   This time dma_map_ops callbacks are converted to physical addresses,
   which in turn also results in some simplification of architecture
   specific code (Leon Romanovsky and Jason Gunthorpe)

 - Clarify that dma_map_benchmark is not a kernel self-test, but
   standalone tool (Qinxin Xia)

* tag 'dma-mapping-6.19-2025-12-05' of git://git.kernel.org/pub/scm/linux/kernel/git/mszyprowski/linux:
  dma-mapping: remove unused map_page callback
  xen: swiotlb: Convert mapping routine to rely on physical address
  x86: Use physical address for DMA mapping
  sparc: Use physical address DMA mapping
  powerpc: Convert to physical address DMA mapping
  parisc: Convert DMA map_page to map_phys interface
  MIPS/jazzdma: Provide physical address directly
  alpha: Convert mapping routine to rely on physical address
  dma-mapping: remove unused mapping resource callbacks
  xen: swiotlb: Switch to physical address mapping callbacks
  ARM: dma-mapping: Switch to physical address mapping callbacks
  ARM: dma-mapping: Reduce struct page exposure in arch_sync_dma*()
  dma-mapping: convert dummy ops to physical address mapping
  dma-mapping: prepare dma_map_ops to conversion to physical address
  tools/dma: move dma_map_benchmark from selftests to tools/dma
2025-12-06 09:25:05 -08:00
Linus Torvalds
f468cf53c5 bitmap updates for v6.19
- Runtime field_{get,prep}() (Geert);
  - Rust ID pool updates (Alice);
  - min_t() simplification (David);
  - __sw_hweightN kernel-doc fixes (Andy);
  - cpumask.h headers cleanup (Andy).
 -----BEGIN PGP SIGNATURE-----
 
 iQGzBAABCgAdFiEEi8GdvG6xMhdgpu/4sUSA/TofvsgFAmkxsKkACgkQsUSA/Tof
 vshxxgv+Ly1WkW65Sr3KmzY0lCFBg+oH+1uc9Y6avc3gciY1nEwHEP0mqjOVuGRd
 HRkxhBKQlZe+GEp09IeCzONhhcAe9VnftD4isIrLlqjlcavs9gWaQRU38lCvfj79
 HPVOOe3zy1TlBFqLfcc+cZWDBG9BMGCZycI1+dZMYzGZ3SUwpdGjNIfFNOC0x0Jg
 7u+nVqduzH155kBSaPUH2FhhC9SjmgW429EBpksKs0POcOiijdLesezksDP+5bfr
 9YyAuP1MZ+bWpMS5S0h/Mw9M/X9eB0ZhY0ahkHV8XFhv/8Wo/gYO98yBb5v8bxa9
 9F3D8FFMfYDmMzmFXlUVH7mNbe3fAtbQq/XQKzjGbe2jZM+3A3YNfCXpBASLsZLt
 p3G31cZRRtuDz4hlEiJeQuF0VB3sN7ycfT53dLIyjl9IMLBk4ArhXSPasN7wHa3Y
 VO5UYCQAOBAu9Kou+ThHDPJz0aBI9GtfwvqJTzgvXa0elZ+Iid6DfeqOSzmHyUOd
 A0qHDI/O
 =EM4O
 -----END PGP SIGNATURE-----

Merge tag 'bitmap-for-6.19' of github.com:/norov/linux

Pull bitmap updates from Yury Norov:

 - Runtime field_{get,prep}() (Geert)

 - Rust ID pool updates (Alice)

 - min_t() simplification (David)

 - __sw_hweightN kernel-doc fixes (Andy)

 - cpumask.h headers cleanup (Andy)

* tag 'bitmap-for-6.19' of github.com:/norov/linux: (32 commits)
  rust_binder: use bitmap for allocation of handles
  rust: id_pool: do not immediately acquire new ids
  rust: id_pool: do not supply starting capacity
  rust: id_pool: rename IdPool::new() to with_capacity()
  rust: bitmap: add BitmapVec::new_inline()
  rust: bitmap: add MAX_LEN and MAX_INLINE_LEN constants
  cpumask: Don't use "proxy" headers
  soc: renesas: Use bitfield helpers
  clk: renesas: Use bitfield helpers
  ALSA: usb-audio: Convert to common field_{get,prep}() helpers
  soc: renesas: rz-sysc: Convert to common field_get() helper
  pinctrl: ma35: Convert to common field_{get,prep}() helpers
  iio: mlx90614: Convert to common field_{get,prep}() helpers
  iio: dac: Convert to common field_prep() helper
  gpio: aspeed: Convert to common field_{get,prep}() helpers
  EDAC/ie31200: Convert to common field_get() helper
  crypto: qat - convert to common field_get() helper
  clk: at91: Convert to common field_{get,prep}() helpers
  bitfield: Add non-constant field_{prep,get}() helpers
  bitfield: Add less-checking __FIELD_{GET,PREP}()
  ...
2025-12-06 09:01:27 -08:00
Miguel Ojeda
309e49039f rust: sync: atomic: separate import "blocks"
Commit 14e9a18b07ec ("rust: sync: atomic: Make Atomic*Ops pub(crate)")
added a `pub(crate)` import in the same "block" as the `pub` one,
without running `rustfmt`, which would sort them differently.

Instead of running `rustfmt` as-is, add a newline to keep the import
"blocks" with different visibilities separate.

Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2025-12-06 08:44:10 -08:00
Linus Torvalds
c84d574698 Modules changes for v6.19-rc1
Rust module parameter support:
 
 - Add Rust module parameter support, enabling Rust kernel modules to declare
   and use module parameters. The rust_minimal sample module demonstrates this,
   and the rust null block driver will be the first to use it in the next cycle.
   This also adds the Rust module files under the modules subsystem as agreed
   between the Rust and modules maintainers.
 
 Hardening:
 
 - Add compile-time check for embedded NUL characters in MODULE_*() macros. This
   module metadata was once used (and maybe still) to bypass license enforcement
   (LWN article [1] from 2003). This change required a sparse fix [2] which you
   reviewed.
 
 MAINTAINERS:
 
 - Add Aaron Tomlin as reviewer for the Modules subsystem.
 
 The changes have been in linux-next for 4 weeks. Recent 0day reports for UM [3]
 and arm64 [4] builds were not reproducible and traced to a buggy bindgen version
 combined with unreleased clang-22 in 0day. The Rust team has reported this to
 0day.
 
 As discussed previously, we rotate module maintainership among co-maintainers
 every 6 months. Sami Tolvanen is next in line and will send the next pull
 request. As a reminder, Luis has already announced [5] he will gradually step
 away as maintainer.
 
 Link: https://lwn.net/Articles/82305/ [1]
 Link: https://lore.kernel.org/linux-sparse/CACePvbVG2KrGQq4cNKV=wbO5h=jp3M0RO1SdfX8kV4OukjPG8A@mail.gmail.com/T/#mf838b3e2e3245d88c30a801ea7473d5a5c0eb121 [2]
 Link: https://lore.kernel.org/oe-kbuild-all/202511210858.uwVivgvn-lkp@intel.com/ [3]
 Link: https://lore.kernel.org/oe-kbuild-all/202512020454.Tf36WHw5-lkp@intel.com/ [4]
 Link: https://lore.kernel.org/linux-modules/aGiAF8IQ4PRYn0th@bombadil.infradead.org/ [5]
 Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEE73Ua4R8Pc+G5xjxTQJ6jxB8ZUfsFAmkwygIACgkQQJ6jxB8Z
 UfvZ2Q//YAkK9V1Hk8imngjOxmiT1BGzo0feKSOgDHc0K3G3VkutmYMKTPymLS8Q
 6EbvpbBRke990lOB7PloEL5ih27i9jmdL0QKpgU+uijRy5RssYEOoDMEz9JuKnqX
 L8BzR61YzRoEIZBgZWij1Di+ITTu+qHn5VxnJUCqydDS4uqqcgO/9xibmN1JtToO
 HpI63Y3R0VSMnJYfyVYJuKVCVWBhJzOzgIC8ZJCDUSceZlOAAjTsMyeUPS5m8j03
 28o78aH3XTLRpL46vKBt4hpmeNrqE47tj6meMybVEew9SmEF78B9wbaQD3oR8Jod
 BiFAhCNkwQao6aQAaKHAUZyWl+Udqsk8kJEgSeo/Sn5p1A6c2pGbddg++2W5jk75
 gjYQEwdv+VZuym1YraM8E+mKIU/9+X1NXpwuusC5Vli7xz8DYf3w6llZNOgRQqTr
 E1fXRQv2X5rUz3o6gzHgDF14XUEH0GM/3kYdUFSO9mqAQJCsMIRv0xIzRddsAcXH
 ylqNX+o7cO+wuwcIvBIkhlYwS/MnAP/iDGFp8NTGGZsDrorCkNu5mFlO+xzGVLWd
 gizeWnzgKrCNTzlR9oUzsGuPjTaQMBkNMTwlE+7InlLFH2CUc3vyKrkANzcL/vGn
 jHBdg/pNsboAfbERgNG42d8YqrrCuLvYVrI6TRw9RhPPBFt8coE=
 =ujQB
 -----END PGP SIGNATURE-----

Merge tag 'modules-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/modules/linux

Pull module updates from Daniel Gomez:
 "Rust module parameter support:

   - Add Rust module parameter support, enabling Rust kernel modules to
     declare and use module parameters. The rust_minimal sample module
     demonstrates this, and the rust null block driver will be the first
     to use it in the next cycle. This also adds the Rust module files
     under the modules subsystem as agreed between the Rust and modules
     maintainers.

  Hardening:

   - Add compile-time check for embedded NUL characters in MODULE_*()
     macros. This module metadata was once used (and maybe still) to
     bypass license enforcement (LWN article from 2003):

	https://lwn.net/Articles/82305/ [1]

  MAINTAINERS:

   - Add Aaron Tomlin as reviewer for the Modules subsystem"

* tag 'modules-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/modules/linux:
  MAINTAINERS: Add myself as reviewer for module support
  module: Add compile-time check for embedded NUL characters
  media: radio: si470x: Fix DRIVER_AUTHOR macro definition
  media: dvb-usb-v2: lmedm04: Fix firmware macro definitions
  modules: add rust modules files to MAINTAINERS
  rust: samples: add a module parameter to the rust_minimal sample
  rust: module: update the module macro with module parameter support
  rust: module: use a reference in macros::module::module
  rust: introduce module_param module
  rust: str: add radix prefixed integer parsing functions
  rust: sync: add `SetOnce`
2025-12-06 08:27:07 -08:00
John Stultz
c2ae8b0df2 sched/core: Fix psi_dequeue() for Proxy Execution
Currently, if the sleep flag is set, psi_dequeue() doesn't
change any of the psi_flags.

This is because psi_task_switch() will clear TSK_ONCPU as well
as other potential flags (TSK_RUNNING), and the assumption is
that a voluntary sleep always consists of a task being dequeued
followed shortly thereafter by a psi_sched_switch() call.

Proxy Execution changes this expectation, as mutex-blocked tasks
that would normally sleep stay on the runqueue. But in the case
where the mutex-owning task goes to sleep, or the owner is on a
remote cpu, we will then deactivate the blocked task shortly
after.

In that situation, the mutex-blocked task will have had its
TSK_ONCPU cleared when it was switched off the cpu, but it will
stay TSK_RUNNING. Then if we later dequeue it (as currently done
if we hit a case find_proxy_task() can't yet handle, such as the
case of the owner being on another rq or a sleeping owner)
psi_dequeue() won't change any state (leaving it TSK_RUNNING),
as it incorrectly expects a psi_task_switch() call to
immediately follow.

Later on when the task gets woken/re-enqueued, and psi_flags are
set for TSK_RUNNING, we hit an error as the task is already
TSK_RUNNING:

  psi: inconsistent task state! task=188:kworker/28:0 cpu=28 psi_flags=4 clear=0 set=4

To resolve this, extend the logic in psi_dequeue() so that
if the sleep flag is set, we also check if psi_flags have
TSK_ONCPU set (meaning the psi_task_switch is imminent) before
we do the shortcut return.

If TSK_ONCPU is not set, that means we've already switched away,
and this psi_dequeue call needs to clear the flags.

Fixes: be41bde4c3a8 ("sched: Add an initial sketch of the find_proxy_task() function")
Reported-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Haiyue Wang <haiyuewa@163.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://patch.msgid.link/20251205012721.756394-1-jstultz@google.com
Closes: https://lore.kernel.org/lkml/20251117185550.365156-1-kprateek.nayak@amd.com/
2025-12-06 10:13:16 +01:00
xupengbo
ca125231dd sched/fair: Fix unfairness caused by stalled tg_load_avg_contrib when the last task migrates out
When a task is migrated out, there is a probability that the tg->load_avg
value will become abnormal. The reason is as follows:

1. Due to the 1ms update period limitation in update_tg_load_avg(), there
   is a possibility that the reduced load_avg is not updated to tg->load_avg
   when a task migrates out.

2. Even though __update_blocked_fair() traverses the leaf_cfs_rq_list and
   calls update_tg_load_avg() for cfs_rqs that are not fully decayed, the key
   function cfs_rq_is_decayed() does not check whether
   cfs_rq->tg_load_avg_contrib is zero. Consequently, in some cases,
   __update_blocked_fair() removes cfs_rqs whose avg.load_avg has not been
   updated to tg->load_avg.

Add a check of cfs_rq->tg_load_avg_contrib in cfs_rq_is_decayed(),
which fixes the case (2.) mentioned above.

Fixes: 1528c661c24b ("sched/fair: Ratelimit update to tg->load_avg")
Signed-off-by: xupengbo <xupengbo@oppo.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Aaron Lu <ziqianlu@bytedance.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: Aaron Lu <ziqianlu@bytedance.com>
Link: https://patch.msgid.link/20250827022208.14487-1-xupengbo@oppo.com
2025-12-06 10:03:13 +01:00
Sebastian Andrzej Siewior
22abd83277 sched/rt: Remove a preempt-disable section in rt_mutex_setprio()
rt_mutex_setprio() has only one caller: rt_mutex_adjust_prio(). It
expects that task_struct::pi_lock and rt_mutex_base::wait_lock are held.
Both locks are raw_spinlock_t and are acquired with disabled interrupts.

Nevertheless rt_mutex_setprio() disables preemption while invoking
__balance_callbacks() and raw_spin_rq_unlock(). Even if one of the
balance callbacks unlocks the rq then it must not enable interrupts
because rt_mutex_base::wait_lock is still locked.
Therefore interrupts should remain disabled and disabling preemption is
not needed.

Commit 4c9a4bc89a9cc ("sched: Allow balance callbacks for check_class_changed()")
adds a preempt-disable section to rt_mutex_setprio() and
__sched_setscheduler(). In __sched_setscheduler() the preemption is
disabled before rq is unlocked and interrupts enabled but I don't see
why it makes a difference in rt_mutex_setprio().

Remove the preempt_disable() section from rt_mutex_setprio().

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251127155529.t_sTatE4@linutronix.de
2025-12-06 10:03:13 +01:00
Peter Zijlstra
e38e529974 sched/hrtick: Fix hrtick() vs. scheduling context
The sched_class::task_tick() method is called on the donor
sched_class, and sched_tick() hands it rq->donor as argument,
which is consistent.

However, while hrtick() uses the donor sched_class, it then passes
rq->curr, which is inconsistent. Fix it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: John Stultz <jstultz@google.com>
Link: https://patch.msgid.link/20250918080205.442967033@infradead.org
2025-12-06 10:03:13 +01:00
Ingo Molnar
dde3763365 sched/headers: Remove whitespace noise from kernel/sched/sched.h
A single case of space-Tab noise snuck in recently.

Fixes: 36569780b0d6 ("sched: Change nr_uninterruptible type to unsigned long")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/176478595428.498.13816176784792752599.tip-bot2@tip-bot2
2025-12-06 10:03:13 +01:00
Peter Zijlstra
90dfeef1cd seqlock: Cure some more scoped_seqlock() optimization fails
Arnd reported an x86 randconfig using gcc-15 tripped over
__scoped_seqlock_bug(). Turns out GCC chose not to inline the
scoped_seqlock helper functions and as such was not able to optimize
properly.

[ mingo: Clang fails the build too in some circumstances. ]

Reported-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: https://patch.msgid.link/20251204104332.GG2528459@noisy.programming.kicks-ass.net
2025-12-06 09:53:05 +01:00
Maurice Hieronymus
c5108c58b9 tracing: Fix typo in trace_seq.c
Fix typo "wont" to "won't".

Link: https://patch.msgid.link/20251121221835.28032-15-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:41 -05:00
Maurice Hieronymus
0f17df72a7 tracing: Fix typo in trace_probe.c
Fix typo "separater" to "separator".

Link: https://patch.msgid.link/20251121221835.28032-14-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:41 -05:00
Maurice Hieronymus
fa3f733d97 tracing: Fix multiple typos in trace_osnoise.c
Fix multiple typos in comments:
"Anotate" -> "Annotate"
"infor" -> "info"
"timestemp" -> "timestamp"
"tread" -> "thread"
"varaibles" -> "variables"
"wast" -> "waste"

Link: https://patch.msgid.link/20251121221835.28032-13-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:41 -05:00
Maurice Hieronymus
6ce5725d73 tracing: Fix multiple typos in trace_events_user.c
Fix multiple typos in comments:
"ambigious" -> "ambiguous"
"explictly" -> "explicitly"
"Uknown" -> "Unknown"

Link: https://patch.msgid.link/20251121221835.28032-12-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:41 -05:00
Maurice Hieronymus
0166d3e31a tracing: Fix typo in trace_events_trigger.c
Fix typo "componenents" to "components".

Link: https://patch.msgid.link/20251121221835.28032-11-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:41 -05:00
Maurice Hieronymus
c29e75532e tracing: Fix typo in trace_events_hist.c
Fix typo "tigger" to "trigger".

Link: https://patch.msgid.link/20251121221835.28032-10-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
86f320904e tracing: Fix typo in trace_events_filter.c
Fix typo "singe" to "single".

Link: https://patch.msgid.link/20251121221835.28032-9-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
d4290963d5 tracing: Fix multiple typos in trace_events.c
Fix multiple typos in comments:
"appened" -> "appended"
"paranthesis" -> "parenthesis"
"parethesis" -> "parenthesis"
"wont" -> "won't"

Link: https://patch.msgid.link/20251121221835.28032-8-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
8d4cdbd45c tracing: Fix multiple typos in trace.c
Fix multiple typos in comments:
"alse" -> "also"
"enabed" -> "enabled"
"instane" -> "instance"
"outputing" -> "outputting"
"seperated" -> "separated"

Link: https://patch.msgid.link/20251121221835.28032-7-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
81354f6335 tracing: Fix typo in ring_buffer_benchmark.c
Fix typo "overwite" to "overwrite".

Link: https://patch.msgid.link/20251121221835.28032-6-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
1edb820ae9 tracing: Fix multiple typos in ring_buffer.c
Fix multiple typos in comments:
"ording" -> "ordering"
"scatch" -> "scratch"
"wont" -> "won't"

Link: https://patch.msgid.link/20251121221835.28032-5-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:40 -05:00
Maurice Hieronymus
2ec7345c2d tracing: Fix typo in fprobe.c
Fix typo "funciton" to "function".

Link: https://patch.msgid.link/20251121221835.28032-4-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:39 -05:00
Maurice Hieronymus
9c3f3b8fea tracing: Fix typo in fpgraph.c
Fix typo "reservered" to "reserved".

Link: https://patch.msgid.link/20251121221835.28032-3-mhi@mailbox.org
Signed-off-by: Maurice Hieronymus <mhi@mailbox.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:43:39 -05:00
Steven Rostedt
47ef834209 tracing: Fix fixed array of synthetic event
The commit 4d38328eb442d ("tracing: Fix synth event printk format for str
fields") replaced "%.*s" with "%s" but missed removing the number size of
the dynamic and static strings. The commit e1a453a57bc7 ("tracing: Do not
add length to print format in synthetic events") fixed the dynamic part
but did not fix the static part. That is, with the commands:

  # echo 's:wake_lat char[] wakee; u64 delta;' >> /sys/kernel/tracing/dynamic_events
  # echo 'hist:keys=pid:ts=common_timestamp.usecs if !(common_flags & 0x18)' > /sys/kernel/tracing/events/sched/sched_waking/trigger
  # echo 'hist:keys=next_pid:delta=common_timestamp.usecs-$ts:onmatch(sched.sched_waking).trace(wake_lat,next_comm,$delta)' > /sys/kernel/tracing/events/sched/sched_switch/trigger

That caused the output of:

          <idle>-0       [001] d..5.   193.428167: wake_lat: wakee=(efault)sshd-sessiondelta=155
    sshd-session-879     [001] d..5.   193.811080: wake_lat: wakee=(efault)kworker/u34:5delta=58
          <idle>-0       [002] d..5.   193.811198: wake_lat: wakee=(efault)bashdelta=91

The commit e1a453a57bc7 fixed the part where the synthetic event had
"char[] wakee". But if one were to replace that with a static size string:

  # echo 's:wake_lat char[16] wakee; u64 delta;' >> /sys/kernel/tracing/dynamic_events

Where "wakee" is defined as "char[16]" and not "char[]" making it a static
size, the code triggered the "(efault)" again.

Remove the added STR_VAR_LEN_MAX size as the string is still going to be
nul terminated.

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Douglas Raillard <douglas.raillard@arm.com>
Link: https://patch.msgid.link/20251204151935.5fa30355@gandalf.local.home
Fixes: e1a453a57bc7 ("tracing: Do not add length to print format in synthetic events")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:38:10 -05:00
Steven Rostedt
02e7769e38 tracing: Fix enabling of tracing on file release
The trace file will pause tracing if the tracing instance has the
"pause-on-trace" option set. This happens when the file is opened, and
it is unpaused when the file is closed. When this was first added, there
was only one user that paused tracing. On open, the check to pause was:

   if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE)))

Where if it is not the snapshot tracer and the "pause-on-trace" option is
set, then it increments a "stop_count" of the trace instance.

On close, the check is:

   if (!iter->snapshot && tr->stop_count)

That is, if it is not the snapshot buffer and it was stopped, it will
re-enable tracing.

Now there are more places that stop tracing. This means, if something else
stops tracing the tr->stop_count will be non-zero, and that means if the
trace file is closed, it will decrement the stop_count even though it
never incremented it. This causes a warning because when the user that
stopped tracing enables it again, the stop_count goes below zero.

Instead of relying on the stop_count being set to know if the close of
the trace file should enable tracing again, add a new flag to the trace
iterator. The trace iterator is unique per open of the trace file, and if
the open stops tracing set the trace iterator PAUSE flag. On close, if the
PAUSE flag is set, then re-enable it again.

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251202161751.24abaaf1@gandalf.local.home
Fixes: 06e0a548bad0f ("tracing: Do not disable tracing when reading the trace file")
Reported-by: syzbot+ccdec3bfe0beec58a38d@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/692f44a5.a70a0220.2ea503.00c8.GAE@google.com/
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
2025-12-05 15:17:56 -05:00
David Laight
150215b89b drivers/xen: use min() instead of min_t()
min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'.
Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long'
and so cannot discard significant bits.

In this case the 'unsigned long' value is small enough that the result
is ok.

Detected by an extra check added to min_t().

Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Message-ID: <20251119224140.8616-30-david.laight.linux@gmail.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
2025-12-05 08:46:07 +01:00
Dan Williams
7dfbe9a675 crypto/ccp: Fix CONFIG_PCI=n build
It turns out that the PCI driver for ccp is unconditionally built into the
kernel in the CONFIG_PCI=y case. This means that the new SEV-TIO support
needs an explicit dependency on PCI to avoid build errors when
CONFIG_CRYPTO_DEV_SP_PSP=y and CONFIG_PCI=n.

Reported-by: kernel test robot <lkp@intel.com>
Closes: http://lore.kernel.org/202512030743.6pVPA4sx-lkp@intel.com
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: John Allen <john.allen@amd.com>
Acked-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251203031948.2471431-1-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-04 18:14:08 -08:00
Nathan Chancellor
311607017e virt: Fix Kconfig warning when selecting TSM without VIRT_DRIVERS
After commit 3225f52cde56 ("PCI/TSM: Establish Secure Sessions and Link
Encryption"), there is a Kconfig warning when selecting CONFIG_TSM
without CONFIG_VIRT_DRIVERS:

  WARNING: unmet direct dependencies detected for TSM
    Depends on [n]: VIRT_DRIVERS [=n]
    Selected by [y]:
    - PCI_TSM [=y] && PCI [=y]

CONFIG_TSM is defined in drivers/virt/coco/Kconfig but this Kconfig is
only sourced when CONFIG_VIRT_DRIVERS is enabled. Since this symbol is
hidden with no dependencies, it should be available without a symbol
that just enables a menu.

Move the sourcing of drivers/virt/coco/Kconfig outside of
CONFIG_VIRT_DRIVERS and wrap the other source statements in
drivers/virt/coco/Kconfig with CONFIG_VIRT_DRIVERS to ensure users do
not get any additional prompts while ensuring CONFIG_TSM is always
available to select. This complements commit 110c155e8a68 ("drivers/virt:
Drop VIRT_DRIVERS build dependency"), which addressed the build issue
that this Kconfig warning was pointing out.

Fixes: 3225f52cde56 ("PCI/TSM: Establish Secure Sessions and Link Encryption")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511140712.NubhamPy-lkp@intel.com/
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Link: https://patch.msgid.link/20251203-fix-pci-tsm-select-tsm-warning-v1-1-c3959c1cb110@kernel.org
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-04 17:34:16 -08:00
Josh Poimboeuf
2d3451ef1e objtool: Simplify .annotate_insn code generation output some more
Remove the superfluous section name quotes, and combine the longs into a
single command.

Before:

  911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 2; .popsection

After:

  911: .pushsection .discard.annotate_insn, "M", @progbits, 8; .long 911b - ., 2; .popsection

No change in functionality.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/hpsfcihgqmhcdrg7pop7z73ptymakgjq7qlxrawrjxilosk43l@xikqif3ievj4
2025-12-03 19:45:29 +01:00
Josh Poimboeuf
799647ddb4 objtool: Add more robust signal error handling, detect and warn about stack overflows
When the kernel build fails due to an objtool segfault, the error
message is a bit obtuse and confusing:

  make[5]: *** [scripts/Makefile.build:503: drivers/scsi/qla2xxx/qla2xxx.o] Error 139
                                                                            ^^^^^^^^^
  make[5]: *** Deleting file 'drivers/scsi/qla2xxx/qla2xxx.o'
  make[4]: *** [scripts/Makefile.build:556: drivers/scsi/qla2xxx] Error 2
  make[3]: *** [scripts/Makefile.build:556: drivers/scsi] Error 2
  make[2]: *** [scripts/Makefile.build:556: drivers] Error 2
  make[1]: *** [/home/jpoimboe/git/linux/Makefile:2013: .] Error 2
  make: *** [Makefile:248: __sub-make] Error 2

Add a signal handler to objtool which prints an error message like this if
the local stack has overflowed (for which there's a chance as objtool
makes heavy use of recursion):

  drivers/scsi/qla2xxx/qla2xxx.o: error: SIGSEGV: objtool stack overflow!

or:

  drivers/scsi/qla2xxx/qla2xxx.o: error: SIGSEGV: objtool crash!

Also, re-raise the signal so the core dump still gets triggered.

[ mingo: Applied a build fix, added more comments and prettified the code. ]

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Alexandre Chartre <alexandre.chartre@oracle.com>
Cc: David Laight <david.laight.linux@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://patch.msgid.link/mi4tihk4dbncn7belrhp6ooudhpw4vdggerktu5333w3gqf3uf@vqlhc3y667mg
2025-12-03 19:42:37 +01:00
Josh Poimboeuf
ed3bf863dc objtool: Remove newlines and tabs from annotation macros
Remove newlines and tabs from the annotation macros so the invoking code
can insert them as needed to match the style of the surrounding code.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/66305834c2eb78f082217611b756231ae9c0b555.1764694625.git.jpoimboe@kernel.org
2025-12-03 19:42:37 +01:00
Josh Poimboeuf
305c8dc477 objtool: Consolidate annotation macros
Consolidate __ASM_ANNOTATE into a single macro which is used by both C
and asm.  This also makes the code generation a bit more palatable by
putting it all on a single line.

Turn this:

	911:
	       .pushsection .discard.annotate_insn,"M", @progbits, 8
	       .long 911b - .
	       .long 1
	       .popsection
	       jmp __x86_return_thunk

Into:

	911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 1; .popsection
	jmp __x86_return_thunk

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/c05ff40d3383e85c3b59018ef0b3c7aaf993a60d.1764694625.git.jpoimboe@kernel.org
2025-12-03 19:40:44 +01:00
Ingo Molnar
1c3377bee2 x86/boot/Documentation: Prefix hexadecimal literals with 0x
The x86 bootloader ID specification text uses hexadecimal
values without a 0x prefix:

        D  kexec-tools
        E  Extended (see ext_loader_type)
        F  Special (0xFF = undefined)
        10 Reserved
        11 Minimal Linux Bootloader
           <http://sebastian-plotz.blogspot.de>
        12 OVMF UEFI virtualization stack
        13 barebox

Which beyond the ambiguity of '13' in isolation, also
made me fail a grep -wi '0xd' when I was looking for
the kexec bootloader ID definition and caused quite
a bit of head-scratching before I found out why it
didn't show up.

Furthermore, the actual explanatory text uses the 0x
prefix:

  For boot loader IDs above T = 0xD, write T = 0xE to this field and
  write the extended ID minus 0x10 to the ext_loader_type field.
  Similarly, the ext_loader_ver field can be used to provide more than
  four bits for the bootloader version.

So make it all unambiguous, easy to grep and consistent
across the entire documentation by prefixing the IDs with 0x.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kernel@vger.kernel.org
2025-12-03 18:49:09 +01:00
Ingo Molnar
c7957da777 x86/boot/Documentation: Spell 'ID' consistently
The bootloader ID specification text uses 2 capitalization
variants for the same thing: 'id', 'ids', 'ID' and 'IDs'.

Use 'ID/IDs' consistently.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-kernel@vger.kernel.org
2025-12-03 18:49:00 +01:00
Josh Poimboeuf
f387d0e102 x86/asm: Remove ANNOTATE_DATA_SPECIAL usage
Instead of manually annotating each __ex_table entry, just make the
section mergeable and store the entry size in the ELF section header.

Either way works for objtool create_fake_symbols(), this way produces
cleaner code generation.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/b858cb7891c1ba0080e22a9c32595e6c302435e2.1764694625.git.jpoimboe@kernel.org
2025-12-03 16:53:19 +01:00
Josh Poimboeuf
a818f28f01 x86/alternative: Remove ANNOTATE_DATA_SPECIAL usage
Instead of manually annotating each .altinstructions entry, just make
the section mergeable and store the entry size in the ELF section
header.

Either way works for objtool create_fake_symbols(), this way produces
cleaner code generation.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://patch.msgid.link/5ac04e6db5be6453dce8003a771ebb0c47b4cd7a.1764694625.git.jpoimboe@kernel.org
2025-12-03 16:53:19 +01:00
Bagas Sanjaya
df8c841dd9 NFSD: nfsd-io-modes: Separate lists
Sphinx reports htmldocs indentation warnings:

Documentation/filesystems/nfs/nfsd-io-modes.rst:58: ERROR: Unexpected indentation. [docutils]
Documentation/filesystems/nfs/nfsd-io-modes.rst:59: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]

These caused the lists to be shown as long running paragraphs merged
with their previous paragraphs.

Fix these by separating the lists with a blank line.

Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-03 09:05:14 -05:00
Bagas Sanjaya
4fcf9952fb NFSD: nfsd-io-modes: Wrap shell snippets in literal code blocks
Sphinx reports htmldocs indentation warnings:

Documentation/filesystems/nfs/nfsd-io-modes.rst:29: ERROR: Unexpected indentation. [docutils]
Documentation/filesystems/nfs/nfsd-io-modes.rst:34: ERROR: Unexpected indentation. [docutils]

Fix these by wrapping shell snippets in literal code blocks.

Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-03 09:05:14 -05:00
Bagas Sanjaya
21478b6eca NFSD: Add toctree entry for NFSD IO modes docs
Commit fa8d4e6784d1b6 ("NFSD: add
Documentation/filesystems/nfs/nfsd-io-modes.rst") adds documentation for
NFSD I/O modes, but it forgets to add a toctree entry for it. Hence,
Sphinx reports:

Documentation/filesystems/nfs/nfsd-io-modes.rst: WARNING: document isn't included in any toctree [toc.not_included]

Add the entry.

Fixes: fa8d4e6784d1b6 ("NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251202152506.7a2d2d41@canb.auug.org.au/
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-03 09:05:14 -05:00
Len Brown
9c0bad7508 tools/power turbostat: version 2025.12.02
Since release 2025.09.09:

Add LLC statistics columns:
    LLCkRPS = Last Level Cache Thousands of References Per Second
    LLC%hit = Last Level Cache Hit %
Recognize Wildcat Lake and Nova Lake platforms
Add MSR check for Android
Add APERF check for VMWARE
Add RAPL check for AWS
minor fixes

This patch:

White-space only, resulting from running Lindent
on everything except the tab-justified data-tables,
and using -l150 instead of -l80 to allow long lines.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
1a23ba6a1b tools/power turbostat: Print wide names only for RAW 64-bit columns
Print a wide column header only for the case of a 64-bit RAW counter.

It turns out that wide column headers otherwise are more harm than good.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
2ba8b24e9d tools/power turbostat: Print percentages in 8-columns
Added counters that are FORMAT_PERCENT
do not need to be 64-bits -- 32 is plenty.
This allows the output code to fit them,
and their header, into 8-columns.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
8808292799 tools/power turbostat: Print "nan" for out of range percentages
Sometimes counters return junk.
For the cases where values > 100% are invalid, print "nan".

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
951845d51d tools/power turbostat: Validate APERF access for VMWARE
VMWARE correctly enumerates lack of APERF and MPERF in CPUID,
but turbostat didn't consult that before attempting to access them.

Since VMWARE allows access, but always returns 0, turbostat
got confused into an infinite reset loop.

Head this off by listening to CPUID.6.APERF_MPERF
(and rename the existing variable to make this more clear)

Reported-by: David Arcari <darcari@redhat.com>
Tested-by: David Arcari <darcari@redhat.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
68769a0b5a tools/power turbostat: Enhance perf probe
check_perf_access() will now check both IPC and LLC perf counters
if they are enabled.  If any fail, it now disables perf
and all perf counters.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Len Brown
19476a592b tools/power turbostat: Validate RAPL MSRs for AWS Nitro Hypervisor
Even though the platform->plat_rapl_msrs enumeration may be accurate,
a VM, such as AWS Nitro Hypervisor, may deny access to the underlying MSRs.

Probe if PKG_ENERGY is readable and non-zero.
If no, ignore all RAPL MSRs.

Reported-by: Emily Ehlert <ehemily@amazon.de>
Tested-by: Emily Ehlert <ehemily@amazon.de>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:14 -05:00
Malaya Kumar Rout
51860d6330 tools/power x86_energy_perf_policy: Fix potential NULL pointer dereference
In err_on_hypervisor(), strstr() is called to search for "flags" in the
buffer, but the return value is not checked before being used in pointer
arithmetic (flags - buffer). If strstr() returns NULL because "flags" is
not found in /proc/cpuinfo, this will cause undefined behavior and likely
a crash.

Add a NULL check after the strstr() call and handle the error appropriately
by cleaning up resources and reporting a meaningful error message.

Signed-off-by: Malaya Kumar Rout <mrout@redhat.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 16:11:09 -05:00
Malaya Kumar Rout
7446bd6119 tools/power x86_energy_perf_policy: Fix format string in error message
The error message in validate_cpu_selected_set() uses an incomplete
format specifier "cpu%" instead of "cpu%d", resulting in the error
message printing "Requested cpu% is not present" rather than
showing the actual CPU number.

Fix the format string to properly display the CPU number.

Signed-off-by: Malaya Kumar Rout <mrout@redhat.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:30 -05:00
Len Brown
90a2fe2576 tools/power x86_energy_perf_policy: Simplify Android MSR probe
no functional change

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:30 -05:00
Kaushlendra Kumar
2ff4b59f2e tools/power x86_energy_perf_policy: Add Android MSR device support
Add support for Android MSR device paths which use /dev/msrN format
instead of the standard Linux /dev/cpu/N/msr format. The tool now
probes both path formats at startup and uses the appropriate one.

This enables x86_energy_perf_policy to work on Android systems where
MSR devices follow a different naming convention while maintaining
full compatibility with standard Linux systems.

Signed-off-by: Kaushlendra Kumar <kaushlendra.kumar@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:30 -05:00
Len Brown
d71cb404f0 tools/power turbostat: Add run-time MSR driver probe
Rather than starting down the conditional-compile road...

Probe the location of the MSR files at run-time.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:30 -05:00
Emily Ehlert
2313b97bc0 tools/power turbostat: Set per_cpu_msr_sum to NULL after free
Set per_cpu_msr_sum to NULL after freeing it in the error path
of msr_sum_record() to prevent potential use-after-free issues.

Signed-off-by: Emily Ehlert <ehemily@amazon.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:30 -05:00
Len Brown
28a3ad1fd2 tools/power turbostat: Add LLC stats
LLCkRPS = Last Level Cache Thousands of References Per Second
LLC%hit = Last Level Cache Hit %

These columns are enabled by-default.
They can be controlled with the --show/--hide options
by individual column names above,
or together using the "llc" or "cache" groups.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-12-02 15:58:23 -05:00
Alexey Kardashevskiy
4be423572d crypto/ccp: Implement SEV-TIO PCIe IDE (phase1)
Implement the SEV-TIO (Trusted I/O) firmware interface for PCIe TDISP
(TEE Device Interface Security Protocol). This enables secure communication
between trusted domains and PCIe devices through the PSP (Platform
Security Processor).

The implementation includes:
- Device Security Manager (DSM) operations for establishing secure links
- SPDM (Security Protocol and Data Model) over DOE (Data Object Exchange)
- IDE (Integrity and Data Encryption) stream management for secure PCIe

This module bridges the SEV firmware stack with the generic PCIe TSM
framework.

This is phase1 as described in Documentation/driver-api/pci/tsm.rst.

On AMD SEV, the AMD PSP firmware acts as TSM (manages the security/trust).
The CCP driver provides the interface to it and registers in the TSM
subsystem.

Detect the PSP support (reported via FEATURE_INFO + SNP_PLATFORM_STATUS)
and enable SEV-TIO in the SNP_INIT_EX call if the hardware supports TIO.

Implement SEV TIO PSP command wrappers in sev-dev-tio.c and store
the data in the SEV-TIO-specific structs.

Implement TSM hooks and IDE setup in sev-dev-tsm.c.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/692f506bb80c9_261c11004@dwillia2-mobl4.notmuch
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-02 12:50:33 -08:00
Alexey Kardashevskiy
eeb934137d iommu/amd: Report SEV-TIO support
The SEV-TIO switch in the AMD BIOS is reported to the OS via
the IOMMU Extended Feature 2 register (EFR2), bit 1.

Add helper to parse the bit and report the feature presence.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-4-aik@amd.com
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-02 12:06:45 -08:00
Alexey Kardashevskiy
c3859de858 psp-sev: Assign numbers to all status codes and add new
Make the definitions explicit. Add some more new codes.

The following patches will be using SPDM_REQUEST and
EXPAND_BUFFER_LENGTH_REQUEST, others are useful for the PSP FW
diagnostics.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-3-aik@amd.com
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-02 12:06:38 -08:00
Alexey Kardashevskiy
8a5dd102e4 ccp: Make snp_reclaim_pages and __sev_do_cmd_locked public
The snp_reclaim_pages() helper reclaims pages in the FW state. SEV-TIO
and the TMPM driver (a hardware engine which smashes IOMMU PDEs among
other things) will use it to reclaim memory when cleaning up.

Share and export snp_reclaim_pages().

Most of the SEV-TIO code uses sev_do_cmd() which locks the sev_cmd_mutex
and is already exported. But the SNP init code (which also sets up SEV-TIO)
executes under the sev_cmd_mutex lock so the SEV-TIO code has to use
the __sev_do_cmd_locked() helper. This one though does not need to be
exported/shared globally as SEV-TIO is a part of the CCP driver still.

Share __sev_do_cmd_locked() via the CCP internal header.

Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251202024449.542361-2-aik@amd.com
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-12-02 12:05:51 -08:00
Alice Ryhl
5ba71195a9 rust_binder: use bitmap for allocation of handles
To find an unused Binder handle, Rust Binder currently iterates the
red/black tree from the beginning until it finds a gap in the keys. This
is extremely slow.

To improve the performance, add a bitmap that keeps track of which
indices are actually in use. This allows us to quickly find an unused
key in the red/black tree.

For a benchmark, please see the below numbers that were obtained from
modifying binderThroughputTest to send a node with each transaction and
stashing it in the server. This results in the number of nodes
increasing by one for every transaction sent. I got the following table
of roundtrip latencies (in µs):

Transaction Range │ Baseline (Rust) │ Bitmap (Rust) │ Comparison (C)
0 - 10,000        │          176.88 │         92.93 │          99.41
10,000 - 20,000   │          437.37 │         87.74 │          98.55
20,000 - 30,000   │          677.49 │         76.24 │          96.37
30,000 - 40,000   │          901.76 │         83.39 │          96.73
40,000 - 50,000   │         1126.62 │        100.44 │          94.57
50,000 - 60,000   │         1288.98 │         94.38 │          96.64
60,000 - 70,000   │         1588.74 │         88.27 │          96.36
70,000 - 80,000   │         1812.97 │         93.97 │          91.24
80,000 - 90,000   │         2062.95 │         92.22 │         102.01
90,000 - 100,000  │         2330.03 │         97.18 │         100.31

It should be clear that the current Rust code becomes linearly slower
per insertion as the number of calls to rb_next() per transaction
increases. After this change, the time to find an ID number appears
constant. (Technically it is not constant-time as both insertion and
removal scan the entire bitmap. However, quick napkin math shows that
scanning the entire bitmap with N=100k takes ~1.5µs, which is negligible
in a benchmark where the roundtrip latency is 100µs.)

I've included a comparison to the C driver, which uses the same bitmap
algorithm as this patch since commit 15d9da3f818c ("binder: use bitmap
for faster descriptor lookup").

This currently checks if the bitmap should be shrunk after every
removal. One potential future change is introducing a shrinker to make
this operation O(1), but based on the benchmark above this does not seem
required at this time.

Reviewed-by: Burak Emir <bqe@google.com>
Reviewed-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Acked-by: Carlos Llamas <cmllamas@google.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-12-02 14:17:47 -05:00
Alice Ryhl
f523d110a6 rust: id_pool: do not immediately acquire new ids
When Rust Binder assigns a new ID, it performs various fallible
operations before it "commits" to actually using the new ID. To support
this pattern, change acquire_next_id() so that it does not immediately
call set_bit(), but instead returns an object that may be used to call
set_bit() later.

The UnusedId type holds an exclusive reference to the IdPool, so it's
guaranteed that nobody else can call find_unused_id() while the UnusedId
object is live.

[Miguel: rust: id_pool: fix example]

Reviewed-by: Burak Emir <bqe@google.com>
Reviewed-by: Danilo Krummrich <dakr@kernel.org>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-12-02 14:17:09 -05:00
Alice Ryhl
69ec6a1bed rust: id_pool: do not supply starting capacity
Rust Binder wants to use inline bitmaps whenever possible to avoid
allocations, so introduce a constructor for an IdPool with arbitrary
capacity that stores the bitmap inline.

The existing constructor could be renamed to with_capacity() to match
constructors for other similar types, but it is removed as there is
currently no user for it.

[Miguel: rust: id_pool: fix broken intra-doc link]

Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Reviewed-by: Burak Emir <bqe@google.com>
Reviewed-by: Danilo Krummrich <dakr@kernel.org>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-12-02 14:14:50 -05:00
Josh Poimboeuf
0c314a881c objtool: Fix stack overflow in validate_branch()
On an allmodconfig kernel compiled with Clang, objtool is segfaulting in
drivers/scsi/qla2xxx/qla2xxx.o due to a stack overflow in
validate_branch().

Due in part to KASAN being enabled, the qla2xxx code has a large number
of conditional jumps, causing objtool to go quite deep in its recursion.

By far the biggest offender of stack usage is the recently added
'prev_state' stack variable in validate_insn(), coming in at 328 bytes.

Move that variable (and its tracing usage) to handle_insn_ops() and make
handle_insn_ops() noinline to keep its stack frame outside the recursive
call chain.

Reported-by: Nathan Chancellor <nathan@kernel.org>
Fixes: fcb268b47a2f ("objtool: Trace instruction state changes during function validation")
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/21bb161c23ca0d8c942a960505c0d327ca2dc7dc.1764691895.git.jpoimboe@kernel.org
Closes: https://lore.kernel.org/20251201202329.GA3225984@ax162
2025-12-02 17:40:35 +01:00
Ingo Molnar
24bc5ea5c0 seqlock, procfs: Match scoped_seqlock_read() critical section vs. RCU ordering in do_task_stat() to do_io_accounting()
There are two patterns of taking the RCU read-lock and the
sig->stats_lock read-seqlock in do_task_stat() and
do_io_accounting(), with a different ordering:

	# do_io_accounting():

	guard(rcu)();
	scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {

	# do_task_stat():

	scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) {
	...
			rcu_read_lock();

The ordering is RCU-read+seqlock_read in the first
case, seqlock_read+RCU-read in the second case.

While technically these read locks can be taken in any order,
nevertheless it's good practice to use the more intrusive lock
on the inside (which is the IRQs-off section in this case),
and reduces head-scratching during review when done consistently,
so let's use the do_io_accounting() pattern in do_task_stat().

This will also reduce irqs-off latencies in do_task_stat() a tiny bit.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Link: https://patch.msgid.link/aS6rwnaPbHFCdHp1@gmail.com
2025-12-02 11:21:07 +01:00
Randy Dunlap
33b4c26d4d x86/platform: Fix and extend kernel-doc comments in <asm/x86_init.h>
Fix most (17) kernel-doc warnings in x86_init.h (except for struct
x86_init_ops). The changes are:

- fix struct member name typos
- add ending ':' to struct member names
- add some missing struct member descriptions

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20251129002524.1196500-1-rdunlap@infradead.org
2025-12-01 21:57:16 +01:00
Mike Snitzer
fa8d4e6784 NFSD: add Documentation/filesystems/nfs/nfsd-io-modes.rst
This document details the NFSD IO modes that are configurable using
NFSD's experimental debugfs interfaces:

  /sys/kernel/debug/nfsd/io_cache_read
  /sys/kernel/debug/nfsd/io_cache_write

This document will evolve as NFSD's interfaces do (e.g. if/when NFSD's
debugfs interfaces are replaced with per-export controls).

Future updates will provide more specific guidance and howto
information to help others use and evaluate NFSD's IO modes:
BUFFERED, DONTCACHE and DIRECT.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-01 09:57:10 -05:00
Mike Snitzer
06c5c97293 NFSD: Implement NFSD_IO_DIRECT for NFS WRITE
When NFSD_IO_DIRECT is selected via the
/sys/kernel/debug/nfsd/io_cache_write experimental tunable, split
incoming unaligned NFS WRITE requests into a prefix, middle and
suffix segment, as needed. The middle segment is now DIO-aligned and
the prefix and/or suffix are unaligned. Synchronous buffered IO is
used for the unaligned segments, and IOCB_DIRECT is used for the
middle DIO-aligned extent.

Although IOCB_DIRECT avoids the use of the page cache, by itself it
doesn't guarantee data durability. For UNSTABLE WRITE requests,
durability is obtained by a subsequent NFS COMMIT request.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-01 09:57:10 -05:00
Chuck Lever
e3e8e176ca NFSD: Make FILE_SYNC WRITEs comply with spec
Mike noted that when NFSD responds to an NFS_FILE_SYNC WRITE, it
does not also persist file time stamps. To wit, Section 18.32.3
of RFC 8881 mandates:

> The client specifies with the stable parameter the method of how
> the data is to be processed by the server. If stable is
> FILE_SYNC4, the server MUST commit the data written plus all file
> system metadata to stable storage before returning results. This
> corresponds to the NFSv2 protocol semantics. Any other behavior
> constitutes a protocol violation. If stable is DATA_SYNC4, then
> the server MUST commit all of the data to stable storage and
> enough of the metadata to retrieve the data before returning.

Commit 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()") replaced:

-		flags |= RWF_SYNC;

with:

+		kiocb.ki_flags |= IOCB_DSYNC;

which appears to be correct given:

	if (flags & RWF_SYNC)
		kiocb_flags |= IOCB_DSYNC;

in kiocb_set_rw_flags(). However the author of that commit did not
appreciate that the previous line in kiocb_set_rw_flags() results
in IOCB_SYNC also being set:

	kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);

RWF_SUPPORTED contains RWF_SYNC, and RWF_SYNC is the same bit as
IOCB_SYNC. Reviewers at the time did not catch the omission.

Reported-by: Mike Snitzer <snitzer@kernel.org>
Closes: https://lore.kernel.org/linux-nfs/20251018005431.3403-1-cel@kernel.org/T/#t
Fixes: 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()")
Cc: stable@vger.kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-12-01 09:57:10 -05:00
Alexandru Gagniuc
641092c1bc remoteproc: qcom_q6v5_wcss: use optional reset for wcss_q6_bcr_reset
The "wcss_q6_bcr_reset" is not used on IPQ8074 and IPQ6018. Use
devm_reset_control_get_optional_exclusive() for this reset so that
probe() does not fail on platforms where it is not used.

Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Link: https://lore.kernel.org/r/20251129013207.3981517-2-mr.nuke.me@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 15:20:23 -06:00
Alexandru Gagniuc
7e81fa8d80 remoteproc: qcom_q6v5_wcss: fix parsing of qcom,halt-regs
The "qcom,halt-regs" consists of a phandle reference followed by the
three offsets within syscon for halt registers. Thus, we need to
request 4 integers from of_property_read_variable_u32_array(), with
the halt_reg offsets at indexes 1, 2, and 3. Offset 0 is the phandle.

With MAX_HALT_REG at 3, of_property_read_variable_u32_array() returns
-EOVERFLOW, causing .probe() to fail.

Increase MAX_HALT_REG to 4, and update the indexes accordingly.

Fixes: 0af65b9b915e ("remoteproc: qcom: wcss: Add non pas wcss Q6 support for QCS404")
Signed-off-by: Alexandru Gagniuc <mr.nuke.me@gmail.com>
Link: https://lore.kernel.org/r/20251129013207.3981517-1-mr.nuke.me@gmail.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 15:19:45 -06:00
Dan Carpenter
cda5dc12eb remoteproc: qcom_wcnss: Fix NULL vs IS_ERR() bug in wcnss_alloc_memory_region()
The devm_ioremap_resource_wc() function never returns NULL, it returns
error pointers.  Update the checking to match.

Fixes: c70b9d5fdcd7 ("remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/09a43da41ee277a80a3265348831e747f7b62620.1764427595.git.dan.carpenter@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 14:18:23 -06:00
Dan Carpenter
cb200e41ed remoteproc: qcom: q6v5: Fix NULL vs IS_ERR() bug in q6v5_alloc_memory_region()
The devm_ioremap_resource_wc() function never returns NULL, it returns
error pointers.  Update the checking to match.

Fixes: c70b9d5fdcd7 ("remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/674b32a78563282adeaf3cdf941314a0b8181026.1764427595.git.dan.carpenter@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 14:18:23 -06:00
Dan Carpenter
e7839f773e remoteproc: qcom: pas: Fix a couple NULL vs IS_ERR() bugs
The devm_ioremap_resource_wc() function never returns NULL, it returns
error pointers.  Update the checking to match.

Fixes: c70b9d5fdcd7 ("remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/0599691acc394d9390da8fa0b5de3399b132b187.1764427595.git.dan.carpenter@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 14:18:23 -06:00
Dan Carpenter
5e6fee736e remoteproc: qcom_q6v5_adsp: Fix a NULL vs IS_ERR() check in adsp_alloc_memory_region()
The devm_ioremap_resource_wc() function never returns NULL, it returns
error pointers.  Update the check to match.

Fixes: c70b9d5fdcd7 ("remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/6d6b1b0fb6a61b5155a640507217fd7e658858cf.1764427595.git.dan.carpenter@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-29 14:18:23 -06:00
Mickaël Salaün
54f9baf537
selftests/landlock: Add disconnected leafs and branch test suites
Test disconnected directories with two test suites
(layout4_disconnected_leafs and layout5_disconnected_branch) and 43
variants to cover the main corner cases.

These tests are complementary to the previous commit.

Add test_renameat() and test_exchangeat() helpers.

Test coverage for security/landlock is 92.1% of 1927 lines according to
LLVM 20.

Cc: Günther Noack <gnoack@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/20251128172200.760753-5-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-28 18:27:07 +01:00
Tingmao Wang
a18ee3f31f
selftests/landlock: Add tests for access through disconnected paths
This adds tests for the edge case discussed in [1], with specific ones
for rename and link operations when the operands are through
disconnected paths, as those go through a separate code path in Landlock.

This has resulted in a warning, due to collect_domain_accesses() not
expecting to reach a different root from path->mnt:

  #  RUN           layout1_bind.path_disconnected ...
  #            OK  layout1_bind.path_disconnected
  ok 96 layout1_bind.path_disconnected
  #  RUN           layout1_bind.path_disconnected_rename ...
  [..] ------------[ cut here ]------------
  [..] WARNING: CPU: 3 PID: 385 at security/landlock/fs.c:1065 collect_domain_accesses
  [..] ...
  [..] RIP: 0010:collect_domain_accesses (security/landlock/fs.c:1065 (discriminator 2) security/landlock/fs.c:1031 (discriminator 2))
  [..] current_check_refer_path (security/landlock/fs.c:1205)
  [..] ...
  [..] hook_path_rename (security/landlock/fs.c:1526)
  [..] security_path_rename (security/security.c:2026 (discriminator 1))
  [..] do_renameat2 (fs/namei.c:5264)
  #            OK  layout1_bind.path_disconnected_rename
  ok 97 layout1_bind.path_disconnected_rename

Move the const char definitions a bit above so that we can use the path
for s4d1 in cleanup code.

Cc: Günther Noack <gnoack@google.com>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/027d5190-b37a-40a8-84e9-4ccbc352bcdf@maowtm.org [1]
Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/20251128172200.760753-4-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-28 18:27:06 +01:00
Mickaël Salaün
f7ef7de6b9
landlock: Improve variable scope
This is now possible thanks to the disconnected directory fix.

Cc: Günther Noack <gnoack@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/20251128172200.760753-3-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-28 18:27:06 +01:00
Mickaël Salaün
49c9e09d96
landlock: Fix handling of disconnected directories
Disconnected files or directories can appear when they are visible and
opened from a bind mount, but have been renamed or moved from the source
of the bind mount in a way that makes them inaccessible from the mount
point (i.e. out of scope).

Previously, access rights tied to files or directories opened through a
disconnected directory were collected by walking the related hierarchy
down to the root of the filesystem, without taking into account the
mount point because it couldn't be found. This could lead to
inconsistent access results, potential access right widening, and
hard-to-debug renames, especially since such paths cannot be printed.

For a sandboxed task to create a disconnected directory, it needs to
have write access (i.e. FS_MAKE_REG, FS_REMOVE_FILE, and FS_REFER) to
the underlying source of the bind mount, and read access to the related
mount point.   Because a sandboxed task cannot acquire more access
rights than those defined by its Landlock domain, this could lead to
inconsistent access rights due to missing permissions that should be
inherited from the mount point hierarchy, while inheriting permissions
from the filesystem hierarchy hidden by this mount point instead.

Landlock now handles files and directories opened from disconnected
directories by taking into account the filesystem hierarchy when the
mount point is not found in the hierarchy walk, and also always taking
into account the mount point from which these disconnected directories
were opened.  This ensures that a rename is not allowed if it would
widen access rights [1].

The rationale is that, even if disconnected hierarchies might not be
visible or accessible to a sandboxed task, relying on the collected
access rights from them improves the guarantee that access rights will
not be widened during a rename because of the access right comparison
between the source and the destination (see LANDLOCK_ACCESS_FS_REFER).
It may look like this would grant more access on disconnected files and
directories, but the security policies are always enforced for all the
evaluated hierarchies.  This new behavior should be less surprising to
users and safer from an access control perspective.

Remove a wrong WARN_ON_ONCE() canary in collect_domain_accesses() and
fix the related comment.

Because opened files have their access rights stored in the related file
security properties, there is no impact for disconnected or unlinked
files.

Cc: Christian Brauner <brauner@kernel.org>
Cc: Günther Noack <gnoack@google.com>
Cc: Song Liu <song@kernel.org>
Reported-by: Tingmao Wang <m@maowtm.org>
Closes: https://lore.kernel.org/r/027d5190-b37a-40a8-84e9-4ccbc352bcdf@maowtm.org
Closes: https://lore.kernel.org/r/09b24128f86973a6022e6aa8338945fcfb9a33e4.1749925391.git.m@maowtm.org
Fixes: b91c3e4ea756 ("landlock: Add support for file reparenting with LANDLOCK_ACCESS_FS_REFER")
Fixes: cb2c7d1a1776 ("landlock: Support filesystem access-control")
Link: https://lore.kernel.org/r/b0f46246-f2c5-42ca-93ce-0d629702a987@maowtm.org [1]
Reviewed-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/20251128172200.760753-2-mic@digikod.net
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-28 18:27:04 +01:00
Andy Shevchenko
aa514a297a calibrate: update header inclusion
While cleaning up some headers, I got a build error on this file:

init/calibrate.c:20:9: error: call to undeclared function 'kstrtoul'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]

Update header inclusions to follow IWYU (Include What You Use) principle.

Link: https://lkml.kernel.org/r/20251124230607.1445421-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:45 -08:00
Ilias Stamatis
6fb3acdebf Reinstate "resource: avoid unnecessary lookups in find_next_iomem_res()"
Commit 97523a4edb7b ("kernel/resource: remove first_lvl / siblings_only
logic") removed an optimization introduced by commit 756398750e11
("resource: avoid unnecessary lookups in find_next_iomem_res()").  That
was not called out in the message of the first commit explicitly so it's
not entirely clear whether removing the optimization happened
inadvertently or not.

As the original commit message of the optimization explains there is no
point considering the children of a subtree in find_next_iomem_res() if
the top level range does not match.

Reinstating the optimization results in performance improvements in
systems where /proc/iomem is ~5k lines long.  Calling mmap() on /dev/mem
in such platforms takes 700-1500μs without the optimisation and 10-50μs
with the optimisation.

Note that even though commit 97523a4edb7b removed the 'sibling_only'
parameter from next_resource(), newer kernels have basically reinstated it
under the name 'skip_children'.

Link: https://lore.kernel.org/all/20251124165349.3377826-1-ilstam@amazon.com/T/#u
Fixes: 97523a4edb7b ("kernel/resource: remove first_lvl / siblings_only logic")
Signed-off-by: Ilias Stamatis <ilstam@amazon.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Huang, Ying" <huang.ying.caritas@gmail.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:45 -08:00
Breno Leitao
3fa805c37d vmcoreinfo: track and log recoverable hardware errors
Introduce a generic infrastructure for tracking recoverable hardware
errors (HW errors that are visible to the OS but do not cause a panic)
and record them for vmcore consumption.  This aids post-mortem crash
analysis tools by preserving a count and timestamp for the last occurrence
of such errors.  On the other side, correctable errors, which the OS
typically remains unaware of because the underlying hardware handles them
transparently, are less relevant for crash dump and therefore are NOT
tracked in this infrastructure.

Add centralized logging for sources of recoverable hardware errors based
on the subsystem it has been notified.

hwerror_data is write-only at kernel runtime, and it is meant to be read
from vmcore using tools like crash/drgn.  For example, this is what it
looks like when opening the crashdump from drgn.

	>>> prog['hwerror_data']
	(struct hwerror_info[1]){
		{
			.count = (int)844,
			.timestamp = (time64_t)1752852018,
		},
		...

This helps fleet operators quickly triage whether a crash may be
influenced by hardware recoverable errors (which execute an uncommon code
path in the kernel), especially when recoverable errors occurred shortly
before a panic, such as the bug fixed by commit ee62ce7a1d90 ("page_pool:
Track DMA-mapped pages and unmap them when destroying the pool")

This is not intended to replace full hardware diagnostics but provides a
fast way to correlate hardware events with kernel panics quickly.

Rare machine check exceptions—like those indicated by mce_flags.p5 or
mce_flags.winchip—are not accounted for in this method, as they fall
outside the intended usage scope for this feature's user base.

[leitao@debian.org: add hw-recoverable-errors to toctree]
  Link: https://lkml.kernel.org/r/20251127-vmcoreinfo_fix-v1-1-26f5b1c43da9@debian.org
Link: https://lkml.kernel.org/r/20251010-vmcore_hw_error-v5-1-636ede3efe44@debian.org
Signed-off-by: Breno Leitao <leitao@debian.org>
Suggested-by: Tony Luck <tony.luck@intel.com>
Suggested-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Shuai Xue <xueshuai@linux.alibaba.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>	[APEI]
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Bob Moore <robert.moore@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morse <james.morse@arm.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: "Oliver O'Halloran" <oohall@gmail.com>
Cc: Omar Sandoval <osandov@osandov.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:44 -08:00
Mike Rapoport (Microsoft)
7b71205ae1 kho: fix restoring of contiguous ranges of order-0 pages
When contiguous ranges of order-0 pages are restored, kho_restore_page()
calls prep_compound_page() with the first page in the range and order as
parameters and then kho_restore_pages() calls split_page() to make sure
all pages in the range are order-0.

However, split_page() is not intended to split compound pages, and
with VM_DEBUG enabled it will trigger a VM_BUG_ON_PAGE().

Update kho_restore_page() so that it will use prep_compound_page() when it
restores a folio and make sure it properly sets page count for both large
folios and ranges of order-0 pages.

Link: https://lkml.kernel.org/r/20251125110917.843744-3-rppt@kernel.org
Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reported-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:44 -08:00
Mike Rapoport (Microsoft)
4bc84cd539 kho: kho_restore_vmalloc: fix initialization of pages array
Patch series "kho: fixes for vmalloc restoration".

Pratyush reported off-list that when kho_restore_vmalloc() is used to
restore a vmalloc_huge() allocation it hits VM_BUG_ON() when we
reconstruct the struct pages in kho_restore_pages().

These patches fix the issue.


This patch (of 2):

In case a preserved vmalloc allocation was using huge pages, all pages in
the array of pages added to vm_struct during kho_restore_vmalloc() are
wrongly set to the same page.

Fix the indexing when assigning pages to that array.

Link: https://lkml.kernel.org/r/20251125110917.843744-1-rppt@kernel.org
Link: https://lkml.kernel.org/r/20251125110917.843744-2-rppt@kernel.org
Fixes: a667300bd53f ("kho: add support for preserving vmalloc allocations")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:44 -08:00
Jarkko Sakkinen
c39eab75a5 MAINTAINERS: TPM DEVICE DRIVER: update the W-tag
I migrated test suite to git.kernel.org so that all my kernel stuff is
consolidated to one single place:

https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd-test.git/about/

Link: https://lkml.kernel.org/r/20251125160353.2300402-1-jarkko@kernel.org
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:44 -08:00
Thorsten Blum
af06a40474 init: replace simple_strtoul with kstrtoul to improve lpj_setup
Replace simple_strtoul() with the recommended kstrtoul() for parsing the
'lpj=' boot parameter.

Check the return value of kstrtoul() and reject invalid values.  This adds
error handling while preserving existing behavior for valid values, and
removes use of the deprecated simple_strtoul() helper.

Link: https://lkml.kernel.org/r/20251122114539.446937-2-thorsten.blum@linux.dev
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:43 -08:00
Ran Xiaokai
40cd0e8dd2 KHO: fix boot failure due to kmemleak access to non-PRESENT pages
When booting with debug_pagealloc=on while having:
CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT=y
CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=n
the system fails to boot due to page faults during kmemleak scanning.

This occurs because:
When debug_pagealloc is enabled, __free_pages() invokes
debug_pagealloc_unmap_pages(), clearing the _PAGE_PRESENT bit for freed
pages in the kernel page table.  KHO scratch areas are allocated from
memblock and noted by kmemleak.  But these areas don't remain reserved; they
are released later to the page allocator using init_cma_reserved_pageblock().
This causes subsequent kmemleak scans to access non-PRESENT pages, leading to
fatal page faults.

Mark scratch areas with kmemleak_ignore_phys() after they are allocated
from memblock to exclude them from kmemleak scanning before they are
released to buddy allocator to fix this.

[ran.xiaokai@zte.com.cn: add comment]
  Link: https://lkml.kernel.org/r/20251127122700.103927-1-ranxiaokai627@163.com
Link: https://lkml.kernel.org/r/20251122182929.92634-1-ranxiaokai627@163.com
Signed-off-by: Ran Xiaokai <ran.xiaokai@zte.com.cn>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:43 -08:00
Sourabh Jain
fb5c364427 Documentation/ABI: new kexec and kdump sysfs interface
Add an ABI document for the following kexec and kdump sysfs interface:

- /sys/kernel/kexec/loaded
- /sys/kernel/kexec/crash_loaded
- /sys/kernel/kexec/crash_size
- /sys/kernel/kexec/crash_elfcorehdr_size
- /sys/kernel/kexec/crash_cma_ranges

Link: https://lkml.kernel.org/r/20251118114507.1769455-4-sourabhjain@linux.ibm.com
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Shivang Upadhyay <shivangu@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:43 -08:00
Sourabh Jain
5c991b6d9b Documentation/ABI: mark old kexec sysfs deprecated
The previous commit ("kexec: move sysfs entries to /sys/kernel/kexec")
moved all existing kexec sysfs entries to a new location. The ABI
document is updated to include a note about the deprecation of the old
kexec sysfs entries.

The following kexec sysfs entries are deprecated:
- /sys/kernel/kexec_loaded
- /sys/kernel/kexec_crash_loaded
- /sys/kernel/kexec_crash_size
- /sys/kernel/crash_elfcorehdr_size
- /sys/kernel/kexec_crash_cma_ranges

Link: https://lkml.kernel.org/r/20251118114507.1769455-3-sourabhjain@linux.ibm.com
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Shivang Upadhyay <shivangu@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:43 -08:00
Sourabh Jain
cf4340bdd9 kexec: move sysfs entries to /sys/kernel/kexec
Patch series "kexec: reorganize kexec and kdump sysfs", v6.

All existing kexec and kdump sysfs entries are moved to a new location,
/sys/kernel/kexec, to keep /sys/kernel/ clean and better organized.
Symlinks are created at the old locations for backward compatibility and
can be removed in the future [01/03].

While doing this cleanup, the old kexec and kdump sysfs entries are
marked as deprecated in the existing ABI documentation [02/03]. This
makes it clear that these older interfaces should no longer be used.
New ABI documentation is added to describe the reorganized interfaces
[03/03], so users and tools can rely on the updated sysfs interfaces
going forward.


This patch (of 3):

Several kexec and kdump sysfs entries are currently placed directly under
/sys/kernel/, which clutters the directory and makes it harder to identify
unrelated entries.  To improve organization and readability, these entries
are now moved under a dedicated directory, /sys/kernel/kexec.

The following sysfs entries are moved under the new kexec sysfs node:
+---------------------------+---------------------------+
|    Old sysfs name         |     New sysfs name        |
|  (under /sys/kernel)      | (under /sys/kernel/kexec) |
+---------------------------+---------------------------+
| kexec_loaded              | loaded                    |
+---------------------------+---------------------------+
| kexec_crash_loaded        | crash_loaded              |
+---------------------------+---------------------------+
| kexec_crash_size          | crash_size                |
+---------------------------+---------------------------+
| crash_elfcorehdr_size     | crash_elfcorehdr_size     |
+---------------------------+---------------------------+
| kexec_crash_cma_ranges    | crash_cma_ranges          |
+---------------------------+---------------------------+

For backward compatibility, symlinks are created at the old locations so
that existing tools and scripts continue to work.  These symlinks can be
removed in the future once users have switched to the new path.

While creating symlinks, entries are added in /sys/kernel/ that point to
their new locations under /sys/kernel/kexec/.  If an error occurs while
adding a symlink, it is logged but does not stop initialization of the
remaining kexec sysfs symlinks.

The /sys/kernel/<crash_elfcorehdr_size | kexec/crash_elfcorehdr_size>
entry is now controlled by CONFIG_CRASH_DUMP instead of
CONFIG_VMCORE_INFO, as CONFIG_CRASH_DUMP also enables CONFIG_VMCORE_INFO.

Link: https://lkml.kernel.org/r/20251118114507.1769455-1-sourabhjain@linux.ibm.com
Link: https://lkml.kernel.org/r/20251118114507.1769455-2-sourabhjain@linux.ibm.com
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Shivang Upadhyay <shivangu@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:42 -08:00
Pratyush Yadav
11047466ef test_kho: always print restore status
Currently the KHO test only prints a message on success, and remains
silent on failure.  This makes it difficult to notice a failing test.  A
failing test is usually more interesting than a successful one.

Always print the test status after attempting restore.

Link: https://lkml.kernel.org/r/20251118181046.23321-1-pratyush@kernel.org
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Acked-by: SeongJae Park <sj@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:42 -08:00
Pratyush Yadav
b15515155a kho: free chunks using free_page() instead of kfree()
Before commit fa759cd75bce5 ("kho: allocate metadata directly from the
buddy allocator"), the chunks were allocated from the slab allocator using
kzalloc().  Those were rightly freed using kfree().

When the commit switched to using the buddy allocator directly, it missed
updating kho_mem_ser_free() to use free_page() instead of kfree().

Link: https://lkml.kernel.org/r/20251118182218.63044-1-pratyush@kernel.org
Fixes: fa759cd75bce5 ("kho: allocate metadata directly from the buddy allocator")
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: David Matlack <dmatlack@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:42 -08:00
Pasha Tatashin
724bf8c559 selftests/liveupdate: add kexec test for multiple and empty sessions
Introduce a new kexec-based selftest, luo_kexec_multi_session, to validate
the end-to-end lifecycle of a more complex LUO scenario.

While the existing luo_kexec_simple test covers the basic end-to-end
lifecycle, it is limited to a single session with one preserved file. 
This new test significantly expands coverage by verifying LUO's ability to
handle a mixed workload involving multiple sessions, some of which are
intentionally empty.  This ensures that the LUO core correctly preserves
and restores the state of all session types across a reboot.

The test validates the following sequence:

Stage 1 (Pre-kexec):

  - Creates two empty test sessions (multi-test-empty-1,
    multi-test-empty-2).
  - Creates a session with one preserved memfd (multi-test-files-1).
  - Creates another session with two preserved memfds
    (multi-test-files-2), each containing unique data.
  - Creates a state-tracking session to manage the transition to
    Stage 2.
  - Executes a kexec reboot via the helper script.

Stage 2 (Post-kexec):

  - Retrieves the state-tracking session to confirm it is in the
    post-reboot stage.
  - Retrieves all four test sessions (both the empty and non-empty
    ones).
  - For the non-empty sessions, restores the preserved memfds and
    verifies their contents match the original data patterns.
  - Finalizes all test sessions and the state session to ensure a clean
    teardown and that all associated kernel resources are correctly
    released.

This test provides greater confidence in the robustness of the LUO
framework by validating its behavior in a more realistic, multi-faceted
scenario.

Link: https://lkml.kernel.org/r/20251125165850.3389713-19-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:42 -08:00
Pasha Tatashin
a003bdb9ec selftests/liveupdate: add simple kexec-based selftest for LUO
Introduce a kexec-based selftest, luo_kexec_simple, to validate the
end-to-end lifecycle of a Live Update Orchestrator session across a
reboot.

While existing tests verify the uAPI in a pre-reboot context, this test
ensures that the core functionality—preserving state via Kexec Handover
and restoring it in a new kernel—works as expected.

The test operates in two stages, managing its state across the reboot by
preserving a dedicated "state session" containing a memfd.  This mechanism
dogfoods the LUO feature itself for state tracking, making the test
self-contained.

The test validates the following sequence:

Stage 1 (Pre-kexec):
 - Creates a test session (test-session).
 - Creates and preserves a memfd with a known data pattern into the test
   session.
 - Creates the state-tracking session to signal progression to Stage 2.
 - Executes a kexec reboot via a helper script.

Stage 2 (Post-kexec):
 - Retrieves the state-tracking session to confirm it is in the
   post-reboot stage.
 - Retrieves the preserved test session.
 - Restores the memfd from the test session and verifies its contents
   match the original data pattern written in Stage 1.
 - Finalizes both the test and state sessions to ensure a clean
   teardown.

The test relies on a helper script (do_kexec.sh) to perform the reboot and
a shared utility library (luo_test_utils.c) for common LUO operations,
keeping the main test logic clean and focused.

Link: https://lkml.kernel.org/r/20251125165850.3389713-18-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:41 -08:00
Pasha Tatashin
80bab43f6f selftests/liveupdate: add userspace API selftests
Introduce a selftest suite for LUO.  These tests validate the core
userspace-facing API provided by the /dev/liveupdate device and its
associated ioctls.

The suite covers fundamental device behavior, session management, and the
file preservation mechanism using memfd as a test case.  This provides
regression testing for the LUO uAPI.

The following functionality is verified:

Device Access:
    Basic open and close operations on /dev/liveupdate.
    Enforcement of exclusive device access (verifying EBUSY on a
    second open).

Session Management:
    Successful creation of sessions with unique names.
    Failure to create sessions with duplicate names.

File Preservation:
    Preserving a single memfd and verifying its content remains
    intact post-preservation.
    Preserving multiple memfds within a single session, each with
    unique data.
    A complex scenario involving multiple sessions, each containing
    a mix of empty and data-filled memfds.

Note: This test suite is limited to verifying the pre-kexec functionality
of LUO (e.g., session creation, file preservation).  The post-kexec
restoration of resources is not covered, as the kselftest framework does
not currently support orchestrating a reboot and continuing execution in
the new kernel.

Link: https://lkml.kernel.org/r/20251125165850.3389713-17-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:41 -08:00
Pratyush Yadav
15fc11bb2c docs: add documentation for memfd preservation via LUO
Add the documentation under the "Preserving file descriptors" section of
LUO's documentation.

Link: https://lkml.kernel.org/r/20251125165850.3389713-16-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:41 -08:00
Pratyush Yadav
b3749f174d mm: memfd_luo: allow preserving memfd
The ability to preserve a memfd allows userspace to use KHO and LUO to
transfer its memory contents to the next kernel.  This is useful in many
ways.  For one, it can be used with IOMMUFD as the backing store for IOMMU
page tables.  Preserving IOMMUFD is essential for performing a hypervisor
live update with passthrough devices.  memfd support provides the first
building block for making that possible.

For another, for applications with a large amount of memory that takes time
to reconstruct, reboots to consume kernel upgrades can be very expensive.
memfd with LUO gives those applications reboot-persistent memory that they
can use to quickly save and reconstruct that state.

While memfd is backed by either hugetlbfs or shmem, currently only support
on shmem is added.  To be more precise, support for anonymous shmem files
is added.

The handover to the next kernel is not transparent.  Not all properties of
the file are preserved; only its memory contents, position, and size are.
The recreated file gets the UID and GID of the task doing the restore, and
the task's cgroup gets charged with the memory.

Once preserved, the file cannot grow or shrink, and all its pages are
pinned to avoid migrations and swapping.  The file can still be read from
or written to.

Use vmalloc to get the buffer to hold the folios, and preserve it using
kho_preserve_vmalloc().  This doesn't have the size limit.

Link: https://lkml.kernel.org/r/20251125165850.3389713-15-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:41 -08:00
Pratyush Yadav
8def18633e liveupdate: luo_file: add private argument to store runtime state
Currently file handlers only get the serialized_data field to store their
state.  This field has a pointer to the serialized state of the file, and
it becomes a part of LUO file's serialized state.

File handlers can also need some runtime state to track information that
shouldn't make it in the serialized data.

One such example is a vmalloc pointer.  While kho_preserve_vmalloc()
preserves the memory backing a vmalloc allocation, it does not store the
original vmap pointer, since that has no use being passed to the next
kernel.  The pointer is needed to free the memory in case the file is
unpreserved.

Provide a private field in struct luo_file and pass it to all the
callbacks.  The field can be set by preserve, and must be freed by
unpreserve.

Link: https://lkml.kernel.org/r/20251125165850.3389713-14-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:40 -08:00
Pratyush Yadav
ed6f45f81b mm: shmem: export some functions to internal.h
shmem_inode_acct_blocks(), shmem_recalc_inode(), and
shmem_add_to_page_cache() are used by shmem_alloc_and_add_folio().  This
functionality will be used by memfd LUO integration.

Link: https://lkml.kernel.org/r/20251125165850.3389713-13-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:40 -08:00
Pratyush Yadav
e165e2a257 mm: shmem: allow freezing inode mapping
To prepare a shmem inode for live update, its index -> folio mappings must
be serialized.  Once the mappings are serialized, they cannot change since
it would cause the serialized data to become inconsistent.  This can be
done by pinning the folios to avoid migration, and by making sure no
folios can be added to or removed from the inode.

While mechanisms to pin folios already exist, the only way to stop folios
being added or removed is the grow and shrink file seals.  But file seals
come with their own semantics, one of which is that they can't be removed.
This doesn't work with liveupdate since it can be cancelled or error out,
which would need the seals to be removed and the file's normal
functionality to be restored.

Introduce SHMEM_F_MAPPING_FROZEN to indicate this instead.  It is internal
to shmem and is not directly exposed to userspace.  It functions similarly
to F_SEAL_GROW | F_SEAL_SHRINK, but additionally disallows hole punching,
and can be removed.

Link: https://lkml.kernel.org/r/20251125165850.3389713-12-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Marc Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myugnjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleinxer <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:40 -08:00
Pratyush Yadav
6ff1610ced mm: shmem: use SHMEM_F_* flags instead of VM_* flags
shmem_inode_info::flags can have the VM flags VM_NORESERVE and VM_LOCKED. 
These are used to suppress pre-accounting or to lock the pages in the
inode respectively.  Using the VM flags directly makes it difficult to add
shmem-specific flags that are unrelated to VM behavior since one would
need to find a VM flag not used by shmem and re-purpose it.

Introduce SHMEM_F_NORESERVE and SHMEM_F_LOCKED which represent the same
information, but their bits are independent of the VM flags.  Callers can
still pass VM_NORESERVE to shmem_get_inode(), but it gets transformed to
the shmem-specific flag internally.

No functional changes intended.

Link: https://lkml.kernel.org/r/20251125165850.3389713-11-pasha.tatashin@soleen.com
Signed-off-by: Pratyush Yadav <ptyadav@amazon.de>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Betkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joanthan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:39 -08:00
Pasha Tatashin
7a5afa7ea8 MAINTAINERS: add liveupdate entry
Add a MAINTAINERS file entry for the new Live Update Orchestrator
introduced in previous patches.

Link: https://lkml.kernel.org/r/20251125165850.3389713-10-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:39 -08:00
Pasha Tatashin
906a330624 docs: add luo documentation
Add the documentation files for the Live Update Orchestrator.

Link: https://lkml.kernel.org/r/20251125165850.3389713-9-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:39 -08:00
Pasha Tatashin
16cec0d265 liveupdate: luo_session: add ioctls for file preservation
Introduce the userspace interface and internal logic required to manage
the lifecycle of file descriptors within a session.  Previously, a session
was merely a container; this change makes it a functional management unit.

The following capabilities are added:

A new set of ioctl commands is added; these operate on the file
descriptor returned by CREATE_SESSION. This allows userspace to:
- LIVEUPDATE_SESSION_PRESERVE_FD: Add a file descriptor to a session
  to be preserved across the live update.
- LIVEUPDATE_SESSION_RETRIEVE_FD: Retrieve a preserved file in the
  new kernel using its unique token.
- LIVEUPDATE_SESSION_FINISH: Finish the session.

The session's .release handler is enhanced to be state-aware.  When a
session's file descriptor is closed, it correctly unpreserves the session
based on its current state before freeing all associated file resources.

Link: https://lkml.kernel.org/r/20251125165850.3389713-8-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:39 -08:00
Pasha Tatashin
7c722a7f44 liveupdate: luo_file: implement file systems callbacks
This patch implements the core mechanism for managing preserved files
throughout the live update lifecycle.  It provides the logic to invoke the
file handler callbacks (preserve, unpreserve, freeze, unfreeze, retrieve,
and finish) at the appropriate stages.

During the reboot phase, luo_file_freeze() serializes the final metadata
for each file (handler compatible string, token, and data handle) into a
memory region preserved by KHO.  In the new kernel, luo_file_deserialize()
reconstructs the in-memory file list from this data, preparing the session
for retrieval.

Link: https://lkml.kernel.org/r/20251125165850.3389713-7-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:38 -08:00
Pasha Tatashin
81cd25d263 liveupdate: luo_core: add user interface
Introduce the user-space interface for the Live Update Orchestrator via
ioctl commands, enabling external control over the live update process and
management of preserved resources.

The idea is that there is going to be a single userspace agent driving the
live update; therefore, only a single process can ever hold this device
open at a time.

The following ioctl commands are introduced:

LIVEUPDATE_IOCTL_CREATE_SESSION
Provides a way for userspace to create a named session for grouping file
descriptors that need to be preserved. It returns a new file descriptor
representing the session.

LIVEUPDATE_IOCTL_RETRIEVE_SESSION
Allows the userspace agent in the new kernel to reclaim a preserved
session by its name, receiving a new file descriptor to manage the
restored resources.

Link: https://lkml.kernel.org/r/20251125165850.3389713-6-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:38 -08:00
Pasha Tatashin
0153094d03 liveupdate: luo_session: add sessions support
Introduce the concept of "Live Update Sessions" within the LUO framework.  LUO
sessions provide a mechanism to group and manage `struct file *` instances
(representing file descriptors) that need to be preserved across a
kexec-based live update.

Each session is identified by a unique name and acts as a container for
file objects whose state is critical to a userspace workload, such as a
virtual machine or a high-performance database, aiming to maintain their
functionality across a kernel transition.

This groundwork establishes the framework for preserving file-backed state
across kernel updates, with the actual file data preservation mechanisms
to be implemented in subsequent patches.

[dan.carpenter@linaro.org: fix use after free in luo_session_deserialize()]
  Link: https://lkml.kernel.org/r/c5dd637d7eed3a3be48c5e9fedb881596a3b1f5a.1764163896.git.dan.carpenter@linaro.org
Link: https://lkml.kernel.org/r/20251125165850.3389713-5-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:38 -08:00
Pasha Tatashin
db8bed8082 kexec: call liveupdate_reboot() before kexec
Modify kernel_kexec() to call liveupdate_reboot().

This ensures that the Live Update Orchestrator is notified just before the
kernel executes the kexec jump.  The liveupdate_reboot() function triggers
the final freeze event, allowing participating FDs to perform last-minute
checks or state saving within the blackout window.

If liveupdate_reboot() returns an error (indicating a failure during LUO
finalization), the kexec operation is aborted to prevent proceeding with
an inconsistent state.  An error is returned to the user.

Link: https://lkml.kernel.org/r/20251125165850.3389713-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:38 -08:00
Pasha Tatashin
1aece82100 liveupdate: luo_core: integrate with KHO
Integrate the LUO with the KHO framework to enable passing LUO state
across a kexec reboot.

This patch implements the lifecycle integration with KHO:

1. Incoming State: During early boot (`early_initcall`), LUO checks if
   KHO is active. If so, it retrieves the "LUO" subtree, verifies the
   "luo-v1" compatibility string, and reads the `liveupdate-number` to
   track the update count.

2. Outgoing State: During late initialization (`late_initcall`), LUO
   allocates a new FDT for the next kernel, populates it with the basic
   header (compatible string and incremented update number), and
   registers it with KHO (`kho_add_subtree`).

3. Finalization: The `liveupdate_reboot()` notifier is updated to invoke
   `kho_finalize()`. This ensures that all memory segments marked for
   preservation are properly serialized before the kexec jump.

Link: https://lkml.kernel.org/r/20251125165850.3389713-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: MyungJoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:37 -08:00
Pasha Tatashin
9e2fd062fa liveupdate: luo_core: Live Update Orchestrator
Patch series "Live Update Orchestrator", v8.

This series introduces the Live Update Orchestrator, a kernel subsystem
designed to facilitate live kernel updates using a kexec-based reboot. 
This capability is critical for cloud environments, allowing hypervisors
to be updated with minimal downtime for running virtual machines.  LUO
achieves this by preserving the state of selected resources, such as
memory, devices and their dependencies, across the kernel transition.

As a key feature, this series includes support for preserving memfd file
descriptors, which allows critical in-memory data, such as guest RAM or
any other large memory region, to be maintained in RAM across the kexec
reboot.

The other series that use LUO are the VFIO [1], IOMMU [2], and PCI [3]
preservation series.

Github repo of this series [4].

The core of LUO is a framework for managing the lifecycle of preserved
resources through a userspace-driven interface. Key features include:

- Session Management
  A userspace agent (i.e. luod [5]) creates named sessions, each
  represented by a file descriptor (via a centralized agent that controls
  /dev/liveupdate). The lifecycle of all preserved resources within a
  session is tied to this FD, ensuring automatic kernel cleanup if the
  controlling userspace agent crashes or exits unexpectedly.

- File Preservation
  A handler-based framework allows specific file types (demonstrated
  here with memfd) to be preserved. Handlers manage the serialization,
  restoration, and lifecycle of their specific file types.

- File-Lifecycle-Bound State
  A new mechanism for managing shared global state whose lifecycle is
  tied to the preservation of one or more files. This is crucial for
  subsystems like IOMMU or HugeTLB, where multiple file descriptors may
  depend on a single, shared underlying resource that must be preserved
  only once.

- KHO Integration
  LUO drives the Kexec Handover framework programmatically to pass its
  serialized metadata to the next kernel. The LUO state is finalized and
  added to the kexec image just before the reboot is triggered. In the
  future this step will also be removed once stateless KHO is
  merged [6].

- Userspace Interface
  Control is provided via ioctl commands on /dev/liveupdate for creating
  and retrieving sessions, as well as on session file descriptors for
  managing individual files.

- Testing
  The series includes a set of selftests, including userspace API
  validation, kexec-based lifecycle tests for various session and file
  scenarios, and a new in-kernel test module to validate the FLB logic.




Introduce LUO, a mechanism intended to facilitate kernel updates while
keeping designated devices operational across the transition (e.g., via
kexec).  The primary use case is updating hypervisors with minimal
disruption to running virtual machines.  For the userspace side of a
hypervisor update we have copyless migration.  LUO is for updating the kernel.

This initial patch lays the groundwork for the LUO subsystem.

Further functionality, including the implementation of state transition
logic, integration with KHO, and hooks for subsystems and file
descriptors, will be added in subsequent patches.

Create a character device at /dev/liveupdate.

A new uAPI header, <uapi/linux/liveupdate.h>, will define the necessary
structures.  The magic number for IOCTL is registered in
Documentation/userspace-api/ioctl/ioctl-number.rst.

Link: https://lkml.kernel.org/r/20251125165850.3389713-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20251125165850.3389713-2-pasha.tatashin@soleen.com
Link: https://lore.kernel.org/all/20251018000713.677779-1-vipinsh@google.com/ [1]
Link: https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@google.com [2]
Link: https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel.org [3]
Link: https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v8 [4]
Link: https://tinyurl.com/luoddesign [5]
Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [6]
Link: https://lore.kernel.org/all/20251115233409.768044-1-pasha.tatashin@soleen.com [7]
Link: https://github.com/soleen/linux/blob/luo/v8b03/diff.v7.v8 [8]
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: David Matlack <dmatlack@google.com>
Cc: Aleksander Lobakin <aleksander.lobakin@intel.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andriy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: anish kumar <yesanishhere@gmail.com>
Cc: Anna Schumaker <anna.schumaker@oracle.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Daniel Wagner <wagi@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Jeffery <djeffery@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guixin Liu <kanie@linux.alibaba.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Leon Romanovsky <leon@kernel.org>
Cc: Leon Romanovsky <leonro@nvidia.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Matthew Maurer <mmaurer@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Myungjoo Ham <myungjoo.ham@samsung.com>
Cc: Parav Pandit <parav@nvidia.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Saeed Mahameed <saeedm@nvidia.com>
Cc: Samiullah Khawaja <skhawaja@google.com>
Cc: Song Liu <song@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Stuart Hayes <stuart.w.hayes@gmail.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Weißschuh <linux@weissschuh.net>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: William Tu <witu@nvidia.com>
Cc: Yoann Congal <yoann.congal@smile.fr>
Cc: Zijun Hu <quic_zijuhu@quicinc.com>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:37 -08:00
Pasha Tatashin
7bd3643f94 kho: add Kconfig option to enable KHO by default
Currently, Kexec Handover must be explicitly enabled via the kernel
command line parameter `kho=on`.

For workloads that rely on KHO as a foundational requirement (such as the
upcoming Live Update Orchestrator), requiring an explicit boot parameter
adds redundant configuration steps.

Introduce CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT.  When selected, KHO
defaults to enabled.  This is equivalent to passing kho=on at boot.  The
behavior can still be disabled at runtime by passing kho=off.

Link: https://lkml.kernel.org/r/20251114190002.3311679-14-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:37 -08:00
Pasha Tatashin
de51999e68 kho: allow memory preservation state updates after finalization
Currently, kho_preserve_* and kho_unpreserve_* return -EBUSY if KHO is
finalized.  This enforces a rigid "freeze" on the KHO memory state.

With the introduction of re-entrant finalization, this restriction is no
longer necessary.  Users should be allowed to modify the preservation set
(e.g., adding new pages or freeing old ones) even after an initial
finalization.

The intended workflow for updates is now:
1. Modify state (preserve/unpreserve).
2. Call kho_finalize() again to refresh the serialized metadata.

Remove the kho_out.finalized checks to enable this dynamic behavior.

This also allows converting the kho_unpreserve_* functions to return void,
as they can no longer fail.

Link: https://lkml.kernel.org/r/20251114190002.3311679-13-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:36 -08:00
Pasha Tatashin
d7255959b6 kho: allow kexec load before KHO finalization
Currently, kho_fill_kimage() checks kho_out.finalized and returns early if
KHO is not yet finalized.  This enforces a strict ordering where userspace
must finalize KHO *before* loading the kexec image.

This is restrictive, as standard workflows often involve loading the
target kernel early in the lifecycle and finalizing the state (FDT) only
immediately before the reboot.

Since the KHO FDT resides at a physical address allocated during boot
(kho_init), its location is stable.  We can attach this stable address to
the kimage regardless of whether the content has been finalized yet.

Relax the check to only require kho_enable, allowing kexec_file_load to
proceed at any time.

Link: https://lkml.kernel.org/r/20251114190002.3311679-12-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:36 -08:00
Pasha Tatashin
8e068a286a kho: update FDT dynamically for subtree addition/removal
Currently, sub-FDTs are tracked in a list (kho_out.sub_fdts) and the
final FDT is constructed entirely from scratch during kho_finalize().

We can maintain the FDT dynamically:
1. Initialize a valid, empty FDT in kho_init().
2. Use fdt_add_subnode and fdt_setprop in kho_add_subtree to
   update the FDT immediately when a subsystem registers.
3. Use fdt_del_node in kho_remove_subtree to remove entries.

This removes the need for the intermediate sub_fdts list and the
reconstruction logic in kho_finalize().  kho_finalize() now only needs to
trigger memory map serialization.

Link: https://lkml.kernel.org/r/20251114190002.3311679-11-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:36 -08:00
Pasha Tatashin
9a4301f715 kho: remove abort functionality and support state refresh
Previously, KHO required a dedicated kho_abort() function to clean up
state before kho_finalize() could be called again.  This was necessary to
handle complex unwind paths when using notifiers.

With the shift to direct memory preservation, the explicit abort step is
no longer strictly necessary.

Remove kho_abort() and refactor kho_finalize() to handle re-entry.  If
kho_finalize() is called while KHO is already finalized, it will now
automatically clean up the previous memory map and state before generating
a new one.  This allows the KHO state to be updated/refreshed simply by
triggering finalize again.

Update debugfs to return -EINVAL if userspace attempts to write 0 to the
finalize attribute, as explicit abort is no longer supported.

Link: https://lkml.kernel.org/r/20251114190002.3311679-10-pasha.tatashin@soleen.com
Suggested-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:36 -08:00
Pasha Tatashin
efa3a9775a kho: remove global preserved_mem_map and store state in FDT
Currently, the serialized memory map is tracked via
kho_out.preserved_mem_map and copied to the FDT during finalization.  This
double tracking is redundant.

Remove preserved_mem_map from kho_out.  Instead, maintain the physical
address of the head chunk directly in the preserved-memory-map FDT
property.

Introduce kho_update_memory_map() to manage this property. This function
handles:
1. Retrieving and freeing any existing serialized map (handling the
   abort/retry case).
2. Updating the FDT property with the new chunk address.

This establishes the FDT as the single source of truth for the handover
state.

Link: https://lkml.kernel.org/r/20251114190002.3311679-9-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:35 -08:00
Pasha Tatashin
71960fe134 kho: simplify serialization and remove __kho_abort
Currently, __kho_finalize() performs memory serialization in the middle of
FDT construction.  If FDT construction fails later, the function must
manually clean up the serialized memory via __kho_abort().

Refactor __kho_finalize() to perform kho_mem_serialize() only after the
FDT has been successfully constructed and finished.  This reordering has
two benefits:
1. It avoids expensive serialization work if FDT generation fails.
2. It removes the need for cleanup in the FDT error path.

As a result, the internal helper __kho_abort() is no longer needed for
internal error handling.  Inline its remaining logic (cleanup of the
preserved memory map) directly into kho_abort() and remove the helper.

Link: https://lkml.kernel.org/r/20251114190002.3311679-8-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:35 -08:00
Pasha Tatashin
e268689a52 kho: always expose output FDT in debugfs
Currently, the output FDT is added to debugfs only when KHO is finalized
and removed when aborted.

There is no need to hide the FDT based on the state.  Always expose it
starting from initialization.  This aids the transition toward removing
the explicit abort functionality and converting KHO to be fully stateless.

Link: https://lkml.kernel.org/r/20251114190002.3311679-7-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:35 -08:00
Pasha Tatashin
53f8f064eb kho: verify deserialization status and fix FDT alignment access
During boot, kho_restore_folio() relies on the memory map having been
successfully deserialized.  If deserialization fails or no map is present,
attempting to restore the FDT folio is unsafe.

Update kho_mem_deserialize() to return a boolean indicating success.  Use
this return value in kho_memory_init() to disable KHO if deserialization
fails.  Also, the incoming FDT folio is never used, so there is no reason to
restore it.

Additionally, use get_unaligned() to retrieve the memory map pointer from
the FDT.  FDT properties are not guaranteed to be naturally aligned, and
accessing a 64-bit value via a pointer that is only 32-bit aligned can
cause faults.

Link: https://lkml.kernel.org/r/20251114190002.3311679-6-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:35 -08:00
Pasha Tatashin
85de0090bd kho: preserve FDT folio only once during initialization
Currently, the FDT folio is preserved inside __kho_finalize().  If the
user performs multiple finalize/abort cycles, kho_preserve_folio() is
called repeatedly for the same FDT folio.

Since the FDT folio is allocated once during kho_init(), it should be
marked for preservation at the same time.  Move the preservation call to
kho_init() to align the preservation state with the object's lifecycle and
simplify the finalize path.

Also, pre-zero the FDT tree so we do not expose random bits to the user
and to the next kernel by using the new kho_alloc_preserve() api.

Link: https://lkml.kernel.org/r/20251114190002.3311679-5-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:34 -08:00
Pasha Tatashin
4c205677af kho: introduce high-level memory allocation API
Currently, clients of KHO must manually allocate memory (e.g., via
alloc_pages), calculate the page order, and explicitly call
kho_preserve_folio().  Similarly, cleanup requires separate calls to
unpreserve and free the memory.

Introduce a high-level API to streamline this common pattern:

- kho_alloc_preserve(size): Allocates physically contiguous, zeroed
  memory and immediately marks it for preservation.
- kho_unpreserve_free(ptr): Unpreserves and frees the memory
  in the current kernel.
- kho_restore_free(ptr): Restores the struct page state of
  preserved memory in the new kernel and immediately frees it to the
  page allocator.

[pasha.tatashin@soleen.com: build fixes]
  Link: https://lkml.kernel.org/r/CA+CK2bBgXDhrHwTVgxrw7YTQ-0=LgW0t66CwPCgG=C85ftz4zw@mail.gmail.com
Link: https://lkml.kernel.org/r/20251114190002.3311679-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:34 -08:00
Pasha Tatashin
8c3819f627 kho: convert __kho_abort() to return void
The internal helper __kho_abort() always returns 0 and has no failure
paths.  Its return value is ignored by __kho_finalize and checked
needlessly by kho_abort.

Change the return type to void to reflect that this function cannot fail,
and simplify kho_abort by removing dead error handling code.

Link: https://lkml.kernel.org/r/20251114190002.3311679-3-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:34 -08:00
Pasha Tatashin
077a4851b0 kho: fix misleading log message in kho_populate()
Patch series "kho: simplify state machine and enable dynamic updates", v2.

This patch series refactors the Kexec Handover subsystem to transition
from a rigid, state-locked model to a dynamic, re-entrant architecture. 
It also introduces usability improvements.

Motivation
Currently, KHO relies on a strict state machine where memory
preservation is locked upon finalization. If a change is required, the
user must explicitly "abort" to reset the state. Additionally, the kexec
image cannot be loaded until KHO is finalized, and the FDT is rebuilt
from scratch on every finalization.

This series simplifies this workflow to support "load early, finalize
late" scenarios.

Key Changes

State Machine Simplification:
- Removed kho_abort(). kho_finalize() is now re-entrant; calling it a
  second time automatically flushes the previous serialized state and
  generates a fresh one.

- Removed kho_out.finalized checks from preservation APIs, allowing
  drivers to add/remove pages even after an initial finalization.

- Decoupled kexec_file_load from KHO finalization. The KHO FDT physical
  address is now stable from boot, allowing the kexec image to be loaded
  before the handover metadata is finalized.

FDT Management:
- The FDT is now updated in-place dynamically when subtrees are added or
  removed, removing the need for complex reconstruction logic.

- The output FDT is always exposed in debugfs (initialized and zeroed at
  boot), improving visibility and debugging capabilities throughout the
  system lifecycle.

- Removed the redundant global preserved_mem_map pointer, establishing
  the FDT property as the single source of truth.

New Features & API Enhancements:
- High-Level Allocators: Introduced kho_alloc_preserve() and friends to
  reduce boilerplate for drivers that need to allocate, preserve, and
  eventually restore simple memory buffers.

- Configuration: Added CONFIG_KEXEC_HANDOVER_ENABLE_DEFAULT to allow KHO
  to be active by default without requiring the kho=on command line
  parameter.

Fixes:
- Fixed potential alignment faults when accessing 64-bit FDT properties.

- Fixed the lifecycle of the FDT folio preservation (now preserved once
  at init).


This patch (of 13):

The log message in kho_populate() currently states "Will skip init for
some devices".  This implies that Kexec Handover always involves skipping
device initialization.

However, KHO is a generic mechanism used to preserve kernel memory across
reboot for various purposes, such as memfd, telemetry, or reserve_mem. 
Skipping device initialization is a specific property of live update
drivers using KHO, not a property of the mechanism itself.

Remove the misleading suffix to accurately reflect the generic nature of
KHO discovery.

Link: https://lkml.kernel.org/r/20251114190002.3311679-2-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: Coiby Xu <coxu@redhat.com>
Cc: Dave Vasilevsky <dave@vasilevsky.ca>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:34 -08:00
Zhu Yanjun
8db839caee liveupdate: kho: use %pe format specifier for error pointer printing
Make the pr_xxx() calls use the %pe format specifier instead of %d.  The %pe
specifier prints a symbolic error string (e.g., -ENOMEM, -EINVAL) when
given an error pointer created with ERR_PTR(err).

This change enhances the clarity and diagnostic value of the error message
by showing a descriptive error name rather than a numeric error code.

Note that some error codes are still printed by value, as those errors might
come from libfdt and not be regular errnos.

Link: https://lkml.kernel.org/r/20251101142325.1326536-10-pasha.tatashin@soleen.com
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:33 -08:00
Pasha Tatashin
c332ebd9c0 MAINTAINERS: update KHO maintainers
Changyuan does not have cycles to commit to the upstream work of KHO.
Remove him from the KHO maintainers.

Link: https://lkml.kernel.org/r/20251101142325.1326536-9-pasha.tatashin@soleen.com
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:33 -08:00
Pasha Tatashin
48a1b2321d liveupdate: kho: move to kernel/liveupdate
Move KHO to kernel/liveupdate/ in preparation for placing all Live Update
core kernel related files in the same place.

[pasha.tatashin@soleen.com: disable the menu when DEFERRED_STRUCT_PAGE_INIT]
  Link: https://lkml.kernel.org/r/CA+CK2bAvh9Oa2SLfsbJ8zztpEjrgr_hr-uGgF1coy8yoibT39A@mail.gmail.com
Link: https://lkml.kernel.org/r/20251101142325.1326536-8-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:33 -08:00
Pasha Tatashin
99cd2ffac6 kho: don't unpreserve memory during abort
KHO allows clients to preserve memory regions at any point before the KHO
state is finalized.  The finalization process itself involves KHO
performing its own actions, such as serializing the overall preserved
memory map.

If this finalization process is aborted, the current implementation
destroys KHO's internal memory tracking structures
(`kho_out.ser.track.orders`).  This behavior effectively unpreserves all
memory from KHO's perspective, regardless of whether those preservations
were made by clients before the finalization attempt or by KHO itself
during finalization.

This premature unpreservation is incorrect.  An abort of the finalization
process should only undo actions taken by KHO as part of that specific
finalization attempt.  Individual memory regions preserved by clients
prior to finalization should remain preserved, as their lifecycle is
managed by the clients themselves.  These clients might still need to call
kho_unpreserve_folio() or kho_unpreserve_phys() based on their own logic,
even after a KHO finalization attempt is aborted.

Link: https://lkml.kernel.org/r/20251101142325.1326536-7-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:33 -08:00
Pasha Tatashin
ce405ed510 test_kho: unpreserve memory in case of error
If there is an error halfway through KHO memory preservation, we should
roll back and unpreserve everything that is partially preserved.

Link: https://lkml.kernel.org/r/20251101142325.1326536-6-pasha.tatashin@soleen.com
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:32 -08:00
Pasha Tatashin
f5bfd4793a memblock: unpreserve memory in case of error
If there is an error halfway through KHO memory preservation, we should
roll back and unpreserve everything that is partially preserved.

[akpm@linux-foundation.org: s/err_no_fdt_page/err_report/ in prepare_kho_fdt(), per Mike]
Link: https://lkml.kernel.org/r/20251101142325.1326536-5-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Suggested-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:32 -08:00
Pasha Tatashin
36f8f7ef7f kho: add interfaces to unpreserve folios, page ranges, and vmalloc
Allow users of KHO to cancel a previous preservation by adding the
necessary interfaces to unpreserve folios, page ranges, and vmalloc areas.

Link: https://lkml.kernel.org/r/20251101142325.1326536-4-pasha.tatashin@soleen.com
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:32 -08:00
Mike Rapoport (Microsoft)
70f9133096 kho: drop notifiers
The KHO framework uses a notifier chain as the mechanism for clients to
participate in the finalization process.  While this works for a single,
central state machine, it is too restrictive for kernel-internal
components like pstore/reserve_mem or IMA.  These components need a
simpler, direct way to register their state for preservation (e.g., during
their initcall) without being part of a complex, shutdown-time notifier
sequence.  The notifier model forces all participants into a single
finalization flow and makes direct preservation from an arbitrary context
difficult.  This patch refactors the client participation model by
removing the notifier chain and introducing a direct API for managing FDT
subtrees.

The core kho_finalize() and kho_abort() state machine remains, but clients
now register their data with KHO beforehand.

Link: https://lkml.kernel.org/r/20251101142325.1326536-3-pasha.tatashin@soleen.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Co-developed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pratyush Yadav <pratyush@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:32 -08:00
Pasha Tatashin
03d3963464 kho: make debugfs interface optional
Patch series "liveupdate: Rework KHO for in-kernel users", v9.

This series refactors the KHO framework to better support in-kernel users
like the upcoming LUO.  The current design, which relies on a notifier
chain and debugfs for control, is too restrictive for direct programmatic
use.

The core of this rework is the removal of the notifier chain in favor of a
direct registration API.  This decouples clients from the shutdown-time
finalization sequence, allowing them to manage their preserved state more
flexibly and at any time.

In support of this new model, this series also:
 - Makes the debugfs interface optional.
 - Introduces APIs to unpreserve memory and fixes a bug in the abort
   path where client state was being incorrectly discarded. Note that
   this is an interim step, as a more comprehensive fix is planned as
   part of the stateless KHO work [1].
 - Moves all KHO code into a new kernel/liveupdate/ directory to
   consolidate live update components.


This patch (of 9):

Currently, KHO is controlled via debugfs interface, but once LUO is
introduced, it can control KHO, and the debug interface becomes optional.

Add a separate config CONFIG_KEXEC_HANDOVER_DEBUGFS that enables the
debugfs interface, and allows inspecting the tree.

Move all debugfs related code to a new file to keep the .c files clear of
ifdefs.

Link: https://lkml.kernel.org/r/20251101142325.1326536-1-pasha.tatashin@soleen.com
Link: https://lkml.kernel.org/r/20251101142325.1326536-2-pasha.tatashin@soleen.com
Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [1]
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Pratyush Yadav <pratyush@kernel.org>
Cc: Alexander Graf <graf@amazon.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Changyuan Lyu <changyuanl@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Zhu Yanjun <yanjun.zhu@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:31 -08:00
Bala-Vignesh-Reddy
e6fbd1759c selftests: complete kselftest include centralization
This follow-up patch completes centralization of kselftest.h and
kselftest_harness.h includes in remaining selftests files, replacing all
relative paths with non-relative paths using the shared -I include path in
lib.mk

Tested with gcc-13.3 and clang-18.1, and cross-compiled successfully on
riscv, arm64, x86_64 and powerpc arch.

[reddybalavignesh9979@gmail.com: add selftests include path for kselftest.h]
  Link: https://lkml.kernel.org/r/20251017090201.317521-1-reddybalavignesh9979@gmail.com
Link: https://lkml.kernel.org/r/20251016104409.68985-1-reddybalavignesh9979@gmail.com
Signed-off-by: Bala-Vignesh-Reddy <reddybalavignesh9979@gmail.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/lkml/20250820143954.33d95635e504e94df01930d0@linux-foundation.org/
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Günther Noack <gnoack@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mickael Salaun <mic@digikod.net>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:31 -08:00
Mateusz Guzik
262ef8e55b fork: stop ignoring NUMA while handling cached thread stacks
1. the numa parameter was straight up ignored.
2. nothing was done to check if the to-be-cached/allocated stack matches
   the local node

The id remains ignored on free in case of memoryless nodes.

Note the current caching is already bad as the cache keeps overflowing
and a different solution is needed for the long run, to be worked
out(tm).

Stats collected over a kernel build with the patch with the following
topology:
  NUMA node(s):              2
  NUMA node0 CPU(s):         0-11
  NUMA node1 CPU(s):         12-23

caller's node vs stack backing pages on free:
matching:	50083 (70%)
mismatched:	21492 (30%)

caching efficiency:
cached:		32651 (65.2%)
dropped:	17432 (34.8%)

Link: https://lkml.kernel.org/r/20251120054015.3019419-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:31 -08:00
Eric Dumazet
94984bfed5 rbtree: inline rb_last()
This is a very small function, inlining it saves cpu cycles in TCP by
reducing register pressure and removing call/ret overhead.

It also reduces vmlinux text size by 122 bytes on a typical x86_64 build.

Before:

size vmlinux
   text    data     bss     dec     hex filename
34811781        22177365        5685248 62674394        3bc55da vmlinux

After:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34811659	22177365	5685248	62674272	3bc5560	vmlinux

[ojeda@kernel.org: fix rust build]
  Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org
Link: https://lkml.kernel.org/r/20251114140646.3817319-3-edumazet@google.com
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:30 -08:00
Eric Dumazet
c2d2dad245 rbtree: inline rb_first()
Patch series "rbtree: inline rb_first() and rb_last()".

Inline these two small helpers, heavily used in TCP and FQ packet scheduler,
and in many other places.

This reduces kernel text size, and brings a 1.5% improvement on a network
TCP stress test.


This patch (of 2):

This is a very small function, inlining it saves cpu cycles by reducing
register pressure and removing call/ret overhead.

It also reduces vmlinux text size by 744 bytes on a typical x86_64 build.

Before:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34812525	22177365	5685248	62675138	3bc58c2	vmlinux

After:

size vmlinux
   text	   data	    bss	    dec	    hex	filename
34811781	22177365	5685248	62674394	3bc55da	vmlinux

[ojeda@kernel.org: fix rust build]
  Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org
Link: https://lkml.kernel.org/r/20251114140646.3817319-1-edumazet@google.com
Link: https://lkml.kernel.org/r/20251114140646.3817319-2-edumazet@google.com
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-27 14:24:30 -08:00
Andrew Morton
bc947af677 Merge branch 'mm-hotfixes-stable' into mm-nonmm-stable in order to be able
to merge "kho: make debugfs interface optional" into mm-nonmm-stable.
2025-11-27 14:17:02 -08:00
Dan Carpenter
099a60cca1 remoteproc: imx_dsp_rproc: Fix NULL vs IS_ERR() bug in imx_dsp_rproc_add_carveout()
The devm_ioremap_resource_wc() function never returns NULL, it returns
error pointers.  Update the error checking to match.

Fixes: 67a7bc7f0358 ("remoteproc: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Link: https://lore.kernel.org/r/aSf6OerBbPcxBUVt@stanley.mountain
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-27 08:52:54 -07:00
Bjorn Andersson
ac82dbc539 remoteproc: st: Fix indexing of memory-regions
The recent transition to use of_reserved_mem_region_to_resource()
changes the while loop to a for loop, but the increment of the "index"
variable was left behind at the end of the loop, as highlighted by the
following error/warning:

  error: variable 'index' is incremented both in the loop header and in the loop body [-Werror,-Wfor-loop-analysis]

Drop the extra increment to avoid skipping over every other
memory-region in the loop.

Fixes: 67a7bc7f0358 ("remoteproc: Use of_reserved_mem_region_* functions for "memory-region"")
Signed-off-by: Bjorn Andersson <bjorn.andersson@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20251126-st-remoteproc-double-index-v1-1-3b0a8b21ac18@oss.qualcomm.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-27 08:35:19 -07:00
Matthieu Buffet
e61462232a
selftests/landlock: Fix makefile header list
Make all headers part of make's dependencies computations.
Otherwise, updating audit.h, common.h, scoped_base_variants.h,
scoped_common.h, scoped_multiple_domain_variants.h, or wrappers.h,
re-running make and running selftests could lead to testing stale headers.

Fixes: 6a500b22971c ("selftests/landlock: Add tests for audit flags and domain IDs")
Fixes: fefcf0f7cf47 ("selftests/landlock: Test abstract UNIX socket scoping")
Fixes: 5147779d5e1b ("selftests/landlock: Add wrappers.h")
Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
Link: https://lore.kernel.org/r/20251027011440.1838514-1-matthieu@buffet.re
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-26 20:20:23 +01:00
Tingmao Wang
335ef80e4a
landlock: Make docs in cred.h and domain.h visible
Currently even though the structures in these files have documentation,
they are not shown in the "Landlock LSM: kernel documentation" page.

Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/6050e764c2679cba01715653e5f1f4f17091d8f8.1759103277.git.m@maowtm.org
[mic: Synchronize date]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-26 20:20:23 +01:00
Tingmao Wang
f4d3ef2dd0
landlock: Minor comments improvements
This patch contains some small comment changes.  The first three
comments for ruleset.c, I sort of made along the way while working on /
trying to understand Landlock, and the one from ruleset.h was from the
hashtable patch but extracted here.  In fs.c, one comment which I found
would have been helpful to me when reading this.

Signed-off-by: Tingmao Wang <m@maowtm.org>
Link: https://lore.kernel.org/r/20250602134150.67189-1-m@maowtm.org
Link: https://lore.kernel.org/r/20297185fd71ffbb5ce4fec14b38e5444c719c96.1748379182.git.m@maowtm.org
[mic: Squash patches with updated description, cosmetic fixes]
Signed-off-by: Mickaël Salaün <mic@digikod.net>
2025-11-26 20:20:21 +01:00
Nickolay Goppen
950c74fd6c remoteproc: qcom: pas: Add support for SDM660 CDSP
Compute DSP in SDM660 is compatible with generic cdsp_resource_init
descriptor.

Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Tested-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com> # ifc6560
Signed-off-by: Nickolay Goppen <setotau@mainlining.org>
Link: https://lore.kernel.org/r/20251110-qcom-sdm660-cdsp-v3-3-cc3c37287e72@mainlining.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 11:41:01 -06:00
Nickolay Goppen
acd6c28a25 dt-bindings: remoteproc: qcom: adsp: Add SDM660 CDSP compatible
Add compatible for the compute DSP remoteproc found in SDM660.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Nickolay Goppen <setotau@mainlining.org>
Link: https://lore.kernel.org/r/20251110-qcom-sdm660-cdsp-v3-2-cc3c37287e72@mainlining.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 11:41:01 -06:00
Nickolay Goppen
db03780e43 dt-bindings: remoteproc: qcom: adsp: Add missing constraints for SDM660 ADSP
Since the SDM660 ADSP node uses "xo" clock, interrupts and "cx" power domain
properties, add corresponding constraints for SDM660 ADSP.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Nickolay Goppen <setotau@mainlining.org>
Link: https://lore.kernel.org/r/20251110-qcom-sdm660-cdsp-v3-1-cc3c37287e72@mainlining.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 11:41:00 -06:00
Konrad Dybcio
ca079ec3eb dt-bindings: remoteproc: qcom,sc8280xp-pas: Fix CDSP power desc
The power requirements for the CDSP instances on SC8280XP aren't fully
described, with only one of the three present. Fix that.

Fixes: ee651cd1e944 ("dt-bindings: remoteproc: qcom: pas: Add sc8280xp adsp and nsp pair")
Signed-off-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20251104-topic-8280_mxc-v1-2-df545af0ef94@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 11:40:31 -06:00
Sakari Ailus
7f07a5c3e2 remoteproc: omap: Remove redundant pm_runtime_mark_last_busy() calls
pm_runtime_put_autosuspend(), pm_runtime_put_sync_autosuspend(),
pm_runtime_autosuspend() and pm_request_autosuspend() now include a call
to pm_runtime_mark_last_busy(). Remove the now-redundant explicit call to
pm_runtime_mark_last_busy().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20250704075445.3221481-1-sakari.ailus@linux.intel.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 11:37:44 -06:00
Dan Carpenter
30065e73d7 nvdimm: Prevent integer overflow in ramdax_get_config_data()
The "cmd->in_offset" variable comes from the user via the __nd_ioctl()
function.  The problem is that the "cmd->in_offset + cmd->in_length"
addition could have an integer wrapping issue if cmd->in_offset is close
to UINT_MAX.  Both "cmd->in_offset" and "cmd->in_length" are u32
variables.

Fixes: 43bc0aa19a21 ("nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices")
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Link: https://patch.msgid.link/aSbuiYCznEIZDa02@stanley.mountain
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
2025-11-26 10:58:23 -06:00
Rob Herring (Arm)
c70b9d5fdc remoteproc: qcom: Use of_reserved_mem_region_* functions for "memory-region"
Use the newly added of_reserved_mem_region_to_resource() and
of_reserved_mem_region_count() functions to handle "memory-region"
properties.

The error handling is a bit different in some cases. Often
"memory-region" is optional, so failed lookup is not an error. But then
an error in of_reserved_mem_lookup() is treated as an error. However,
that distinction is not really important. Either the region is available
and usable or it is not. So now, it is just
of_reserved_mem_region_to_resource() which is checked for an error.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20251124182751.507624-2-robh@kernel.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 10:27:29 -06:00
Alice Ryhl
6297fb3863 rust: id_pool: rename IdPool::new() to with_capacity()
We want to change ::new() to take no parameters and produce a pool that
is as large as possible while also being inline because that is the
constructor that Rust Binder actually needs.

However, to avoid complications in examples, we still need the current
constructor. So rename it to with_capacity(), which is the idiomatic
Rust name for this kind of constructor.

Reviewed-by: Burak Emir <bqe@google.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-26 11:25:35 -05:00
Alice Ryhl
d0cf6512bb rust: bitmap: add BitmapVec::new_inline()
This constructor is useful when you just want to create a BitmapVec
without allocating but don't care how large it is.

Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Reviewed-by: Burak Emir <bqe@google.com>
Reviewed-by: Danilo Krummrich <dakr@kernel.org>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-26 11:25:35 -05:00
Alice Ryhl
f5535d78e1 rust: bitmap: add MAX_LEN and MAX_INLINE_LEN constants
To avoid hard-coding these values in drivers, define constants for them
that drivers can reference. Also, update all instances in bitmap.rs and
id_pool.rs that use these values to use the new constants.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Burak Emir <bqe@google.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-26 11:25:35 -05:00
Srinivas Kandagatla
112766cdf2 rpmsg: glink: remove duplicate code for rpmsg device remove
The rpmsg device remove code is duplicated in at least 2-3 places; add a
helper function to remove this duplicated code.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250822100043.2604794-3-srinivas.kandagatla@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 10:16:10 -06:00
Srinivas Kandagatla
a53e356df5 rpmsg: glink: fix rpmsg device leak
While testing rpmsg-char interface it was noticed that duplicate sysfs
entries are getting created and below warning is noticed.

Reason for this is that we are leaking rpmsg device pointer, setting it
null without actually unregistering device.
Any further attempts to unregister fail because rpdev is NULL,
resulting in a leak.

Fix this by unregistering rpmsg device before removing its reference
from rpmsg channel.

sysfs: cannot create duplicate filename '/devices/platform/soc@0/3700000.remot
eproc/remoteproc/remoteproc1/3700000.remoteproc:glink-edge/3700000.remoteproc:
glink-edge.adsp_apps.-1.-1'
[  114.115347] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:0 Not
 tainted 6.16.0-rc4 #7 PREEMPT
[  114.115355] Hardware name: Qualcomm Technologies, Inc. Robotics RB3gen2 (DT)
[  114.115358] Workqueue: events qcom_glink_work
[  114.115371] Call trace:8
[  114.115374]  show_stack+0x18/0x24 (C)
[  114.115382]  dump_stack_lvl+0x60/0x80
[  114.115388]  dump_stack+0x18/0x24
[  114.115393]  sysfs_warn_dup+0x64/0x80
[  114.115402]  sysfs_create_dir_ns+0xf4/0x120
[  114.115409]  kobject_add_internal+0x98/0x260
[  114.115416]  kobject_add+0x9c/0x108
[  114.115421]  device_add+0xc4/0x7a0
[  114.115429]  rpmsg_register_device+0x5c/0xb0
[  114.115434]  qcom_glink_work+0x4bc/0x820
[  114.115438]  process_one_work+0x148/0x284
[  114.115446]  worker_thread+0x2c4/0x3e0
[  114.115452]  kthread+0x12c/0x204
[  114.115457]  ret_from_fork+0x10/0x20
[  114.115464] kobject: kobject_add_internal failed for 3700000.remoteproc:
glink-edge.adsp_apps.-1.-1 with -EEXIST, don't try to register things with
the same name in the same directory.
[  114.250045] rpmsg 3700000.remoteproc:glink-edge.adsp_apps.-1.-1:
device_add failed: -17

Fixes: 835764ddd9af ("rpmsg: glink: Move the common glink protocol implementation to glink_native.c")
Cc: Stable@vger.kernel.org
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@oss.qualcomm.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250822100043.2604794-2-srinivas.kandagatla@oss.qualcomm.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 10:16:05 -06:00
Luca Weiss
a1f2c2d55a remoteproc: qcom_q6v5_pas: Use resource with CX PD for MSM8974
MSM8974 requires the CX power domain, so use the msm8996_adsp_resource
which has cx under proxy_pd_names and is otherwise equivalent.

Signed-off-by: Luca Weiss <luca@lucaweiss.eu>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250621-msm8974-rpmpd-switch-v1-2-0a2cb303c446@lucaweiss.eu
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 10:12:03 -06:00
Luca Weiss
3d447dcdae dt-bindings: remoteproc: qcom,adsp: Make msm8974 use CX as power domain
Using CX as a regulator is an artifact of earlier times. Instead use CX
power rail as power domain from rpmpd.

Signed-off-by: Luca Weiss <luca@lucaweiss.eu>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Link: https://lore.kernel.org/r/20250621-msm8974-rpmpd-switch-v1-1-0a2cb303c446@lucaweiss.eu
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
2025-11-26 10:11:49 -06:00
Rob Herring (Arm)
67a7bc7f03 remoteproc: Use of_reserved_mem_region_* functions for "memory-region"
Use the newly added of_reserved_mem_region_to_resource() and
of_reserved_mem_region_count() functions to handle "memory-region"
properties.

The error handling is a bit different in some cases. Often
"memory-region" is optional, so failed lookup is not an error. But then
an error in of_reserved_mem_lookup() is treated as an error. However,
that distinction is not really important. Either the region is available
and usable or it is not. So now, it is just
of_reserved_mem_region_to_resource() which is checked for an error.

Acked-by: Arnaud Pouliquen <arnaud.pouliquen@foss.st.com>
Tested-by: Peng Fan <peng.fan@nxp.com> # i.MX93-11x11-EVK for imx_rproc.c
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be> # rcar
Tested-by: Beleswar Padhi <b-padhi@ti.com> # TI
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20251124182751.507624-1-robh@kernel.org
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-25 08:34:31 -07:00
Dai Ngo
99f5aa14f0 NFSD: Add trace point for SCSI fencing operation.
Add trace point to print client IP address, net namespace number,
device name and status of SCSI pr_preempt command.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Dai Ngo
6f52063db9 NFSD: use correct reservation type in nfsd4_scsi_fence_client
The reservation type argument for the pr_preempt call should match the
one used in nfsd4_block_get_device_info_scsi.

Fixes: f99d4fbdae67 ("nfsd: add SCSI layout support")
Cc: stable@vger.kernel.org
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Chuck Lever
1c873a2fd1 xdrgen: Don't generate unnecessary semicolon
The Jinja2 templates add a semicolon at the end of every function.
The C language does not require this punctuation.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Chuck Lever
f7cb94fad4 xdrgen: Fix union declarations
Add a missing template file. This file is used when a union is
defined as a public API (ie, "pragma public <union name>;").

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Olga Kornievskaia
14282cc3cf NFSD: don't start nfsd if sv_permsocks is empty
Previously, while trying to create a server instance, if no
listening sockets were present then default parameter udp
and tcp listeners were created. It's unclear what the purpose
of starting these listeners was and how this could have
been triggered by the userland setup. This patch proposes
to ensure the reverse: that we never end up in a situation where
no listener sockets are created and we are trying to create
nfsd threads.

The problem it solves is: when nfs.conf only has tcp=n (and
nothing else for the choice of transports), nfsdctl would
still start the server and create udp and tcp listeners.

Signed-off-by: Olga Kornievskaia <okorniev@redhat.com>
Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Khushal Chitturi
b0f8e1f1f5 xdrgen: handle _XdrString in union encoder/decoder
Running xdrgen on xdrgen/tests/test.x fails when
generating encoder or decoder functions for union
members of type _XdrString. It was because _XdrString
does not have a spec attribute like _XdrBasic,
leading to AttributeError.

This patch updates emit_union_case_spec_definition
and emit_union_case_spec_decoder/encoder to handle
_XdrString by assigning type_name = "char *" and
avoiding referencing to spec.

Testing: Fixed xdrgen tool was run on originally failing
test file (tools/net/sunrpc/xdrgen/tests/test.x) and now
completes without AttributeError. Modified xdrgen tool was
also run against nfs4_1.x (Documentation/sunrpc/xdr/nfs4_1.x).
The output header file matches with nfs4_1.h
(include/linux/sunrpc/xdrgen/nfs4_1.h).
This validates the patch for all XDR input files currently
within the kernel.

Changes since v2:
- Moved the shebang to the first line
- Removed SPDX header to match style of current xdrgen files

Changes since v1:
- Corrected email address in Signed-off-by.
- Wrapped patch description lines to 72 characters.

Signed-off-by: Khushal Chitturi <kc9282016@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Chuck Lever
42ba5bd2e2 xdrgen: Fix the variable-length opaque field decoder template
Ensure that variable-length opaques are decoded into the named
field, and do not overwrite the structure itself.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Chuck Lever
3bd937b49a xdrgen: Make the xdrgen script location-independent
The @pythondir@ placeholder is meant for build-time substitution,
such as with autoconf. autoconf is not used in the kernel. Let's
replace that mechanism with one that better enables the xdrgen
script to be run from any directory.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Chuck Lever
75a9b40f3b xdrgen: Generalize/harden pathname construction
Use Python's built-in Path constructor to find the Jinja templates.
This provides better error checking, proper use of path component
separators, and more reliable location of the template files.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-25 09:09:42 -05:00
Andy Shevchenko
4bd68e4753 cpumask: Don't use "proxy" headers
Update header inclusions to follow IWYU (Include What You Use)
principle.

Note that including kernel.h is discouraged, as stated
at the top of that file.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 16:08:18 -05:00
Geert Uytterhoeven
c604cb5fdf soc: renesas: Use bitfield helpers
Use the field_get() helper, instead of open-coding the same operation.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
3937b05bb7 clk: renesas: Use bitfield helpers
Use the FIELD_{GET,PREP}() and field_{get,prep}() helpers for const
and non-const bitfields, respectively, instead of open-coding the same
operations.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
b1cff2f4b2 ALSA: usb-audio: Convert to common field_{get,prep}() helpers
Drop the driver-specific field_get() and field_prep() macros, in favor
of the globally available variants from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
610c4408a2 soc: renesas: rz-sysc: Convert to common field_get() helper
Drop the driver-specific field_get() macro, in favor of the globally
available variant from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
bb0e7fda87 pinctrl: ma35: Convert to common field_{get,prep}() helpers
Drop the driver-specific field_get() and field_prep() macros, in favor
of the globally available variants from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
1fe1c28a10 iio: mlx90614: Convert to common field_{get,prep}() helpers
Drop the driver-specific field_get() and field_prep() macros, in favor
of the globally available variants from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Crt Mori <cmo@melexis.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
54bfd90ca3 iio: dac: Convert to common field_prep() helper
Drop the driver-specific field_prep() macro, in favor of the globally
available variant from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
2ef26ba819 gpio: aspeed: Convert to common field_{get,prep}() helpers
Drop the driver-specific field_get() and field_prep() macros, in favor
of the globally available variants from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
331a1457d8 EDAC/ie31200: Convert to common field_get() helper
Drop the driver-specific field_get() macro, in favor of the globally
available variant from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
350f06c9e2 crypto: qat - convert to common field_get() helper
Drop the driver-specific field_get() macro, in favor of the globally
available variant from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
0f8407a1f1 clk: at91: Convert to common field_{get,prep}() helpers
Drop the driver-specific field_get() and field_prep() macros, in favor
of the globally available variants from <linux/bitfield.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Claudiu Beznea <claudiu.beznea@tuxon.dev>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:47 -05:00
Geert Uytterhoeven
c1c6ab80b2 bitfield: Add non-constant field_{prep,get}() helpers
The existing FIELD_{GET,PREP}() macros are limited to compile-time
constants.  However, it is very common to prepare or extract bitfield
elements where the bitfield mask is not a compile-time constant.

To avoid this limitation, the AT91 clock driver and several other
drivers already have their own non-const field_{prep,get}() macros.
Make them available for general use by adding them to
<linux/bitfield.h>, and improve them slightly:
  1. Avoid evaluating macro parameters more than once,
  2. Replace "ffs() - 1" by "__ffs()",
  3. Support 64-bit use on 32-bit architectures,
  4. Wire field_{get,prep}() to FIELD_{GET,PREP}() when mask is
     actually constant.

This is deliberately not merged into the existing FIELD_{GET,PREP}()
macros, as people expressed the desire to keep stricter variants for
increased safety, or for performance critical paths.

Yury: use __mask within new macros.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Crt Mori <cmo@melexis.com>
Acked-by: Nuno Sá <nuno.sa@analog.com>
Acked-by: Richard Genoud <richard.genoud@bootlin.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Reviewed-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
2a6c045640 bitfield: Add less-checking __FIELD_{GET,PREP}()
The BUILD_BUG_ON_MSG() check against "~0ull" works only with "unsigned
(long) long" _mask types.  For constant masks, that condition is usually
met, as GENMASK() yields an UL value.  The few places where the
constant mask is stored in an intermediate variable were fixed by
changing the variable type to u64 (see e.g. [1] and [2]).

However, for non-constant masks, smaller unsigned types should be valid,
too, but currently lead to "result of comparison of constant
18446744073709551615 with expression of type ... is always
false"-warnings with clang and W=1.

Hence refactor the __BF_FIELD_CHECK() helper, and factor out
__FIELD_{GET,PREP}().  The latter lack the single problematic check, but
are otherwise identical to FIELD_{GET,PREP}(), and are intended to be
used in the fully non-const variants later.

[1] commit 5c667d5a5a3ec166 ("clk: sp7021: Adjust width of _m in
    HWM_FIELD_PREP()")
[2] commit cfd6fb45cfaf46fa ("crypto: ccree - avoid out-of-range
    warnings from clang")

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Link: https://git.kernel.org/torvalds/c/5c667d5a5a3ec166 [1]
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
85a8ff1185 ALSA: usb-audio: #undef field_{get,prep}() before local definition
Prepare for the advent of globally available common field_get() and
field_prep() macros by undefining the symbols before defining local
variants.  This prevents redefinition warnings from the C preprocessor
when introducing the common macros later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
138ab44108 soc: renesas: rz-sysc: #undef field_get() before local definition
Prepare for the advent of a globally available common field_get() macro
by undefining the symbol before defining a local variant.  This prevents
redefinition warnings from the C preprocessor when introducing the common
macro later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
2fc00c008e pinctrl: ma35: #undef field_{get,prep}() before local definition
Prepare for the advent of globally available common field_get() and
field_prep() macros by undefining the symbols before defining local
variants.  This prevents redefinition warnings from the C preprocessor
when introducing the common macros later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
8a838dabf1 iio: mlx90614: #undef field_{get,prep}() before local definition
Prepare for the advent of globally available common field_get() and
field_prep() macros by undefining the symbols before defining local
variants.  This prevents redefinition warnings from the C preprocessor
when introducing the common macros later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
27856d2b2b iio: dac: ad3530r: #undef field_prep() before local definition
Prepare for the advent of a globally available common field_prep() macro
by undefining the symbol before defining a local variant.  This prevents
redefinition warnings from the C preprocessor when introducing the common
macro later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
d1e1a7271e gpio: aspeed: #undef field_{get,prep}() before local definition
Prepare for the advent of globally available common field_get() and
field_prep() macros by undefining the symbols before defining local
variants.  This prevents redefinition warnings from the C preprocessor
when introducing the common macros later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
d51b09a0fe EDAC/ie31200: #undef field_get() before local definition
Prepare for the advent of a globally available common field_get() macro
by undefining the symbol before defining a local variant.  This prevents
redefinition warnings from the C preprocessor when introducing the common
macro later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
7996cbdb3f crypto: qat - #undef field_get() before local definition
Prepare for the advent of a globally available common field_get() macro
by undefining the symbol before defining a local variant.  This prevents
redefinition warnings from the C preprocessor when introducing the common
macro later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Geert Uytterhoeven
dbfe51513a clk: at91: pmc: #undef field_{get,prep}() before definition
Prepare for the advent of globally available common field_get() and
field_prep() macros by undefining the symbols before defining local
variants.  This prevents redefinition warnings from the C preprocessor
when introducing the common macros later.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Claudiu Beznea <claudiu.beznea@tuxon.dev>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-24 14:15:46 -05:00
Ahmet Eray Karadag
58b6fcd2ab ocfs2: mark inode bad upon validation failure during read
A VFS cache inconsistency, potentially triggered by sequences like
buffered writes followed by open(O_DIRECT), can result in an invalid
on-disk inode block (e.g., bad signature).  OCFS2 detects this corruption
when reading the inode block via ocfs2_validate_inode_block(), logs
"Invalid dinode", and often switches the filesystem to read-only mode.

The VFS open(O_DIRECT) operation appears to incorrectly clear the inode's
I_DIRTY flag without ensuring the dirty metadata (reflecting the earlier
buffered write, e.g., an updated i_size) is flushed to disk.  This leaves
the in-memory VFS inode object "in limbo" with an updated size (e.g.,
38639 from the write) but marked clean, while its on-disk counterpart
remains stale (e.g., size 0) or invalid.

Currently, the function reading the inode block
(ocfs2_read_inode_block_full()) fails to call make_bad_inode() upon
detecting the validation error.  Because the in-memory inode is not marked
bad, subsequent operations (like ftruncate) proceed erroneously.  They
eventually reach code (e.g., ocfs2_truncate_file()) that compares the
inconsistent in-memory size (38639) against the invalid/stale on-disk size
(0), leading to kernel crashes via BUG_ON.

Fix this by calling make_bad_inode(inode) within the error handling path
of ocfs2_read_inode_block_full() immediately after a block read or
validation error occurs.  This ensures VFS is properly notified about the
corrupt inode at the point of detection.  Marking the inode bad allows VFS
to correctly fail subsequent operations targeting this inode early,
preventing kernel panics caused by operating on known inconsistent inode
states.

Link: https://lkml.kernel.org/r/20251118001833.423470-2-eraykrdg1@gmail.com
Link: https://lore.kernel.org/all/20251029225748.11361-2-eraykrdg1@gmail.com/T/
Signed-off-by: Albin Babu Varghese <albinbabuvarghese20@gmail.com>
Signed-off-by: Ahmet Eray Karadag <eraykrdg1@gmail.com>
Reported-by: syzbot+b93b65ee321c97861072@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=b93b65ee321c97861072
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Co-developed-by: Albin Babu Varghese <albinbabuvarghese20@gmail.com>
Acked-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: David Hunter <david.hunter.linux@gmail.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:45 -08:00
Thorsten Blum
13db54aad7 ocfs2: replace deprecated strcpy with strscpy
strcpy() has been deprecated [1] because it performs no bounds checking on
the destination buffer, which can lead to buffer overflows.  Replace it
with the safer strscpy(), and copy directly into '->rf_signature' instead
of using the start of the struct as the destination buffer.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1]
Link: https://lkml.kernel.org/r/20251118185345.132411-3-thorsten.blum@linux.dev
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:45 -08:00
Thorsten Blum
4022ba2005 ocfs2: replace deprecated strcpy in ocfs2_create_xattr_block
strcpy() has been deprecated [1] because it performs no bounds checking on
the destination buffer, which can lead to buffer overflows.  Replace it
with the safer strscpy(), and copy directly into '->xb_signature' instead
of using the start of the struct as the destination buffer.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy [1]
Link: https://lkml.kernel.org/r/20251118185345.132411-2-thorsten.blum@linux.dev
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:45 -08:00
Chia-Liang Wang
ff713698ba lib: ratelimit: fix spelling mistake 'seperately'
Corrects a spelling mistake in a comment in ratelimit.c where 'seperately'
was used instead of 'separately'.

Link: https://lkml.kernel.org/r/20251119101144.3175-1-a0979625527@icloud.com
Signed-off-by: Chia-Liang Wang <a0979652527@icloud.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:45 -08:00
Lance Yang
2fe869ecbd MAINTAINERS: add Petr as a reviewer of hung task detector
Petr has been actively reviewing hung task detector patches lately.  It's
always good to have a fresh pair of eyes, so let's make it official.

I checked with him, and he's happy to be added.

Link: https://lkml.kernel.org/r/20251119110822.46566-1-lance.yang@linux.dev
Signed-off-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Petr Mladek <pmladek@suse.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Alice Ryhl
9031b852c9 uaccess: gate _copy_[to|from]_user on !INLINE_COPY_FROM_USER
These methods only exist when INLINE_COPY_FROM_USER is disabled, so update
the header file to reflect that.

This fixes the following error on builds that enable both RUST and
INLINE_COPY_FROM_USER.

ERROR: modpost: "_copy_from_user" [samples/rust/rust_misc_device.ko] undefined!
ERROR: modpost: "_copy_to_user" [samples/rust/rust_misc_device.ko] undefined!

This error is triggered because when a method is available both as a
rust_helper_* and normal method, Rust will call the normal method.

[akpm@linux-foundation.org: s/INLINE_COPY_FROM_USER/INLINE_COPY_TO_USER/, per Alice]
Link: https://lkml.kernel.org/r/20251118173250.2821388-1-aliceryhl@google.com
Fixes: d99dc586ca7c ("uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST")
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Trevor Gross <tmgross@umich.edu>
Cc: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Sourabh Jain
aa0145563c crash: export crashkernel CMA reservation to userspace
Add a sysfs entry /sys/kernel/kexec_crash_cma_ranges to expose all CMA
crashkernel ranges.

This allows userspace tools configuring kdump to determine how much memory
is reserved for crashkernel.  If CMA is used, tools can warn users when
attempting to capture user pages with CMA reservation.

The new sysfs entry holds the CMA ranges in the format below:

cat /sys/kernel/kexec_crash_cma_ranges
100000000-10c7fffff

The reason for not including Crash CMA Ranges in /proc/iomem is to avoid
conflicts.  It has been observed that contiguous memory ranges are
sometimes shown as two separate System RAM entries in /proc/iomem.  If a
CMA range overlaps two System RAM ranges, adding crashk_res to /proc/iomem
can create a conflict.  Reference [1] describes one such instance on the
PowerPC architecture.

Link: https://lkml.kernel.org/r/20251118071023.1673329-1-sourabhjain@linux.ibm.com
Link: https://lore.kernel.org/all/20251016142831.144515-1-sourabhjain@linux.ibm.com/ [1]
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Shivang Upadhyay <shivangu@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Sourabh Jain
fdd76c8d63 Documentation/ABI: add kexec and kdump sysfs interface
Add an ABI document for the following kexec and kdump sysfs interfaces:
- /sys/kernel/kexec_loaded
- /sys/kernel/kexec_crash_loaded
- /sys/kernel/kexec_crash_size
- /sys/kernel/crash_elfcorehdr_size

Link: https://lkml.kernel.org/r/20251117035153.1199665-1-sourabhjain@linux.ibm.com
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Cc: Aditya Gupta <adityag@linux.ibm.com>
Cc: Baoquan he <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh J Salgaonkar <mahesh@linux.ibm.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Shivang Upadhyay <shivangu@linux.ibm.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Guan-Chun Wu
b1b72ac25f ceph: replace local base64 helpers with lib/base64
Remove the ceph_base64_encode() and ceph_base64_decode() functions and
replace their usage with the generic base64_encode() and base64_decode()
helpers from lib/base64.

This eliminates the custom implementation in Ceph, reduces code
duplication, and relies on the shared Base64 code in lib.  The helpers
preserve RFC 3501-compliant Base64 encoding without padding, so there are
no functional changes.

This change also improves performance: encoding is about 2.7x faster and
decoding achieves 43-52x speedups compared to the previous local
implementation.

Link: https://lkml.kernel.org/r/20251114060240.89965-1-409411716@gms.tku.edu.tw
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: David Laight <david.laight.linux@gmail.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Guan-Chun Wu
7794510e20 fscrypt: replace local base64url helpers with lib/base64
Replace the base64url encoding and decoding functions in fscrypt with the
generic base64_encode() and base64_decode() helpers from lib/base64.

This removes the custom implementation in fscrypt, reduces code
duplication, and relies on the shared Base64 implementation in lib.  The
helpers preserve RFC 4648-compliant URL-safe Base64 encoding without
padding, so there are no functional changes.

This change also improves performance: encoding is about 2.7x faster and
decoding achieves 43-52x speedups compared to the previous implementation.

Link: https://lkml.kernel.org/r/20251114060221.89734-1-409411716@gms.tku.edu.tw
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Laight <david.laight.linux@gmail.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Guan-Chun Wu
8b365c4f5b lib: add KUnit tests for base64 encoding/decoding
Add a KUnit test suite to validate the base64 helpers.  The tests cover
both encoding and decoding, including padded and unpadded forms as defined
by RFC 4648 (standard base64), and add negative cases for malformed inputs
and padding errors.

The test suite also validates other variants (URLSAFE, IMAP) to ensure
their correctness.

In addition to functional checks, the suite includes simple
microbenchmarks which report average encode/decode latency for small (64B)
and larger (1KB) inputs.  These numbers are informational only and do not
gate the tests.

Kconfig (BASE64_KUNIT) and lib/tests/Makefile are updated accordingly.

Sample KUnit output:

    KTAP version 1
    # Subtest: base64
    # module: base64_kunit
    1..4
    # base64_performance_tests: [64B] encode run : 32ns
    # base64_performance_tests: [64B] decode run : 35ns
    # base64_performance_tests: [1KB] encode run : 510ns
    # base64_performance_tests: [1KB] decode run : 530ns
    ok 1 base64_performance_tests
    ok 2 base64_std_encode_tests
    ok 3 base64_std_decode_tests
    ok 4 base64_variant_tests
    # base64: pass:4 fail:0 skip:0 total:4
    # Totals: pass:4 fail:0 skip:0 total:4

Link: https://lkml.kernel.org/r/20251114060157.89507-1-409411716@gms.tku.edu.tw
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Laight <david.laight.linux@gmail.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Guan-Chun Wu
9c7d3cf94d lib/base64: rework encode/decode for speed and stricter validation
The old base64 implementation relied on a bit-accumulator loop, which was
slow for larger inputs and too permissive in validation.  It would accept
extra '=', missing '=', or even '=' appearing in the middle of the input,
allowing malformed strings to pass.  This patch reworks the internals to
improve performance and enforce stricter validation.

Changes:
 - Encoder:
   * Process input in 3-byte blocks, mapping 24 bits into four 6-bit
     symbols, avoiding bit-by-bit shifting and reducing loop iterations.
   * Handle the final 1-2 leftover bytes explicitly and emit '=' only when
     requested.
 - Decoder:
   * Based on the reverse lookup tables from the previous patch, decode
     input in 4-character groups.
   * Each group is looked up directly, converted into numeric values, and
     combined into 3 output bytes.
   * Explicitly handle padded and unpadded forms:
      - With padding: input length must be a multiple of 4, and '=' is
        allowed only in the last two positions. Reject stray or early '='.
      - Without padding: validate tail lengths (2 or 3 chars) and require
        unused low bits to be zero.
   * Removed the bit-accumulator style loop to reduce loop iterations.

Performance (x86_64, Intel Core i7-10700 @ 2.90GHz, avg over 1000 runs,
KUnit):

Encode:
  64B   ~90ns   -> ~32ns   (~2.8x)
  1KB  ~1332ns  -> ~510ns  (~2.6x)

Decode:
  64B  ~1530ns  -> ~35ns   (~43.7x)
  1KB ~27726ns  -> ~530ns  (~52.3x)

[akpm@linux-foundation.org: remove u32 casts, per David and Guan-Chun]
Link: https://lkml.kernel.org/r/20251114060132.89279-1-409411716@gms.tku.edu.tw
Co-developed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Co-developed-by: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: David Laight <david.laight.linux@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Kuan-Wei Chiu
c4eb7ad32e lib/base64: optimize base64_decode() with reverse lookup tables
Replace the use of strchr() in base64_decode() with precomputed reverse
lookup tables for each variant. This avoids repeated string scans and
improves performance. Use -1 in the tables to mark invalid characters.

Decode:
  64B   ~1530ns  ->  ~80ns    (~19.1x)
  1KB  ~27726ns  -> ~1239ns   (~22.4x)

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20251114060107.89026-1-409411716@gms.tku.edu.tw
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: David Laight <david.laight.linux@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:44 -08:00
Kuan-Wei Chiu
f1e2ca801c lib/base64: add support for multiple variants
Patch series "lib/base64: add generic encoder/decoder, migrate users", v5.

This series introduces a generic Base64 encoder/decoder to the kernel
library, eliminating duplicated implementations and delivering significant
performance improvements.

The Base64 API has been extended to support multiple variants (Standard,
URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501.  The API now
takes a variant parameter and an option to control padding.  As part of
this series, users are migrated to the new interface while preserving
their specific formats: fscrypt now uses BASE64_URLSAFE, Ceph uses
BASE64_IMAP, and NVMe is updated to BASE64_STD.

On the encoder side, the implementation processes input in 3-byte blocks,
mapping 24 bits directly to 4 output symbols.  This avoids bit-by-bit
streaming and reduces loop overhead, achieving about a 2.7x speedup
compared to previous implementations.

On the decoder side, replace strchr() lookups with per-variant reverse
tables and process input in 4-character groups.  Each group is mapped to
numeric values and combined into 3 bytes.  Padded and unpadded forms are
validated explicitly, rejecting invalid '=' usage and enforcing tail
rules.  This improves throughput by ~43-52x.


This patch (of 6):

Extend the base64 API to support multiple variants (standard, URL-safe,
and IMAP) as defined in RFC 4648 and RFC 3501.  The API now takes a
variant parameter and an option to control padding.  Update NVMe auth code
to use the new interface with BASE64_STD.

Link: https://lkml.kernel.org/r/20251114055829.87814-1-409411716@gms.tku.edu.tw
Link: https://lkml.kernel.org/r/20251114060045.88792-1-409411716@gms.tku.edu.tw
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
Reviewed-by: David Laight <david.laight.linux@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Ilya Dryomov <idryomov@gmail.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <kbusch@kernel.org>
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: "Theodore Y. Ts'o" <tytso@mit.edu>
Cc: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Cc: Xiubo Li <xiubli@redhat.com>
Cc: Yu-Sheng Huang <home7438072@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Feng Tang
03ef32d665 sys_info: add a default kernel sys_info mask
Which serves as a global default sys_info mask.  When users want the same
system information for many error cases (panic, hung, lockup ...), they
can choose to set this global knob only once, while not setting up each
individual sys_info knobs.

This just adds a 'lazy' option, and doesn't change existing kernel
behavior as the mask is 0 by default.

Link: https://lkml.kernel.org/r/20251113111039.22701-5-feng.tang@linux.alibaba.com
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Feng Tang
a9af76a787 watchdog: add sys_info sysctls to dump sys info on system lockup
When soft/hard lockup happens, developers may need different kinds of
system information (call-stacks, memory info, locks, etc.) to help
debugging.

Add 'softlockup_sys_info' and 'hardlockup_sys_info' sysctl knobs to take
human readable string like "tasks,mem,timers,locks,ftrace,...", and when
system lockup happens, all requested information will be printed out. 
(refer to kernel/sys_info.c for more details).

Link: https://lkml.kernel.org/r/20251113111039.22701-4-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Feng Tang
8b2b9b4f6f hung_task: add hung_task_sys_info sysctl to dump sys info on task-hung
When task-hung happens, developers may need different kinds of system
information (call-stacks, memory info, locks, etc.) to help debugging.

Add 'hung_task_sys_info' sysctl knob to take human readable string like
"tasks,mem,timers,locks,ftrace,...", and when task-hung happens, all
requested information will be dumped.  (refer to kernel/sys_info.c for more
details).

Meanwhile, the newly introduced sys_info() call is used to unify some
existing info-dumping knobs.

[feng.tang@linux.alibaba.com: maintain consistency with established behavior, per Lance and Petr]
  Link: https://lkml.kernel.org/r/aRncJo1mA5Zk77Hr@U-2FWC9VHC-2323.local
Link: https://lkml.kernel.org/r/20251113111039.22701-3-feng.tang@linux.alibaba.com
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Feng Tang
5f264c00b6 docs: panic: correct some sys_info names in sysctl doc
Patch series "Enable hung_task and lockup cases to dump system info on
demand", v2.

When working on kernel stability issues: panic, task-hung and soft/hard
lockup are frequently met.  And to debug them, user may need lots of
system information at that time, like task call stacks, lock info, memory
info, ftrace dump, etc.

panic case already uses sys_info() for this purpose, and has a
'panic_sys_info' sysctl(also support cmdline setup) interface to take
human readable string like "tasks,mem,timers,locks,ftrace,..." to control
what kinds of information is needed.  Which is also helpful to debug
task-hung and lockup cases.

So this patchset introduces the similar sys_info sysctl interface for
task-hung and lockup cases.

This is mainly for debugging and the info dumping could be intrusive, like
dumping call stack for all tasks when system has huge number of tasks,
similarly for ftrace dump (we may add tracing_stop() and tracing_start()
around it)

Locally these have been used in our bug chasing for stability issues and
were helpful.

As Andrew suggested, add a configurable global 'kernel_sys_info' knob. 
When error scenarios like panic/hung-task/lockup etc. don't set up their
own sys_info knob and call sys_info() with parameter "0", this global
knob will take effect.  It could be used for other kernel cases like OOM,
which may not need one dedicated sys_info knob.


This patch (of 4):

Some sys_info names were not updated during patch iterations, while
the right names are defined in kernel/sys_info.c.

Link: https://lkml.kernel.org/r/20251113111039.22701-1-feng.tang@linux.alibaba.com
Link: https://lkml.kernel.org/r/20251113111039.22701-2-feng.tang@linux.alibaba.com
Fixes: d747755917bf ("panic: add 'panic_sys_info' sysctl to take human readable string parameter")
Signed-off-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Kuan-Wei Chiu
9ab38c5216 Revert "lib/plist.c: enforce memory ordering in plist_check_list"
This reverts commit 7abcb84f953df037d40fad66f2109db318dd155b.

The introduction of WRITE_ONCE() calls for the 'prev' and 'next' variables
inside plist_check_list() was a misapplication.  WRITE_ONCE() is
fundamentally a compiler barrier designed to prevent compiler
optimizations (like caching or reordering) on shared memory locations. 
However, the variables 'prev' and 'next' are local, stack-allocated
pointers accessed only by the current thread's invocation of the function.

Since these pointers are thread-local and are never accessed concurrently,
applying WRITE_ONCE() to them is semantically incorrect and unnecessary. 
Furthermore, the use of WRITE_ONCE() on local variables prevents the
compiler from performing standard optimizations, such as keeping these
variables cached solely in CPU registers throughout the loop, potentially
introducing performance overhead.  Restore the conventional C assignment
for local loop variables, allowing the compiler to generate optimal code.

Link: https://lkml.kernel.org/r/20251113193413.499309-1-visitorckw@gmail.com
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: I Hsin Cheng <richard120310@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Gustavo Padovan
b50144900a MAINTAINERS: remove Gustavo from sync framework
I haven't been involved in the work anymore for some time.  It is only
fair that I remove myself from it and let others continue to take care of
it.

Link: https://lkml.kernel.org/r/20251112134330.64130-1-gustavo.padovan@collabora.com
Signed-off-by: Gustavo Padovan <gustavo@padovan.org>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Xie Yuanbin
242b872239 include/linux/once_lite.h: fix judgment in WARN_ONCE with clang
For c code:
```c
extern int xx;
void test(void)
{
	if (WARN_ONCE(xx, "x"))
		__asm__ volatile ("nop":::);
}
```

Clang will generate the following assembly code:
```assemble
test:
	movl	xx(%rip), %eax // Assume xx == 0 (likely case)
	testl	%eax, %eax // judge once
	je	.LBB0_3    // jump to .LBB0_3
	testb	$1, test.__already_done(%rip)
	je	.LBB0_2
.LBB0_3:
	testl	%eax, %eax // judge again
	je	.LBB0_5    // jump to .LBB0_5
.LBB0_4:
	nop
.LBB0_5:
	retq
	// omit
```

In the above code, `xx == 0` should be the likely case, yet on that path
xx is tested twice.

Test info:
1. kernel source:
linux-next
commit 9c0826a5d9aa4d52206d ("Add linux-next specific files for 20251107")
2. compiler:
clang: Debian clang version 21.1.4 (8) with
Debian LLD 21.1.4 (compatible with GNU linkers)
3. config:
based on the default x86_64_defconfig, with these settings:
CONFIG_MITIGATION_RETHUNK=n
CONFIG_STACKPROTECTOR=n

Add unlikely to __ret_cond to help the compiler optimize correctly.

[akpm@linux-foundation.org: undo whitespace changes]
Link: https://lkml.kernel.org/r/20251109083715.24495-1-qq570070308@gmail.com
Signed-off-by: Xie Yuanbin <qq570070308@gmail.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Maninder Singh <maninder1.s@samsung.com>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
Ryusuke Konishi
1ab980e90c MAINTAINERS: update nilfs2 entry
Viacheslav has kindly offered to help with the maintenance of nilfs2 by
upstreaming patches, similar to the HFS/HFS+ tree.  I've accepted his
offer, and will therefore add him as a co-maintainer and switch the
project's git tree for that role.

At the same time, change the outdated status field to Maintained to
reflect the current state.

Link: https://lkml.kernel.org/r/20251107153530.9023-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Acked-by: Viacheslav Dubeyko <slava@dubeyko.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:43 -08:00
zhang jiao
14954cd190 fs/proc/page: remove unused KPMBITS
KPMBITS is never referenced in the code. Just remove it.

Link: https://lkml.kernel.org/r/20251106010735.1603-1-zhangjiao2@cmss.chinamobile.com
Signed-off-by: zhang jiao <zhangjiao2@cmss.chinamobile.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Liu Ye <liuye@kylinos.cn>
Cc: Luiz Capitulino <luizcap@redhat.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
Andy Shevchenko
f3fb126fdc math.h: amend abs() kernel-doc and add a note about signed type limits
- amend the kernel-doc so the description is decoupled from the
  parameter descriptions.

- add a note to explain behaviour for the signed types when supplied
  value is the minimum (e.g., INT_MIN for int type).

Link: https://lkml.kernel.org/r/20251106152051.2361551-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
Ilya Leoshkevich
581ee79a25 scripts/gdb/symbols: make BPF debug info available to GDB
One can debug BPF programs with QEMU gdbstub by setting a breakpoint on
bpf_prog_kallsyms_add(), waiting for a hit with a matching aux.name, and
then setting a breakpoint on bpf_func.  This is tedious, error-prone, and
also lacks line numbers.

Automate this in a way similar to the existing support for modules in
lx-symbols.

Enumerate and monitor changes to both BPF kallsyms and JITed progs.  For
each ksym, generate and compile a synthetic .s file containing the name,
code, and size.  In addition, if this ksym is also a prog, and not a
trampoline, add line number information.

Ensure that this is a no-op if the kernel is built without BPF support or
if "as" is missing.  In theory the "as" dependency may be dropped by
generating the synthetic .o file manually, but this is too much complexity
for too little benefit.

Now one can debug BPF progs out of the box like this:

    (gdb) lx-symbols -bpf
    (gdb) b bpf_prog_4e612a6a881a086b_arena_list_add
    Breakpoint 2 (bpf_prog_4e612a6a881a086b_arena_list_add) pending.

    # ./test_progs -t arena_list

    Thread 4 hit Breakpoint 2, bpf_prog_4e612a6a881a086b_arena_list_add ()
        at linux/tools/testing/selftests/bpf/progs/arena_list.c:51
    51              list_head = &global_head;
    (gdb) n
    bpf_prog_4e612a6a881a086b_arena_list_add () at linux/tools/testing/selftests/bpf/progs/arena_list.c:53
    53              for (i = zero; i < cnt && can_loop; i++) {

This also works for subprogs.

Link: https://lkml.kernel.org/r/20251106124600.86736-3-iii@linux.ibm.com
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
Ilya Leoshkevich
caa71919a6 scripts/gdb/radix-tree: add lx-radix-tree-command
Patch series "scripts/gdb/symbols: make BPF debug info available to GDB",
v2.

This series greatly simplifies debugging BPF progs when using QEMU gdbstub
by providing symbol names, sizes, and line numbers to GDB.

Patch 1 adds radix tree iteration, which is necessary for parsing
prog_idr.  Patch 2 is the actual implementation; its description contains
some details on how to use this.


This patch (of 2):

Add a function and a command to iterate over radix tree contents. 
Duplicate the C implementation in Python, but drop support for tagging.

Link: https://lkml.kernel.org/r/20251106124600.86736-1-iii@linux.ibm.com
Link: https://lkml.kernel.org/r/20251106124600.86736-2-iii@linux.ibm.com
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Daniel Borkman <daniel@iogearbox.net>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
Pratyush Yadav
c9dddd9816 MAINTAINERS: add Pratyush as a reviewer for KHO
I have been reviewing most patches for KHO already, and it is easier to
spot them if I am directly in Cc.

Link: https://lkml.kernel.org/r/20251105102022.18798-1-pratyush@kernel.org
Signed-off-by: Pratyush Yadav <pratyush@kernel.org>
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Pratyush Yadav <pratyush@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
1d1ef8c1fb lib: test_mul_u64_u64_div_u64(): test the 32bit code on 64bit
There are slight differences in the mul_u64_add_u64_div_u64() code between
32bit and 64bit systems.

Compile and test the 32bit version on 64bit hosts for better test
coverage.

Link: https://lkml.kernel.org/r/20251105201035.64043-10-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
d10bb374c4 lib: mul_u64_u64_div_u64(): optimise the divide code
Replace the bit by bit algorithm with one that generates 16 bits per
iteration on 32bit architectures and 32 bits on 64bit ones.

On my zen 5 this reduces the time for the tests (using the generic code)
from ~3350ns to ~1000ns.

Running the 32bit algorithm on 64bit x86 takes ~1500ns.  It'll be slightly
slower on a real 32bit system, mostly due to register pressure.

The savings for 32bit x86 are much higher (tested in userspace).  The
worst case (lots of bits in the quotient) drops from ~900 clocks to ~130
(pretty much independent of the arguments).  Other 32bit architectures may
see better savings.

It is possible to optimise for divisors that span less than
__LONG_WIDTH__/2 bits.  However I suspect they don't happen that often and
it doesn't remove any slow cpu divide instructions which dominate the
result.

Typical improvements for 64bit random divides:
               old     new
sandy bridge:  470     150
haswell:       400     144
piledriver:    960     467   I think rdpmc is very slow.
zen5:          244      80
(Timing is 'rdpmc; mul_div(); rdpmc' with the multiply depending on the
first rdpmc and the second rdpmc depending on the quotient.)

Object code (64bit x86 test program): old 0x173 new 0x141.

Link: https://lkml.kernel.org/r/20251105201035.64043-9-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
630f96a687 lib: mul_u64_u64_div_u64(): optimise multiply on 32bit x86
gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a +
u32_b).  As well as the extra instructions it can generate a lot of spills
to stack (including spills of constant zeros and even multiplies by
constant zero).

mul_u32_u32() already exists to optimise the multiply.  Add a similar
add_u64_32() for the addition.  Disable both for clang - it generates
better code without them.

Move the 64x64 => 128 multiply into a static inline helper function for
code clarity.  No need for the a/b_hi/lo variables, the implicit casts on
the function calls do the work for us.  Should have minimal effect on the
generated code.

Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply in
mul_u64_add_u64_div_u64().

Link: https://lkml.kernel.org/r/20251105201035.64043-8-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
f0bff2eb04 lib: test_mul_u64_u64_div_u64(): test both generic and arch versions
Change the #if in div64.c so that test_mul_u64_u64_div_u64.c can compile
and test the generic version (including the 'long multiply') on
architectures (eg amd64) that define their own copy.

Test the kernel version and the locally compiled version on all arch. 
Output the time taken (in ns) on the 'test completed' trace.

For reference, on my zen 5, the optimised version takes ~220ns and the
generic version ~3350ns.  Using the native multiply saves ~200ns and
adding back the ilog2() 'optimisation' test adds ~50ns.

Link: https://lkml.kernel.org/r/20251105201035.64043-7-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
500db21917 lib: add tests for mul_u64_u64_div_u64_roundup()
Replicate the existing mul_u64_u64_div_u64() test cases with round up. 
Update the shell script that verifies the table, remove the comment
markers so that it can be directly pasted into a shell.

Rename the divisor from 'c' to 'd' to match mul_u64_add_u64_div_u64().

If any tests fail then fail the module load with -EINVAL.

Link: https://lkml.kernel.org/r/20251105201035.64043-6-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00
David Laight
6480241f31 lib: add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup()
The existing mul_u64_u64_div_u64() rounds down, a 'rounding up' variant
needs 'divisor - 1' adding in between the multiply and divide so cannot
easily be done by a caller.

Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d and
implement the 'round down' and 'round up' using it.

Update the x86-64 asm to optimise for 'c' being a constant zero.

Add kernel-doc definitions for all three functions.

Link: https://lkml.kernel.org/r/20251105201035.64043-5-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
David Laight
d91f891d58 lib: mul_u64_u64_div_u64(): simplify check for a 64bit product
If the product is only 64bits div64_u64() can be used for the divide. 
Replace the pre-multiply check (ilog2(a) + ilog2(b) <= 62) with a simple
post-multiply check that the high 64bits are zero.

This has the advantage of being simpler, more accurate and less code.  It
will always be faster when the product is larger than 64bits.

Most 64bit cpu have a native 64x64=128 bit multiply, this is needed (for
the low 64bits) even when div64_u64() is called - so the early check gains
nothing and is just extra code.

32bit cpu will need a compare (etc) to generate the 64bit ilog2() from two
32bit bit scans - so that is non-trivial.  (Never mind the mess of x86's
'bsr' and any oddball cpu without fast bit-scan instructions.) Whereas the
additional instructions for the 128bit multiply result are pretty much one
multiply and two adds (typically the 'adc $0,%reg' can be run in parallel
with the instruction that follows).

The only outliers are 64bit systems without 128bit multiply and simple in
order 32bit ones with fast bit scan but needing extra instructions to get
the high bits of the multiply result.  I doubt it makes much difference to
either, the latter is definitely not mainstream.

If anyone is worried about the analysis they can look at the generated
code for x86 (especially when cmov isn't used).

Link: https://lkml.kernel.org/r/20251105201035.64043-4-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
David Laight
08092babd3 lib: mul_u64_u64_div_u64(): combine overflow and divide by zero checks
Since the overflow check always triggers when the divisor is zero
move the check for divide by zero inside the overflow check.
This means there is only one test in the normal path.

Link: https://lkml.kernel.org/r/20251105201035.64043-3-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
David Laight
5944f875ac lib: mul_u64_u64_div_u64(): rename parameter 'c' to 'd'
Patch series "Implement mul_u64_u64_div_u64_roundup()", v5.

The pwm-stm32.c code wants a 'rounding up' version of
mul_u64_u64_div_u64().  This can be done simply by adding 'divisor - 1' to
the 128bit product.  Implement mul_u64_add_u64_div_u64(a, b, c, d) = (a *
b + c)/d based on the existing code.  Define mul_u64_u64_div_u64(a, b, d)
as mul_u64_add_u64_div_u64(a, b, 0, d) and mul_u64_u64_div_u64_roundup(a,
b, d) as mul_u64_add_u64_div_u64(a, b, d-1, d).

Only x86-64 has an optimised (asm) version of the function.  That is
optimised to avoid the 'add c' when c is known to be zero.  In all other
cases the extra code will be noise compared to the software divide code.

The test module has been updated to test mul_u64_u64_div_u64_roundup() and
also enhanced it to verify the C division code on x86-64 and the 32bit
division code on 64bit.


This patch (of 9):

Change the prototype from mul_u64_u64_div_u64(u64 a, u64 b, u64 c) to
mul_u64_u64_div_u64(u64 a, u64 b, u64 d).  Using 'd' for 'divisor' makes
more sense.

An upcoming change adds a 'c' parameter to calculate (a * b + c)/d.

Link: https://lkml.kernel.org/r/20251105201035.64043-1-david.laight.linux@gmail.com
Link: https://lkml.kernel.org/r/20251105201035.64043-2-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Christoph Hellwig
af9b65d686 kernel/hung_task: unexport sysctl_hung_task_timeout_secs
This was added by the bcachefs pull requests despite various
objections, and with bcachefs removed is now unused.

This reverts commit 5c3273ec3c6a ("kernel/hung_task.c: export
sysctl_hung_task_timeout_secs").

Link: https://lkml.kernel.org/r/20251104121920.2430568-1-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Andy Shevchenko
bd97c97641 util_macros.h: fix kernel-doc for u64_to_user_ptr()
The added documentation to u64_to_user_ptr() misspelled the function name.
Fix it.

Link: https://lkml.kernel.org/r/20251104183834.1046584-1-andriy.shevchenko@linux.intel.com
Fixes: 029c896c4105 ("kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alexandru Ardelean <aardelean@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Carlos López
464c7ea5c3 checkpatch: add IDR to the deprecated list
As of commit 85656ec193e9, the IDR interface is marked as deprecated in
the documentation, but no checks are made in that regard for new code. 
Add the existing IDR initialization APIs to the deprecated list in
checkpatch, so that if new code is introduced using these APIs, a warning
is emitted.

Link: https://lkml.kernel.org/r/20251031111908.2266077-2-clopez@suse.de
Signed-off-by: Carlos López <clopez@suse.de>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Borislav Petkov (AMD)
7f37d88f5c lib/Kconfig.debug: cleanup CONFIG_DEBUG_SECTION_MISMATCH help text
Simplify formulations, correct flow, split it into proper paragraphs and
update structure.

No functional changes.

Link: https://lkml.kernel.org/r/20251029122743.1110-1-bp@kernel.org
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Deepanshu Kartikey
93ce0ff117 ocfs2: validate cl_bpc in allocator inodes to prevent divide-by-zero
The chain allocator field cl_bpc (blocks per cluster) is read from disk
and used in division operations without validation.  A corrupted
filesystem image with cl_bpc=0 causes a divide-by-zero crash in the
kernel:

  divide error: 0000 [#1] PREEMPT SMP KASAN
  RIP: 0010:ocfs2_bg_discontig_add_extent fs/ocfs2/suballoc.c:335 [inline]
  RIP: 0010:ocfs2_block_group_fill+0x5bd/0xa70 fs/ocfs2/suballoc.c:386
  Call Trace:
   ocfs2_block_group_alloc+0x7e9/0x1330 fs/ocfs2/suballoc.c:703
   ocfs2_reserve_suballoc_bits+0x20a6/0x4640 fs/ocfs2/suballoc.c:834
   ocfs2_reserve_new_inode+0x4f4/0xcc0 fs/ocfs2/suballoc.c:1074
   ocfs2_mknod+0x83c/0x2050 fs/ocfs2/namei.c:306

This patch adds validation in ocfs2_validate_inode_block() to ensure
cl_bpc matches the expected value calculated from the superblock's cluster
size and block size for chain allocator inodes (identified by
OCFS2_CHAIN_FL).

Moving the validation to inode validation time (rather than allocation time)
has several benefits:
- Validates once when the inode is read, rather than on every allocation
- Protects all code paths that use cl_bpc (allocation, resize, etc.)
- Follows the existing pattern of inode validation in OCFS2
- Centralizes validation logic

The validation catches both:
- Zero values that cause divide-by-zero crashes
- Non-zero but incorrect values indicating filesystem corruption or
  mismatched filesystem geometry

With this fix, mounting a corrupted filesystem produces:

  OCFS2: ERROR (device loop0): ocfs2_validate_inode_block: Inode 74
         has corrupted cl_bpc: ondisk=0 expected=16

instead of a kernel crash.

[dmantipov@yandex.ru: combine into the series and tweak the message to fit the commonly used style]
Link: https://lkml.kernel.org/r/20251030153003.1934585-2-dmantipov@yandex.ru
Link: https://lore.kernel.org/ocfs2-devel/20251026132625.12348-1-kartikey406@gmail.com/T/#u [v1]
Link: https://lore.kernel.org/all/20251027124131.10002-1-kartikey406@gmail.com/T/ [v2]
Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reported-by: syzbot+fd8af97c7227fe605d95@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=fd8af97c7227fe605d95
Tested-by: syzbot+fd8af97c7227fe605d95@syzkaller.appspotmail.com
Suggested-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Heming Zhao <heming.zhao@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:41 -08:00
Dmitry Antipov
e1c70505ee ocfs2: add extra consistency checks for chain allocator dinodes
When validating chain allocator dinode in 'ocfs2_validate_inode_block()',
add extra checks whether a) the maximum amount of chain records in
'struct ocfs2_chain_list' matches the value calculated based on the
filesystem block size, and b) the next free slot index is within the valid
range.

Link: https://lkml.kernel.org/r/20251030153003.1934585-1-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reported-by: syzbot+77026564530dbc29b854@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=77026564530dbc29b854
Reported-by: syzbot+5054473a31f78f735416@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=5054473a31f78f735416
Suggested-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Deepanshu Kartikey <kartikey406@gmail.com>
Cc: Heming Zhao <heming.zhao@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
9125163273 panic: sys_info: factor out read and write handlers
For the sake of the code readability and easier maintenance factor out
read and write sys_info handlers.

[akpm@linux-foundation.org: coding-style cleanups]
Link: https://lkml.kernel.org/r/20251030132007.3742368-7-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Feng Tang <feng.tang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
f791dcc842 panic: sys_info: deduplicate local variable 'table' assignments
Both handlers use the local 'table' variable and assign the same data
to it, deduplicate that.

Link: https://lkml.kernel.org/r/20251030132007.3742368-6-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
eb72c4667f panic: sys_info: rewrite a fix for a compilation error (make W=1)
Compiler was not happy about dead variable in use:

lib/sys_info.c:52:19: error: variable 'sys_info_avail' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration]
   52 | static const char sys_info_avail[] = "tasks,mem,timers,locks,ftrace,all_bt,blocked_tasks";
      |                   ^~~~~~~~~~~~~~

This was fixed by adding __maybe_unused attribute that just hides the
issue and didn't actually fix the root cause.  Rewrite the fix by moving
the local variable from stack to a heap.

As a side effect this drops unneeded "synchronisation" of duplicative info
and also makes code ready for the further refactoring.

Link: https://lkml.kernel.org/r/20251030132007.3742368-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Feng Tang <feng.tang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
d13adc6147 panic: sys_info: replace struct sys_info_name with plain array of strings
There is no need to keep a custom structure just for the need of a plain
array of strings.  Replace struct sys_info_name with plain array of
strings.

With that done, simplify the code, in particular, naturally use
for_each_set_bit() when iterating over si_bits_global bitmap.

Link: https://lkml.kernel.org/r/20251030132007.3742368-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Cc: Feng Tang <feng.tang@linux.alibaba.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
760fc597c3 panic: sys_info: align constant definition names with parameters
Align constant definition names with parameters to make it easier to map. 
It's also better to maintain and extend the names while keeping their
uniqueness.

Link: https://lkml.kernel.org/r/20251030132007.3742368-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Andy Shevchenko
d79a3aeb74 panic: sys_info: capture si_bits_global before iterating over it
Patch series "panic: sys_info: Refactor and fix a potential issue", v3.

While targeting the compilation issue due to dangling variable, I have
noticed more opportunities for refactoring that helps to avoid above
mentioned compilation issue in a cleaner way and also fixes a potential
problem with global variable access.


This patch (of 6):

The for-loop might re-read the content of the memory the si_bits_global
points to on each iteration.  Instead, just capture it for the sake of
consistency and use that instead.

Link: https://lkml.kernel.org/r/20251030132007.3742368-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20251030132007.3742368-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Feng Tang <feng.tang@linux.alibaba.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:40 -08:00
Jeff Layton
898f944652 lockd: don't allow locking on reexported NFSv2/3
Since commit 9254c8ae9b81 ("nfsd: disallow file locking and delegations
for NFSv4 reexport"), file locking when reexporting an NFS mount via
NFSv4 is expressly prohibited by nfsd. Do the same in lockd:

Add a new nlmsvc_file_cannot_lock() helper that will test whether file
locking is allowed for a given file, and return nlm_lck_denied_nolocks
if it isn't.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Olga Kornievskaia <okorniev@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-20 16:29:35 -05:00
Christoph Hellwig
f6dcad1d74 MAINTAINERS: add a nfsd blocklayout reviewer
Add a minimal entry for the block layout driver to make sure Christoph
who wrote the code gets Cced on all patches.  The actual maintenance
stays with the nfsd maintainer team.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-20 16:29:35 -05:00
David Laight
d1cadd4bfc nodemask: use min() instead of min_t()
min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'.
Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long'
and so cannot discard significant bits.

In this case the 'unsigned long' value is small enough that the result
is ok.

Detected by an extra check added to min_t().

Signed-off-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-20 10:02:22 -05:00
Peng Fan
6f880e7bd1 remoteproc: imx_dsp_rproc: Simplify start/stop error handling
Replace goto-based error handling with early return pattern in
imx_dsp_rproc_{start,stop}() functions, and simplify if-else logic.

No functional changes, only code structure improvements for better
maintainability.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-12-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:16:55 -07:00
Peng Fan
5c33a631a5 remoteproc: imx_rproc: Remove enum imx_rproc_method
There is no user of enum imx_rproc_method after moving to the ops-based
method. Remove it.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-11-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:15:59 -07:00
Peng Fan
3f5c1277a9 remoteproc: imx_dsp_rproc: Simplify IMX_RPROC_RESET_CONTROLLER switch case
Introduce imx_dsp_rproc_reset_ctr_{start, stop, detect_mode}() helper
functions for i.MX variants using IMX_RPROC_RESET_CONTROLLER to manage
remote processors.

Allows the removal of the IMX_RPROC_RESET_CONTROLLER switch-case blocks
from imx_dsp_rproc_[start,stop,detect_mode](), resulting in cleaner and
more maintainable code.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-10-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:14:59 -07:00
Peng Fan
d5eb4d512f remoteproc: imx_dsp_rproc: Simplify IMX_RPROC_SCU_API switch case
Introduce imx_dsp_rproc_scu_api_{start, stop, detect_mode}() helper
functions for i.MX variants using IMX_RPROC_SCU_API to manage remote
processors.

Allows the removal of the IMX_RPROC_SCU_API switch-case blocks from
imx_dsp_rproc_[start,stop,detect_mode](), resulting in cleaner and more
maintainable code.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-9-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:13:58 -07:00
Peng Fan
9f356d1277 remoteproc: imx_dsp_rproc: Simplify IMX_RPROC_MMIO switch case
Introduce imx_dsp_rproc_mmio_{start, stop, detect_mode}() helper functions
for i.MX variants using IMX_RPROC_MMIO to manage remote processors.

Allows the removal of the IMX_RPROC_MMIO switch-case blocks from
imx_dsp_rproc_[start,stop,detect_mode](), resulting in cleaner and more
maintainable code.

No functional changes.

Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-8-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:12:46 -07:00
Peng Fan
66395eac5d remoteproc: imx_dsp_rproc: Move imx_dsp_rproc_dcfg closer to imx_dsp_rproc_of_match
Move the imx_dsp_rproc_dcfg structure definitions closer to
imx_dsp_rproc_of_match to prepare for adding start/stop/detect_mode ops for
each i.MX variant.

Avoids the need to declare function prototypes such as
'static int imx_dsp_rproc_mbox_init(struct imx_dsp_rproc *priv)' at the
beginning of the file, improving code organization and readability.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-7-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:11:52 -07:00
Peng Fan
606e481169 remoteproc: imx_dsp_rproc: Use start/stop/detect_mode ops from imx_rproc_dcfg
Allow each platform to provide its own implementation of start/stop/
detect_mode operations, and prepare to eliminate the need for multiple
switch-case statements.

Improve code readability and maintainability by encapsulating
platform-specific behavior.

No functional changes.

Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-6-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:10:32 -07:00
Peng Fan
8049dc7b63 remoteproc: imx_dsp_rproc: Drop extra space
Drop extra space between return and zero.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-5-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:09:21 -07:00
Peng Fan
e819a62d52 remoteproc: imx_dsp_rproc: Use dev_err_probe() for firmware and mode errors
Use dev_err_probe() to simplify the code. No functional change.

Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-4-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:08:18 -07:00
Peng Fan
4120602423 remoteproc: imx_dsp_rproc: Use devm_pm_runtime_enable() helper
Current code on the cleanup path just disables runtime PM for a device.

Using resource managed version devm_pm_runtime_enable() registers a cleanup
callback that sets autosuspend to false and then disables runtime PM for
a device. So, basically the same functionality as we don't use autosuspend
anyway.

As a result, the .remove callback is no longer needed, reducing boilerplate
code.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-3-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:07:03 -07:00
Peng Fan
36951036a7 remoteproc: imx_dsp_rproc: Use devm_rproc_add() helper
Replace manual rproc_add() and cleanup logic with devm_rproc_add(), which
ties the remoteproc lifecycle to the device's lifecycle. This simplifies
error handling and ensures proper cleanup.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-2-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:06:05 -07:00
Peng Fan
3003773ad6 remoteproc: imx_dsp_rproc: Simplify power domain attach and error handling
Refactor imx_dsp_attach_pm_domains() to use devm_pm_domain_attach_list()
directly, removing manual detach logic and simplifying resource management.

Also replace verbose error handling in imx_dsp_rproc_probe() with
dev_err_probe() for cleaner and more consistent error reporting.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Reviewed-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Tested-by: Iuliana Prodan <iuliana.prodan@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251119-imx-dsp-2025-11-19-v4-1-adafd342d07b@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-19 09:04:53 -07:00
Eric Biggers
78cd170d03 nfsd: Use MD5 library instead of crypto_shash
Update NFSD's support for "legacy client tracking" (which uses MD5) to
use the MD5 library instead of crypto_shash.  This has several benefits:

- Simpler code.  Notably, much of the error-handling code is no longer
  needed, since the library functions can't fail.

- Improved performance due to reduced overhead.  A microbenchmark of
  nfs4_make_rec_clidname() shows a speedup from 1455 cycles to 425.

- The MD5 code can now safely be built as a loadable module when nfsd is
  built as a loadable module.  (Previously, nfsd forced the MD5 code to
  be built-in, presumably to work around the unreliability of the
  name-based loading.)  Thus select MD5 from the tristate option NFSD if
  NFSD_LEGACY_CLIENT_TRACKING, instead of from the bool option NFSD_V4.

- Fixes a bug where legacy client tracking was not supported on kernels
  booted with "fips=1", due to crypto_shash not allowing MD5 to be used.
  This particular use of MD5 is not for a cryptographic purpose, though,
  so it is acceptable even when fips=1 (see
  https://lore.kernel.org/r/dae495a93cbcc482f4ca23c3a0d9360a1fd8c3a8.camel@redhat.com/).

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Scott Mayhew <smayhew@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-17 08:46:12 -05:00
NeilBrown
fceb8734e7 nfsd: stop pretending that we cache the SEQUENCE reply.
nfsd does not cache the reply to a SEQUENCE.  As the comment above
nfsd4_replay_cache_entry() says:

 * The sequence operation is not cached because we can use the slot and
 * session values.

The comment above nfsd4_cache_this() suggests otherwise.

 * The session reply cache only needs to cache replies that the client
 * actually asked us to.  But it's almost free for us to cache compounds
 * consisting of only a SEQUENCE op, so we may as well cache those too.
 * Also, the protocol doesn't give us a convenient response in the case
 * of a replay of a solo SEQUENCE op that wasn't cached

The code in nfsd4_store_cache_entry() makes it clear that only responses
beyond 'cstate.data_offset' are actually cached, and data_offset is set
at the end of nfsd4_encode_sequence() *after* the sequence response has
been encoded.

This patch simplifies code and removes the confusing comments.

- nfsd4_is_solo_sequence() is discarded as not-useful.
- nfsd4_cache_this() is now trivial so it too is discarded with the
  code placed in-line at the one call-site in nfsd4_store_cache_entry().
- nfsd4_enc_sequence_replay() is open-coded into
  nfsd4_replay_cache_entry(), and then simplified to (hopefully) make
  the process of replaying a reply clearer.

Signed-off-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-17 08:46:12 -05:00
Bagas Sanjaya
8320b75b2b NFS: nfsd-maintainer-entry-profile: Inline function name prefixes
Sphinx reports htmldocs warnings:

Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:185: ERROR: Unknown target name: "nfsd". [docutils]
Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:188: ERROR: Unknown target name: "nfsdn". [docutils]
Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst:192: ERROR: Unknown target name: "nfsd4m". [docutils]

These are due to Sphinx confusing function name prefixes for external
link syntax. Fix the warnings by inlining the prefixes.

Fixes: 3a1ce35030e1e0 ("NFSD: Add a subsystem policy document")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/linux-next/20251117174218.29365f30@canb.auug.org.au/
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-17 08:46:06 -05:00
Thorsten Blum
a73d4a0556 drivers/xen/xenbus: Replace deprecated strcpy in xenbus_transaction_end
strcpy() is deprecated; inline the read-only string instead. Fix the
function comment and use bool instead of int while we're at it.

Link: https://github.com/KSPP/linux/issues/88
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20251031112145.103257-2-thorsten.blum@linux.dev>
2025-11-17 08:48:40 +01:00
Thorsten Blum
6fec913ff1 drivers/xen/xenbus: Simplify return statement in join()
Don't unnecessarily negate 'buffer' and simplify the return statement.

Reviewed-by: Jason Andryuk <jason.andryuk@amd.com>
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Juergen Gross <jgross@suse.com>
Message-ID: <20251112171410.3140-2-thorsten.blum@linux.dev>
2025-11-17 07:55:10 +01:00
Chuck Lever
3a1ce35030 NFSD: Add a subsystem policy document
Steer contributors to NFSD's patchworks instance, list our patch
submission preferences, and more. The new document is based on the
existing netdev and xfs subsystem policy documents.

This is an attempt to add transparency to the process of accepting
contributions to NFSD and getting them merged upstream.

Suggested-by: "Darrick J. Wong" <djwong@kernel.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: NeilBrown <neil@brown.name>
[ cel: Hand-edits to address review comments ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Jeff Layton
6b3b697d65 sunrpc: allocate a separate bvec array for socket sends
svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of
rq_bvec as the start, but doesn't reduce the array length by one, which
could lead to an array overrun. Also, rq_bvec is always rq_maxpages in
length, which can be too short in some cases, since the TCP record
marker consumes a slot.

Fix both problems by adding a separate bvec array to the svc_sock that
is specifically for sending. For TCP, make this array one slot longer
than rq_maxpages, to account for the record marker. For UDP, only
allocate as large an array as we need since it's limited to 64k of
payload.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
ebd3330d1c SUNRPC: Improve "fragment too large" warning
Including the client IP address that generated the overrun traffic
seems like it would be helpful. The message now reads:

  kernel: svc: nfsd oversized RPC fragment (1064958 octets) from 100.64.0.11:45866

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
d686e64e93 NFSD: Implement NFSD_IO_DIRECT for NFS READ
Add an experimental option that forces NFS READ operations to use
direct I/O instead of reading through the NFS server's page cache.

There is already at least one other layer of read caching: the page
cache on NFS clients.

The server's page cache, in many cases, is unlikely to provide
additional benefit. Some benchmarks have demonstrated that the
server's page cache is actively detrimental for workloads whose
working set is larger than the server's available physical memory.

For instance, on small NFS servers, cached NFS file content can
squeeze out local memory consumers. For large sequential workloads,
an enormous amount of data flows into and out of the page cache
and is consumed by NFS clients exactly once -- caching that data
is expensive to do and totally valueless.

For now this is a hidden option that can be enabled on test
systems for benchmarking. In the longer term, this option might
be enabled persistently or per-export. When the exported file
system does not support direct I/O, NFSD falls back to using
either DONTCACHE or buffered I/O to fulfill NFS READ requests.

Suggested-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
d7de37d6d7 NFSD: Relocate the xdr_reserve_space_vec() call site
In order to detect when a direct READ is possible, we need the send
buffer's .page_len to be zero when there is nothing in the buffer's
.pages array yet.

However, when xdr_reserve_space_vec() extends the size of the
xdr_stream to accommodate a READ payload, it adds to the send
buffer's .page_len.

It should be safe to reserve the stream space /after/ the VFS read
operation completes. This is, for example, how an NFSv3 READ works:
the VFS read goes into the rq_bvec, and is then added to the send
xdr_stream later by svcxdr_encode_opaque_pages().

Now that xdr_reserve_space_vec() uses the number of bytes actually
read, the xdr_truncate_encode() call is no longer necessary.

Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Mike Snitzer
803bc849f0 NFSD: pass nfsd_file to nfsd_iter_read()
Prepare for nfsd_iter_read() to use the DIO alignment stored in
nfsd_file by passing the nfsd_file to nfsd_iter_read() rather than
just the file which is associated with the nfsd_file.

This means nfsd4_encode_readv() now also needs the nfsd_file rather
than the file.  Instead of changing the file arg to be the nfsd_file,
we discard the file arg as the nfsd_file (and indeed the file) is
already available via the "read" argument.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Sergey Bashirov
cc6c40e09d NFSD/blocklayout: Support multiple extents per LAYOUTGET
Allow the pNFS server to respond with multiple extents to a LAYOUTGET
request, thereby avoiding unnecessary load on the server and improving
performance for the client. The number of LAYOUTGET requests is
significantly reduced for various file access patterns, including
random and parallel writes.

Additionally, this change allows the client to request layouts with the
loga_minlength value greater than the minimum possible length of a single
extent in XFS. We use this functionality to fix a livelock in the client.

Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Sergey Bashirov
0cd0d15d47 NFSD/blocklayout: Introduce layout content structure
Add a layout content structure instead of a single extent. The ability
to store and encode an array of extents is then used to implement support
for multiple extents per LAYOUTGET.

Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Sergey Bashirov
a1dce715c6 NFSD/blocklayout: Extract extent mapping from proc_layoutget
No changes in functionality. Split the proc_layoutget function to
create a helper function that maps a single extent to the requested
range. This helper function is then used to implement support for
multiple extents per LAYOUTGET.

Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Sergey Bashirov
3524b021b0 NFSD/blocklayout: Fix minlength check in proc_layoutget
The extent returned by the file system may have a smaller offset than
the segment offset requested by the client. In this case, the minimum
segment length must be checked against the requested range. Otherwise,
the client may not be able to continue the read/write operation.

Fixes: 8650b8a05850 ("nfsd: pNFS block layout driver")
Signed-off-by: Sergey Bashirov <sergeybashirov@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
566a414558 svcrdma: Increase the server's default RPC/RDMA credit grant
The range of commits from commit e3274026e2ec ("SUNRPC: move all of
xprt handling into svc_xprt_handle()") to commit 15d39883ee7d
("SUNRPC: change the back-channel queue to lwq") enabled NFSD
performance to scale better as the number of nfsd threads is
increased. These commits were merged in v6.7.

Now that the nfsd thread count can scale to more threads, permit
individual clients to make more use of those threads. Increase the
RPC/RDMA per-connection credit grant from 64 to 128 -- same as the
Linux NFS client.

Simple single client fio-based benchmarking so far shows only
improvement, no regression.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
166274a245 NFSD: Update comment documenting unsupported fattr4 attributes
TIME_CREATE has been supported since commit e377a3e698fb ("nfsd: Add
support for the birth time attribute").

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Matvey Kovalev
bfce8e4273 nfsd: delete unreachable confusing code in nfs4_open_delegation()
op_delegate_type is assigned OPEN_DELEGATE_NONE just before the if-block
where condition specifies it not be equal to OPEN_DELEGATE_NONE. Compiler
treats the block as unreachable and optimizes it out from the resulting
executable.

In that aspect commit d08d32e6e5c0 ("nfsd4: return delegation immediately
if lease fails") notably makes no difference.

Seems it's better to just drop this code instead of fiddling with memory
barriers or atomics.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Signed-off-by: Matvey Kovalev <matvey.kovalev@ispras.ru>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
ccd608e29b NFSD: Add array bounds-checking in nfsd_iter_read()
The *count parameter does not appear to be explicitly restricted
to being smaller than rsize, so it might be possible to overrun
the rq_bvec or rq_pages arrays.

Rather than overrunning these arrays (damage done!) and then WARNING
once, let's harden the loop so that it terminates before the end of
the arrays is reached. This should result in a short read, which is
OK -- clients recover by sending additional READ requests for the
remaining unread bytes.

Reported-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Mike Snitzer <snitzer@kernel.org>
Reviewed-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Jeff Layton
b5fc406bc7 nfsd: switch the default for NFSD_LEGACY_CLIENT_TRACKING to "n"
We added this Kconfig option a little over a year ago. Switch the
default to "n" in preparation for its eventual removal.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
NeilBrown
4552f4e3f2 nfsd: change nfs4_client_to_reclaim() to allocate data
The calling convention for nfs4_client_to_reclaim() is clumsy in that
the caller needs to free memory if the function fails.  It is much
cleaner if the function frees its own memory.

This patch changes nfs4_client_to_reclaim() to re-allocate the .data
fields to be stored in the newly allocated struct nfs4_client_reclaim,
and to free everything on failure.

__cld_pipe_inprogress_downcall() needs to allocate the data anyway to
copy it from user-space, so now that data is allocated twice.  I think
that is a small price to pay for a cleaner interface.

Signed-off-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
NeilBrown
89bd77cf43 nfsd: move name lookup out of nfsd4_list_rec_dir()
nfsd4_list_rec_dir() is called with two different callbacks.
One of the callbacks uses vfs_rmdir() to remove the directory.
The other doesn't use the dentry at all, just the name.

As only one callback needs the dentry, this patch moves the lookup into
that callback.  This prepares for changes to how directory operations
are locked.

Signed-off-by: NeilBrown <neil@brown.name>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Chuck Lever
bf94dea7fd svcrdma: Release transport resources synchronously
NFSD has always supported adding network listeners. The new netlink
protocol now enables the removal of listeners.

Olga noticed that if an RDMA listener is removed and immediately
re-added, the deferred __svc_rdma_free() function might not have
run yet, so some or all of the old listener's RDMA resources
linger, which prevents a new listener on the same address from
being created.

Also, svc_xprt_free() does a module_put() just after calling
->xpo_free(). That means if there is deferred work going on, the
module could be unloaded before that work is even started,
resulting in a UAF.

Neil asks:
> What particular part of __svc_rdma_free() needs to run in order for a
> subsequent registration to succeed?
> Can that bit be run directly from svc_rdma_free() rather than be
> delayed?
> (I know almost nothing about rdma so forgive me if the answers to these
> questions seems obvious)

The reasons I can recall are:

 - Some of the transport tear-down work can sleep
 - Releasing a cm_id is tricky and can deadlock

We might be able to mitigate the second issue with judicious
application of transport reference counting.

Reported-by: Olga Kornievskaia <okorniev@redhat.com>
Closes: https://lore.kernel.org/linux-nfs/20250821204328.89218-1-okorniev@redhat.com/
Suggested-by: NeilBrown <neil@brown.name>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
2025-11-16 18:20:11 -05:00
Haotian Zhang
5bcc5786a0 watchdog: starfive: Fix resource leak in probe error path
If pm_runtime_put_sync() fails after watchdog_register_device()
succeeds, the probe function jumps to err_exit without
unregistering the watchdog device. This leaves the watchdog
registered in the subsystem while the driver fails to load,
resulting in a resource leak.

Add a new error label err_unregister_wdt to properly unregister
the watchdog device.

Fixes: 8bc22a2f1bf0 ("watchdog: starfive: Check pm_runtime_enabled() before decrementing usage counter")
Signed-off-by: Haotian Zhang <vulab@iscas.ac.cn>
Reviewed-by: Wim Van Sebroeck <wim@linux-watchdog.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:29:01 +01:00
Christian Marangi
6fbf541520 dt-bindings: watchdog: airoha: Add support for Airoha AN7583 SoC
Add compatible for Airoha AN7583 SoC. The implementation is exactly the
same as that of Airoha EN7581, hence we add the compatible in addition to
the EN7581 ones.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:12 +01:00
Aleksander Jan Bajkowski
26f2f5ed16 dt-bindings: watchdog: lantiq,wdt: convert bindings to dtschema
Convert the Lantiq WDT Watchdog bindings to yaml format.

Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:12 +01:00
Heiko Stuebner
a8c762cbd1 dt-bindings: watchdog: Add RK3506 compatible
The watchdog used on the RK3506 is still the same snps,dw-wdt compatible
one that is in use since the RK3066 days, so add the RK3506 to the
variant list.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
pw-bot: not-applicable
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:12 +01:00
Jingyi Wang
aa33a6c8ce dt-bindings: watchdog: Document Qualcomm Kaanapali watchdog
Add devicetree binding for watchdog present on Qualcomm Kaanapali SoC.

Signed-off-by: Jingyi Wang <jingyi.wang@oss.qualcomm.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:11 +01:00
Haotian Zhang
25c0b472ea watchdog: wdat_wdt: Fix ACPI table leak in probe function
wdat_wdt_probe() calls acpi_get_table() to obtain the WDAT ACPI table but
never calls acpi_put_table() on any paths. This causes a permanent ACPI
table memory leak.

Add a single cleanup path which calls acpi_put_table() to ensure
the ACPI table is always released.

Fixes: 058dfc767008 ("ACPI / watchdog: Add support for WDAT hardware watchdog")
Suggested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Haotian Zhang <vulab@iscas.ac.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:10 +01:00
Binbin Zhou
e0c50cddbd watchdog: loongson1: Add Loongson-2k0300 watchdog support
According to the manual, the Loongson-2K0300 watchdog is similar to the
Loongson-1, except for some register offsets and inconsistent register
bit definitions. Separate definitions via driver_data suffice.

Co-developed-by: Xiaochuang Mao <maoxiaochuan@loongson.cn>
Signed-off-by: Xiaochuang Mao <maoxiaochuan@loongson.cn>
Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:10 +01:00
Binbin Zhou
e4948e8011 dt-bindings: watchdog: loongson,ls1x-wdt: Add ls2k0300-wdt compatible
Add "loongson,ls2k0300-wdt" compatible to the dt-schema document, which
is similar to Loongson-1 watchdog, but with differences in some register
offsets and bit definitions.

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
pw-bot: not-applicable
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:10 +01:00
Binbin Zhou
9d8ca99d60 watchdog: loongson1: Drop CONFIG_OF
The general recommendation is to not use of_match_ptr() or CONFIG_OF
ifdef.

Drop them.

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:09 +01:00
Binbin Zhou
6121d0b889 watchdog: loongson1: Simplify ls1x_wdt_probe code
Remove meaningless output to simplify ls1x_wdt_probe().

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:09 +01:00
Binbin Zhou
f909b3d4f1 watchdog: loongson1: Add missing MODULE_PARM_DESC
Add documentation for module_param so that they're visible with
modinfo command.

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:08 +01:00
Zoe Gates
f0a4bf61f1 watchdog/diag288: Fix module comment typos
Correct spelling and capitalization in the header comment so the
documentation reads cleanly.

Signed-off-by: Zoe Gates <zoe@zeocities.dev>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:28:08 +01:00
Jack Hsu
a742d1713c dt-bindings: watchdog: Support MediaTek MT8189 wdt
Modify the dt-binding to support the MT8189 wdt dts node.

Signed-off-by: Jack Hsu <jh.hsu@mediatek.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:24:29 +01:00
Louis-Alexis Eyraud
26d21c835f dt-bindings: watchdog: mediatek,mtk-wdt: Add compatible for MT8189 SoC
Add compatible string for the watchdog block on MT8189 SoC, which is
compatible with the one used on MT6589.

Signed-off-by: Louis-Alexis Eyraud <louisalexis.eyraud@collabora.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
pw-bot: not-applicable
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:57 +01:00
Krzysztof Kozlowski
550d1bda39 dt-bindings: mfd: rohm,bd96801-pmic: Correct timeout-sec length and reference watchdog schema
The parent node of ROHM BD96801 PMIC is also holding properties for the
watchdog, thus it should reference watchdog.yaml schema.  OTOH, the
timeout-sec property is used only as one number.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Acked-by: Matti Vaittinen <mazziesaccount@gmail.com>
Acked-by: Lee Jones <lee@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:56 +01:00
Krzysztof Kozlowski
017bca9163 dt-bindings: watchdog: Allow node names named 'pmic'
Watchdog is often part of more complex devices like Power Management ICs
(PMIC), e.g. on rohm,bd96801, and the schema can be referenced by a
binding describing parent (main) node.  Allow another typical name for
such PMIC devices: pmic.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:56 +01:00
Krzysztof Kozlowski
0917135963 dt-bindings: watchdog: Restrict timeout-sec to one number
Linux kernel expects only one number for the watchdog timeout and the
type is an array (defined in property-units.yaml in DT schema), so
restrict the property.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:55 +01:00
Wolfram Sang
babe81b061 watchdog: renesas_wwdt: add driver
This driver adds support for the Renesas Window Watchdog Timer (WWDT).
Because it can only be set up once after boot and we cannot know if this
already happened in early boot stages, it is mandated that the firmware
configures the watchdog. Linux then adapts according to the given
setup. Note that this watchdog only reports an overflow to the Error
Control Module (ECM) and does not reset the SoC on its own.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:55 +01:00
Wolfram Sang
ece1ad19c3 dt-bindings: watchdog: Add Renesas WWDT
Describe the Window Watchdog Timer found on Renesas R-Car SoCs from late
Gen3 onwards.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
pw-bot: not-applicable
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:54 +01:00
Rob Herring (Arm)
fbd10d9670 dt-bindings: watchdog: Convert marvell,orion-wdt to DT schema
Convert the Marvell Orion and Armada watchdog binding to DT schema
format. It's a straight-forward conversion.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:54 +01:00
Rob Herring (Arm)
af34a25336 dt-bindings: watchdog: Convert TI OMAP to DT schema
Convert the TI OMAP watchdog binding to DT schema format. The compatible
string list was incomplete. The "reg" and "interrupts" properties were
missing. "ti,hwmods" is also deprecated and not required.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:53 +01:00
Chin-Ting Kuo
13e8664671 watchdog: aspeed: Add support for AST2700 platform
Add AST2700 platform support to the ASPEED watchdog driver. This includes
a new per-platform configuration with SCU reset status register at
SCU1_070 and support for 5 reset mask registers.

Signed-off-by: Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:53 +01:00
Chin-Ting Kuo
0eb54296dc watchdog: aspeed: Support variable number of reset mask registers
Starting from the AST2600 platform, the SoC design has become more
complex, with an increased number of reset mask registers.
To support this, introduce a new field 'num_reset_masks' in the
'aspeed_wdt_config' structure to specify the number of reset mask
registers per platform. This change removes the need for hardcoded
platform-specific logic and improves scalability for future SoCs.

Signed-off-by: Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:52 +01:00
Chin-Ting Kuo
b3bc229b54 dt-bindings: watchdog: aspeed,ast2400-wdt: Add support for AST2700
Add support for the AST2700 SoC in the ASPEED watchdog device tree
bindings. This includes:

- Adding "aspeed,ast2700-wdt" to the compatible string list.
- Extending the "aspeed,reset-mask" property description for AST2700.
- Defining AST2700-specific reset mask bits in aspeed-wdt.h,
  covering RESET1 to RESET5.

Signed-off-by: Chin-Ting Kuo <chin-ting_kuo@aspeedtech.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:52 +01:00
Wolfram Sang
1cafd2a850 dt-bindings: watchdog: renesas,wdt: add SWDT exception for V3H
The SWDT on V3H has no reset bit. Make resets optional on this SoC.

Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:51 +01:00
Wolfram Sang
fcba285525 dt-bindings: watchdog: factor out RZ/V2H(P) watchdog
Renesas created different watchdog IPs but they are all handled in the
same binding documentation. This leads to a lot of conditional handling
which makes it unnecessarily hard to add new items. Factor out the
RZ/V2H(P) watchdog to make handling easier.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:51 +01:00
Wolfram Sang
909c8ea1ad dt-bindings: watchdog: factor out RZ/G2L watchdog
Renesas created different watchdog IPs but they are all handled in the
same binding documentation. This leads to a lot of conditional handling
which makes it unnecessarily hard to add new items. Factor out the
RZ/G2L watchdog to make handling easier.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:50 +01:00
Wolfram Sang
a3e32b41c2 dt-bindings: watchdog: factor out RZ/N1 watchdog
Renesas created different watchdog IPs but they are all handled in the
same binding documentation. This leads to a lot of conditional handling
which makes it unnecessarily hard to add new items. Factor out the RZ/N1
watchdog to make handling easier.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:50 +01:00
Wolfram Sang
055f0576e8 dt-bindings: watchdog: factor out RZ/A watchdog
Renesas created different watchdog IPs but they are all handled in the
same binding documentation. This leads to a lot of conditional handling
which makes it unnecessarily hard to add new items. Factor out the RZ/A
watchdog to make handling easier.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:49 +01:00
Li Qiang
7aa31ee9ec via_wdt: fix critical boot hang due to unnamed resource allocation
The VIA watchdog driver uses allocate_resource() to reserve a MMIO
region for the watchdog control register. However, the allocated
resource was not given a name, which causes the kernel resource tree
to contain an entry marked as "<BAD>" under /proc/iomem on x86
platforms.

During boot, this unnamed resource can lead to a critical hang because
subsequent resource lookups and conflict checks fail to handle the
invalid entry properly.

Signed-off-by: Li Qiang <liqiang01@kylinos.cn>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
2025-11-15 15:19:48 +01:00
Dan Williams
f7ae6d4ec6 PCI/TSM: Add 'dsm' and 'bound' attributes for dependent functions
PCI/TSM sysfs for physical function 0 devices, i.e. the "DSM" (Device
Security Manager), contains the 'connect' and 'disconnect' attributes.
After a successful 'connect' operation the DSM, its dependent functions
(SR-IOV virtual functions, non-zero multi-functions, or downstream
endpoints of a switch DSM) are candidates for being transitioned into a
TDISP (TEE Device Interface Security Protocol) operational state, via
pci_tsm_bind(). At present sysfs is blind to which devices are capable of
TDISP operation and it is ambiguous which functions are serviced by which
DSMs.

Add a 'dsm' attribute to identify a function's DSM device, and add a
'bound' attribute to identify when a function has entered a TDISP
operational state.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-9-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-14 15:06:57 -08:00
Dan Williams
c316c75d57 PCI/TSM: Add pci_tsm_guest_req() for managing TDIs
A PCIe device function interface assigned to a TVM is a TEE Device
Interface (TDI). A TDI instantiated by pci_tsm_bind() needs additional
steps taken by the TVM to be accepted into the TVM's Trusted Compute
Boundary (TCB) and transitioned to the RUN state.

pci_tsm_guest_req() is a channel for the guest to request TDISP collateral,
like Device Interface Reports, and effect TDISP state changes, like
LOCKED->RUN transitions. Similar to IDE establishment and pci_tsm_bind(),
these are long running operations involving SPDM message passing via the
DOE mailbox.

The path for a TVM to invoke pci_tsm_guest_req() is:
* TSM triggers exit via guest-to-host-interface ABI (implementation specific)
* VMM invokes handler (KVM handle_exit() -> userspace io)
* handler issues request (userspace io handler -> ioctl() ->
  pci_tsm_guest_req())
* handler supplies response
* VMM posts response, notifies/re-enters TVM

This path is purely a transport for messages from TVM to platform TSM. By
design the host kernel does not and must not care about the content of
these messages. I.e. the host kernel is not in the TCB of the TVM.

As this is an opaque passthrough interface, similar to fwctl, the kernel
requires that implementations stay within the bounds defined by 'enum
pci_tsm_req_scope'. Violation of those expectations likely has market and
regulatory consequences. Out of scope requests are blocked by default.

Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-8-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-14 15:06:57 -08:00
Dan Williams
50cbec192f PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs
After a PCIe device has established a secure link and session between a TEE
Security Manager (TSM) and its local Device Security Manager (DSM), the
device or its subfunctions are candidates to be bound to a private memory
context, a TVM. A PCIe device function interface assigned to a TVM is a TEE
Device Interface (TDI).

The pci_tsm_bind() requests the low-level TSM driver to associate the
device with private MMIO and private IOMMU context resources of a given TVM
represented by a @kvm argument. A device in the bound state corresponds to
the TDISP protocol LOCKED state and awaits validation by the TVM. It is a
'struct pci_tsm_link_ops' operation because, similar to IDE establishment,
it involves host side resource establishment and context setup on behalf of
the guest. It is also expected to be performed lazily to allow for
operation of the device in non-confidential "shared" context for pre-lock
configuration.

Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-7-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-14 15:06:57 -08:00
Dan Williams
079115370d PCI/IDE: Initialize an ID for all IDE streams
The PCIe spec defines two types of streams - selective and link.  Each
stream has an ID from the same bucket so a stream ID does not tell the
type.  The spec defines an "enable" bit for every stream and requires
stream IDs to be unique among all enabled streams but there is no such
requirement for disabled streams.

However, when IDE_KM is programming keys, an IDE-capable device needs
to know the type of stream being programmed to write it directly to
the hardware as keys are relatively large, possibly many of them and
devices often struggle with keeping around rather big data not being
used.

Walk through all streams on a device and initialise the IDs to some
unique number, both link and selective.

The weakest part of this proposal is the host bridge ide_stream_ids_ida.
Technically, a Stream ID only needs to be unique within a given partner
pair. However, with "anonymous" / unassigned streams there is no convenient
place to track the available ids. Proceed with an ida in the host bridge
for now, but consider moving this tracking to be an ide_stream_ids_ida per
device.

Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-6-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-14 15:06:57 -08:00
Xu Yilun
f86e51399c PCI/IDE: Add Address Association Register setup for downstream MMIO
The address ranges for downstream Address Association Registers need to
cover memory addresses for all functions (PFs/VFs/downstream devices)
managed by a Device Security Manager (DSM). The proposed solution is to get
the memory (32-bit only) range and prefetchable-memory (64-bit capable)
range from the immediate ancestor downstream port (either the direct-attach
RP or deepest switch port when switch attached).

Similar to RID association, address associations will be set by default if
hardware sets 'Number of Address Association Register Blocks' in the
'Selective IDE Stream Capability Register' to a non-zero value. TSM drivers
can opt-out of the settings by zero'ing out unwanted / unsupported address
ranges. E.g. TDX Connect only supports prefetchable (64-bit capable)
memory ranges for the Address Association setting.

If the immediate downstream port provides both a memory range and
prefetchable-memory range, but the IDE partner port only provides 1 Address
Association Register block then the TSM driver can pick which range to
associate, or let the PCI core prioritize memory.

Note, the Address Association Register setup for upstream requests is still
uncertain so is not included.

Co-developed-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Co-developed-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251114010227.567693-1-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-14 15:06:57 -08:00
Dan Williams
c16af019d9 resource: Introduce resource_assigned() for discerning active resources
A PCI bridge resource lifecycle involves both a "request" and "assign"
phase. At any point in time that resource may not yet be assigned, or may
have failed to assign (because it does not fit).

There are multiple conventions to determine when assignment has not
completed: IORESOURCE_UNSET, IORESOURCE_DISABLED, and checking whether the
resource is parented.

In code paths that are known to not be racing assignment, e.g. post
subsys_initcall(), the most reliable method to judge that a bridge resource
is assigned is to check the resource is parented [1].

Introduce a resource_assigned() helper for this purpose.

Link: http://lore.kernel.org/2b9f7f7b-d6a4-be59-14d4-7b4ffccfe373@linux.intel.com [1]
Suggested-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-4-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-13 17:05:08 -08:00
Dan Williams
e5b5f8b7c2 PCI/TSM: Drop stub for pci_tsm_doe_transfer()
Just like pci_tsm_pf0_{con,de}structor(), in the CONFIG_PCI_TSM=n case there
should be no callers of pci_tsm_doe_transfer().

Reported-by: Xu Yilun <yilun.xu@linux.intel.com>
Closes: http://lore.kernel.org/aRFfk14DJWEVhC/R@yilunxu-OptiPlex-7050
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-3-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-13 17:05:08 -08:00
Dan Williams
110c155e8a drivers/virt: Drop VIRT_DRIVERS build dependency
All of the objects in drivers/virt/ have their own configuration symbols to
gate compilation. I.e. nothing gets added to the kernel with
CONFIG_VIRT_DRIVERS=y in isolation.

Unconditionally descend into drivers/virt/ so that consumers do not need to
add an additional CONFIG_VIRT_DRIVERS dependency.

Fix warnings of the form:

    Kconfig warnings: (for reference only)
       WARNING: unmet direct dependencies detected for TSM
       Depends on [n]: VIRT_DRIVERS [=n]
       Selected by [y]:
       - PCI_TSM [=y] && PCI [=y]

...where PCI_TSM selects CONFIG_TSM, but fails to select
CONFIG_VIRT_DRIVERS.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202511041832.ylcgIiqN-lkp@intel.com/
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251113021446.436830-2-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-13 17:05:08 -08:00
Thorsten Blum
3e4b89e970 nilfs2: replace vmalloc + copy_from_user with vmemdup_user
Replace vmalloc() followed by copy_from_user() with vmemdup_user() to
improve nilfs_ioctl_clean_segments() and nilfs_ioctl_set_suinfo().  Use
kvfree() to free the buffers created by vmemdup_user().

Use u64_to_user_ptr() instead of manually casting the pointers and
remove the obsolete 'out_free' label.

No functional changes intended.

Link: https://lkml.kernel.org/r/20251030154700.7444-1-konishi.ryusuke@gmail.com
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:17 -08:00
Hao Ge
ded7d97442 mailmap: add entry for Hao Ge
Use hao.ge@linux.dev as the main address for kernel work

Link: https://lkml.kernel.org/r/20251030121746.230747-1-hao.ge@linux.dev
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:17 -08:00
Oleg Nesterov
c25d24d0f4 release_task: kill unnecessary rcu_read_lock() around dec_rlimit_ucounts()
rcu_read_lock() was added to shut RCU-lockdep up when this code used
__task_cred()->rcu_dereference(), but after the commit 21d1c5e386bc
("Reimplement RLIMIT_NPROC on top of ucounts") it is no longer needed:
task_ucounts()->task_cred_xxx() takes rcu_read_lock() itself.

NOTE: task_ucounts() returns the pointer to another rcu-protected data,
struct ucounts.  So it should either be used when task->real_cred and thus
task->real_cred->ucounts is stable (release_task, copy_process,
copy_creds), or it should be called under rcu_read_lock().  In both cases
it is pointless to take rcu_read_lock() to read the cred->ucounts pointer.

Link: https://lkml.kernel.org/r/20251026143140.GA22463@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Alexey Gladkov <legion@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Kees Cook <kees@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:17 -08:00
Yu-Chun Lin
f9925019f4 mailmap: add entry for Yu-Chun Lin
Map my personal email to my business email.

Link: https://lkml.kernel.org/r/20251027100309.22035-1-eleanor.lin@realtek.com
Signed-off-by: Yu-Chun Lin <eleanor.lin@realtek.com>
Cc: Kuan-Wei Chiu <visitorckw@gmail.com>
Cc: Stanley Chang <stanley_chang@realtek.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:17 -08:00
Dr. David Alan Gilbert
a0b8c6af29 lib/xxhash: remove more unused xxh functions
xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the
xxh32_state struct is also unused.

xxh64_copy_state() is also unused.

Remove them all.

(Also fixes a comment above the xxh64_state that referred to it as
xxh32_state).

Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org
Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Suggested-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:17 -08:00
Ye Bin
6c2e6e2c1a dynamic_debug: add support for print stack
In practical problem diagnosis, especially during the boot phase, it is
often desirable to know the call sequence.  However, currently, apart from
adding print statements and recompiling the kernel, there seems to be no
good alternative.  If dynamic_debug supported printing the call stack, it
would be very helpful for diagnosing issues.  This patch adds '+d'
support for dumping the stack.

Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com
Signed-off-by: Ye Bin <yebin10@huawei.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jim Cromie <jim.cromie@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Dmitry Antipov
a2b1c419ff ocfs2: add inline inode consistency check to ocfs2_validate_inode_block()
In 'ocfs2_validate_inode_block()', add an extra check whether an inode
with inline data (i.e.  self-contained) has no clusters, thus preventing
an invalid inode from being passed to 'ocfs2_evict_inode()' and below.

Link: https://lkml.kernel.org/r/20251023141650.417129-1-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reported-by: syzbot+c16daba279a1161acfb0@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=c16daba279a1161acfb0
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Heming Zhao <heming.zhao@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Joseph Qi
6e89373cec ocfs2: convert to host endian in ocfs2_validate_inode_block
Convert to host endian when checking OCFS2_VALID_FL to keep consistent
with other checks.

Link: https://lkml.kernel.org/r/20251025123218.3997866-2-joseph.qi@linux.alibaba.com
Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Joseph Qi
c9dff86eb7 ocfs2: use correct endian in ocfs2_dinode_has_extents
Fields in ocfs2_dinode are little endian, convert to host endian when
checking those contents.

Link: https://lkml.kernel.org/r/20251025123218.3997866-1-joseph.qi@linux.alibaba.com
Fixes: fdbb6cd96ed5 ("ocfs2: correct l_next_free_rec in online check")
Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Dmitry Antipov
390ac56cf0 ocfs2: add boundary check to ocfs2_check_dir_entry()
In 'ocfs2_check_dir_entry()', add extra check whether at least the
smallest possible dirent may be located at the specified offset within
bh's data, thus preventing out-of-bounds accesses below.

Link: https://lkml.kernel.org/r/20251013062826.122586-1-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reported-by: syzbot+b20bbf680bb0f2ecedae@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=b20bbf680bb0f2ecedae
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Yury Norov (NVIDIA)
d99dc586ca uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST
Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with
CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is
enabled.  This pollutes exported symbols namespace, and spreads RUST
ifdefery in core files.

It's better to declare a corresponding helper under the rust/helpers,
similarly to how non-underscored copy_{from,to}_user() is handled.

[yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice]
  Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com
Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Tested-by: Alice Ryhl <aliceryhl@google.com>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Andreas Hindborg <a.hindborg@kernel.org>
Cc: Björn Roy Baron <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Trevor Gross <tmgross@umich.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Douglas Anderson
032a730268 init/main.c: wrap long kernel cmdline when printing to logs
The kernel cmdline length is allowed to be longer than what printk can
handle.  When this happens the cmdline that's printed to the kernel ring
buffer at bootup is cutoff and some kernel cmdline options are "hidden"
from the logs.  This undercuts the usefulness of the log message.

Specifically, grepping for COMMAND_LINE_SIZE shows that 2048 is common and
some architectures even define it as 4096.  s390 allows a CONFIG-based
maximum up to 1MB (though it's not expected that anyone will go over the
default max of 4096 [1]).

The maximum message pr_notice() seems to be able to handle (based on
experiment) is 1021 characters.  This appears to be based on the current
value of PRINTKRB_RECORD_MAX as 1024 and the fact that pr_notice() spends
2 characters on the loglevel prefix and we have a '\n' at the end.

While it would be possible to increase the limits of printk() (and
therefore pr_notice()) somewhat, it doesn't appear possible to increase it
enough to fully include a 2048-character cmdline without breaking
userspace.  Specifically on at least two tested userspaces (ChromeOS plus
the Debian-based distro I'm typing this message on) the `dmesg` tool reads
lines from `/dev/kmsg` in 2047-byte chunks.  As per
`Documentation/ABI/testing/dev-kmsg`:

  Every read() from the opened device node receives one record
  of the kernel's printk buffer.
  ...
  Messages in the record ring buffer get overwritten as whole,
  there are never partial messages received by read().

We simply can't fit a 2048-byte cmdline plus the "Kernel command line:"
prefix plus info about time/log_level/etc in a 2047-byte read.

The above means that if we want to avoid the truncation we need to do some
type of wrapping of the cmdline when printing.

Add wrapping to the printout of the kernel command line.  By default, the
wrapping is set to 1021 characters to avoid breaking anyone, but allow
wrapping to be set lower by a Kconfig knob
"CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN".  Any tools that are correctly parsing
the cmdline today (because it is less than 1021 characters) will see no
difference in their behavior.  The format of wrapped output is designed to
be matched by anyone using "grep" to search for the cmdline and also to be
easy for tools to handle.  Anyone who is sure their tools (if any) handle
the wrapped format can choose a lower wrapping value and have prettier
output.

Setting CONFIG_CMDLINE_LOG_WRAP_IDEAL_LEN to 0 fully disables the wrapping
logic.  This means that long command lines will be truncated again, but
this config could be set if command lines are expected to be long and
userspace is known not to handle parsing logs with the wrapping.

Wrapping is based on spaces, ignoring quotes.  All lines are prefixed with
"Kernel command line: " and lines that are not the last line have a " \"
suffix added to them.  The prefix and suffix count towards the line length
for wrapping purposes.  The ideal length will be exceeded if no
appropriate place to wrap is found.

The wrapping function added here is fairly generic and could be made a
library function (somewhat like print_hex_dump()) if it's needed elsewhere
in the kernel.  However, having printk() directly incorporate this
wrapping would be unlikely to be a good idea since it would break
printouts into more than one record without any obvious common line prefix
to tie lines together.  It would also be extra overhead when, in general,
kernel log message should simply be kept smaller than 1021 bytes.  For
some discussion on this topic, see responses to the v1 posting of this
patch [2].

[akpm@linux-foundation.org: make print_kernel_cmdline __init]
[dianders@chromium.org: v4]
  Link: https://lkml.kernel.org/r/20251027082204.v4.1.I095f1e2c6c27f9f4de0b4841f725f356c643a13f@changeid
Link: https://lkml.kernel.org/r/20251023113257.v3.1.I095f1e2c6c27f9f4de0b4841f725f356c643a13f@changeid
Link: https://lore.kernel.org/r/20251021131633.26700Dd6-hca@linux.ibm.com [1]
Link: https://lore.kernel.org/r/CAD=FV=VNyt1zG_8pS64wgV8VkZWiWJymnZ-XCfkrfaAhhFSKcA@mail.gmail.com [2]
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrew Chant <achant@google.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Francesco Valla <francesco@valla.it>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: guoweikang <guoweikang.kernel@gmail.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Hendrik Farr <kernel@jfarr.cc>
Cc: Jeff Xu <jeffxu@chromium.org>
Cc: Kees Cook <kees@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:16 -08:00
Vlad Kulikov
7229d74e5e ipc: create_ipc_ns: drop mqueue mount on sysctl setup failure
If setup_mq_sysctls(ns) fails after mq_init_ns(ns) succeeds, the error
path skipped releasing the internal kernel mqueue mount kept in
ns->mq_mnt. That leaves the vfsmount/superblock referenced until final
namespace teardown, i.e. a resource leak on this rare failure edge.

Unwind it by calling mntput(ns->mq_mnt) before dropping user_ns and
freeing the IPC namespace. This mirrors the normal ordering used in
free_ipc_ns().

Link: https://lkml.kernel.org/r/20251021181341.670297-1-vlad_kulikov_c@pm.me
Signed-off-by: Vlad Kulikov <vlad_kulikov_c@pm.me>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ma Wupeng <mawupeng1@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Dmitry Antipov
aa5b6a72cc ocfs2: add directory size check to ocfs2_find_dir_space_id()
Fix a null-pointer-deref which was detected by KASAN:

KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f]
CPU: 0 UID: 0 PID: 5317 Comm: syz-executor310 Not tainted 6.15.0-syzkaller-12141-gec7714e49479 #0 PREEMPT(full) 

In 'ocfs2_find_dir_space_id()', add extra check whether the directory data
block is large enough to hold at least one directory entry, and raise
'ocfs2_error()' if the former is unexpectedly small.

Link: https://lkml.kernel.org/r/20251013103709.146001-1-dmantipov@yandex.ru
Reported-by: syzbot+ded9116588a7b73c34bc@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=ded9116588a7b73c34bc
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Petr Pavlu
37ade54f38 taint/module: remove unnecessary taint_flag.module field
The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the
taint_flags table as per-module flags.  While this can be trivially
corrected, the issue can be avoided altogether by removing the
taint_flag.module field.

This is possible because, since commit 7fd8329ba502 ("taint/module: Clean
up global and module taint flags handling") in 2016, the handling of
module taint flags has been fully generic.  Specifically,
module_flags_taint() can print all flags, and the required output buffer
size is properly defined in terms of TAINT_FLAGS_COUNT.  The actual
per-module flags are always those added to module.taints by calls to
add_taint_module().

Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com
Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Petr Pavlu <petr.pavlu@suse.com>
Cc: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Randy Dunlap
ed4bbe7e8f taint: add reminder about updating docs and scripts
Sometimes people update taint-related pieces of the kernel without
updating the supporting documentation or scripts.  Add a reminder to do
this.

Link: https://lkml.kernel.org/r/20251015221626.1126156-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: David Gow <davidgow@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Sourabh Jain
adc15829fb crash: let architecture decide crash memory export to iomem_resource
With the generic crashkernel reservation, the kernel emits the following
warning on powerpc:

WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY
Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries
NIP:  c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000
REGS: c000000127cef8a0 TRAP: 0700   Not tainted (6.17.0-auto-12607-g5472d60c129f)
MSR:  8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 84000840  XER: 20040010
CFAR: c00000000017eed0 IRQMASK: 0
GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001
GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000
GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff
GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808
GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950
GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710
NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180
LR [c00000000201de34] add_system_ram_resources+0xf4/0x180
Call Trace:
add_system_ram_resources+0xf4/0x180 (unreliable)
do_one_initcall+0x60/0x36c
do_initcalls+0x120/0x220
kernel_init_freeable+0x23c/0x390
kernel_init+0x34/0x26c
ret_from_kernel_user_thread+0x14/0x1c

This warning occurs due to a conflict between crashkernel and System RAM
iomem resources.

The generic crashkernel reservation adds the crashkernel memory range to
/proc/iomem during early initialization. Later, all memblock ranges are
added to /proc/iomem as System RAM. If the crashkernel region overlaps
with any memblock range, it causes a conflict while adding those memblock
regions as iomem resources, triggering the above warning. The conflicting
memblock regions are then omitted from /proc/iomem.

For example, if the following crashkernel region is added to /proc/iomem:
20000000-11fffffff : Crash kernel

then the following memblock System RAM regions fail to be inserted:
00000000-7fffffff : System RAM
80000000-257fffffff : System RAM

Fix this by not adding the crashkernel memory to /proc/iomem on powerpc.
Introduce an architecture hook to let each architecture decide whether to
export the crashkernel region to /proc/iomem.

For more info check out commit c40dd2f766440 ("powerpc: Add System RAM
to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM
resource to prevent crashkernel conflict")

Note: Before switching to the generic crashkernel reservation, powerpc
never exported the crashkernel region to /proc/iomem.

Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com
Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation")
Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Reported-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/
Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Hari Bathini <hbathini@linux.ibm.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
WangYuli
01ef0296d2 .mailmap: add entry for WangYuli
Map my old, obsolete work email address to my current email address.

My current work email may not be ideal for timely communication, as
it requires a secure network environment for access due to security
policies.

Therefore, associate both my previous and current work email addresses
with an email address provided to me by AOSC Linux community. During
work hours, my commits will likely still be authored using my company
email address.

Link: https://lkml.kernel.org/r/20251014050747.527357-1-wangyuli@aosc.io
Signed-off-by: WangYuli <wangyl5933@chinaunicom.cn>
Signed-off-by: WangYuli <wangyuli@aosc.io>
Cc: Carlos Bilbao <carlos.bilbao@kernel.org>
Cc: Jarkko Sakkinen <jarkko@kernel.org>
Cc: Shannon Nelson <sln@onemain.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Ankan Biswas
57f3d89691 lib/xz: remove dead IA-64 (Itanium) support code
Support for the IA-64 (Itanium) architecture was removed in commit
cf8e8658100d ("arch: Remove Itanium (IA-64) architecture").

This patch drops the IA-64 specific decompression code from lib/xz, which
was conditionally compiled with the now-obsolete CONFIG_XZ_DEC_IA64
option.

Link: https://lkml.kernel.org/r/20251014052738.31185-1-spyjetfayed@gmail.com
Signed-off-by: Ankan Biswas <spyjetfayed@gmail.com>
Reviewed-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Reviewed-by: Khalid Aziz <khalid@kernel.org>
Acked-by: Lasse Collin <lasse.collin@tukaani.org>
Cc: David Hunter <david.hunter.linux@gmail.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:15 -08:00
Li RongQing
9544f9e694 hung_task: panic when there are more than N hung tasks at the same time
The hung_task_panic sysctl is currently a blunt instrument: it's all or
nothing.

Panicking on a single hung task can be an overreaction to a transient
glitch.  A more reliable indicator of a systemic problem is when
multiple tasks hang simultaneously.

Extend hung_task_panic to accept an integer threshold, allowing the
kernel to panic only when N hung tasks are detected in a single scan. 
This provides finer control to distinguish between isolated incidents
and system-wide failures.

The accepted values are:
- 0: Don't panic (unchanged)
- 1: Panic on the first hung task (unchanged)
- N > 1: Panic after N hung tasks are detected in a single scan

The original behavior is preserved for values 0 and 1, maintaining full
backward compatibility.

[lance.yang@linux.dev: new changelog]
Link: https://lkml.kernel.org/r/20251015063615.2632-1-lirongqing@baidu.com
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Lance Yang <lance.yang@linux.dev>
Tested-by: Lance Yang <lance.yang@linux.dev>
Acked-by: Andrew Jeffery <andrew@codeconstruct.com.au> [aspeed_g5_defconfig]
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jason A. Donenfeld <jason@zx2c4.com>
Cc: Joel Granados <joel.granados@kernel.org>
Cc: Joel Stanley <joel@jms.id.au>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <kees@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Paul E . McKenney" <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Phil Auld <pauld@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Simon Horman <horms@kernel.org>
Cc: Stanislav Fomichev <sdf@fomichev.me>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Thomas Weißschuh
05d6f1cc2d compiler.h: remove ARCH_SEL()
Its last user was removed in commit 8ea815399c3f ("compiler: remove
__ADDRESSABLE_ASM{_STR,}() again").

Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Dmitry Antipov
1b34743c31 ocfs2: add extra consistency check to ocfs2_dx_dir_lookup_rec()
In 'ocfs2_dx_dir_lookup_rec()', check whether an extent list length of the
directory indexing block matches the one configured via the superblock
parameters established at mount, thus preventing out-of-bounds accesses
while iterating over the extent records below.

Link: https://lkml.kernel.org/r/20251007094626.196143-1-dmantipov@yandex.ru
Reported-by: syzbot+30b53487d00b4f7f0922@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=30b53487d00b4f7f0922
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Dmitry Antipov
2f26f58df0 ocfs2: annotate flexible array members with __counted_by_le()
Annotate flexible array members of 'struct ocfs2_extent_list',
'struct ocfs2_chain_list', 'struct ocfs2_truncate_log',
'struct ocfs2_dx_entry_list', 'ocfs2_refcount_list' and
'struct ocfs2_xattr_header' with '__counted_by_le()'
attribute to improve array bounds checking when
CONFIG_UBSAN_BOUNDS is enabled.

[dmantipov@yandex.ru: fix __counted_by_le() usage in ocfs2_expand_inline_dx_root()]
  Link: https://lkml.kernel.org/r/20251014070324.130313-1-dmantipov@yandex.ru
Link: https://lkml.kernel.org/r/20251007123526.213150-1-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Reviewed-by: Heming Zhao <heming.zhao@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Lukas Bulwahn
cd4eaccc00 treewide: drop outdated compiler version remarks in Kconfig help texts
As of writing, Documentation/Changes states the minimal versions of GNU C
being 8.1, Clang being 15.0.0 and binutils being 2.30.  A few Kconfig help
texts are pointing out that specific GCC and Clang versions are needed,
but by now, those pointers to versions, such as later than 4.0, later than
4.4, or clang later than 5.0, are obsolete and unlikely to be found by
users configuring their kernel builds anyway.

Drop these outdated remarks in Kconfig help texts referring to older
compiler and binutils versions.  No functional change.

Link: https://lkml.kernel.org/r/20251010082138.185752-1-lukas.bulwahn@redhat.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@redhat.com>
Cc: Bill Wendling <morbo@google.com>
Cc: Justin Stitt <justinstitt@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Martin Kepplinger-Novaković
02582ac3b7 MAINTAINERS: apply name and email address changes for Martin
Update to new surname addition and currently used email address.

Link: https://lkml.kernel.org/r/20251011155903.7442-2-martink@posteo.de
Signed-off-by: Martin Kepplinger-Novaković <martink@posteo.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Martin Kepplinger-Novaković
0d63fc256a CREDITS: update Martin's information
Surname, address, email and the description changed. Apply these updates.

Link: https://lkml.kernel.org/r/20251011155903.7442-3-martink@posteo.de
Signed-off-by: Martin Kepplinger-Novaković <martink@posteo.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:14 -08:00
Martin Kepplinger-Novaković
b7d06a2ae1 mailmap: update name and email addresses
Apply my new surname, remove unused and update to currently used email
addresses.

Link: https://lkml.kernel.org/r/20251011155903.7442-1-martink@posteo.de
Signed-off-by: Martin Kepplinger-Novaković <martink@posteo.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:13 -08:00
Zhichi Lin
08bd4c46d5 scs: fix a wrong parameter in __scs_magic
__scs_magic() needs a 'void *' variable, but a 'struct task_struct *' is
given.  'task_scs(tsk)' is the starting address of the task's shadow call
stack, and '__scs_magic(task_scs(tsk))' is the end address of the task's
shadow call stack.  Here should be '__scs_magic(task_scs(tsk))'.

The user-visible effect of this bug is that when CONFIG_DEBUG_STACK_USAGE
is enabled, the shadow call stack usage checking function
(scs_check_usage) would scan an incorrect memory range.  This could lead
to:

1. **Inaccurate stack usage reporting**: The function would calculate
   wrong usage statistics for the shadow call stack, potentially showing
   incorrect value in kmsg.

2. **Potential kernel crash**: If the value of __scs_magic(tsk) is
   greater than that of __scs_magic(task_scs(tsk)), the for loop may
   access unmapped memory, potentially causing a kernel panic.  However,
   this scenario is unlikely because task_struct is allocated via the slab
   allocator (which typically returns lower addresses), while the shadow
   call stack returned by task_scs(tsk) is allocated via vmalloc (which
   typically returns higher addresses).

However, since this is purely a debugging feature
(CONFIG_DEBUG_STACK_USAGE), normal production systems should not be
affected.  The bug only impacts developers and testers who are actively
debugging stack usage with this configuration enabled.

Link: https://lkml.kernel.org/r/20251011082222.12965-1-zhichi.lin@vivo.com
Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging")
Signed-off-by: Jiyuan Xie <xiejiyuan@vivo.com>
Signed-off-by: Zhichi Lin <zhichi.lin@vivo.com>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Marco Elver <elver@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yee Lee <yee.lee@mediatek.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:13 -08:00
Justinien Bouron
6a2e57ad22 kexec_core: remove superfluous page offset handling in segment loading
During kexec_segment loading, when copying the content of the segment
(i.e.  kexec_segment::kbuf or kexec_segment::buf) to its associated pages,
kimage_load_{cma,normal,crash}_segment handle the case where the physical
address of the segment is not page aligned, e.g.  in
kimage_load_normal_segment:

	page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
	// ...
	ptr = kmap_local_page(page);
	// ...
	ptr += maddr & ~PAGE_MASK;
	mchunk = min_t(size_t, mbytes,
		PAGE_SIZE - (maddr & ~PAGE_MASK));
	// ^^^^ Non page-aligned segments handled here ^^^
	// ...
	if (image->file_mode)
		memcpy(ptr, kbuf, uchunk);
	else
		result = copy_from_user(ptr, buf, uchunk);

(similar logic is present in kimage_load_{cma,crash}_segment).

This is actually not needed because, prior to their loading, all
kexec_segments first go through a vetting step in
`sanity_check_segment_list`, which rejects any segment that is not
page-aligned:

	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;
		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;
		// ...
		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
			return -EADDRNOTAVAIL;
		// ...
	}

In case `sanity_check_segment_list` finds a non-page aligned segment, the whole
kexec load is aborted and no segment is loaded.

This means that `kimage_load_{cma,normal,crash}_segment` never actually
have to handle non page-aligned segments and `(maddr & ~PAGE_MASK) == 0`
is always true no matter if the segment is coming from a file (i.e. 
`kexec_file_load` syscall), from a user-space buffer (i.e.  `kexec_load`
syscall) or created by the kernel through `kexec_add_buffer`.  In the
latter case, `kexec_add_buffer` actually enforces the page alignment:

	/* Ensure minimum alignment needed for segments. */
	kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);

[jbouron@amazon.com: v3]
  Link: https://lkml.kernel.org/r/20251024155009.39502-1-jbouron@amazon.com
Link: https://lkml.kernel.org/r/20250929160220.47616-1-jbouron@amazon.com
Signed-off-by: Justinien Bouron <jbouron@amazon.com>
Reviewed-by: Gunnar Kudrjavets <gunnarku@amazon.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Marcos Paulo de Souza <mpdesouza@suse.com>
Cc: Mario Limonciello <mario.limonciello@amd.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:13 -08:00
Dmitry Antipov
8a7d58845f ocfs2: relax BUG() to ocfs2_error() in __ocfs2_move_extent()
In '__ocfs2_move_extent()', relax 'BUG()' to 'ocfs2_error()' just
to avoid crashing the whole kernel due to a filesystem corruption.

Fixes: 8f603e567aa7 ("Ocfs2/move_extents: move a range of extent.")
Link: https://lkml.kernel.org/r/20251009102349.181126-2-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Closes: https://syzkaller.appspot.com/bug?extid=727d161855d11d81e411
Reported-by: syzbot+727d161855d11d81e411@syzkaller.appspotmail.com
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-12 10:00:02 -08:00
Dmitry Antipov
41a5e87770 ocfs2: add extra flags check in ocfs2_ioctl_move_extents()
In 'ocfs2_ioctl_move_extents()', add extra check whether only actually
supported flags are passed via 'ioctl(..., OCFS2_IOC_MOVE_EXT, ...)',
and reject anything beyond OCFS2_MOVE_EXT_FL_AUTO_DEFRAG and
OCFS2_MOVE_EXT_FL_PART_DEFRAG with -EINVAL. In particular,
OCFS2_MOVE_EXT_FL_COMPLETE may be set by the kernel only and
should never be passed from userspace.

Link: https://lkml.kernel.org/r/20251009102349.181126-1-dmantipov@yandex.ru
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Mark Fasheh <mark@fasheh.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-11 16:48:29 -08:00
Onur Özkan
55b453ed53 checkpatch: document new check PLACEHOLDER_USE
Adds documentation for the new check PLACEHOLDER_USE in checkpatch.

Link: https://lkml.kernel.org/r/20250917173725.22547-3-work@onurozkan.dev
Signed-off-by: Onur Özkan <work@onurozkan.dev>
Acked-by: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-11 16:48:29 -08:00
Onur Özkan
fc387a0704 checkpatch: detect unhandled placeholders in cover letters
Add a new check PLACEHOLDER_USE to detect unhandled placeholders.  This
prevents sending patch series with incomplete patches (mostly in cover
letters) containing auto generated subject or blurb lines.

These placeholders can be seen on mailing lists.  With this change,
checkpatch will emit an error when such text is found.

Link: https://lkml.kernel.org/r/20250917173725.22547-2-work@onurozkan.dev
Signed-off-by: Onur Özkan <work@onurozkan.dev>
Acked-by: Joe Perches <joe@perches.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Dwaipayan Ray <dwaipayanray1@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-11 16:48:29 -08:00
Zhou Yuhang
969493d7d6 samples: fix coding style issues in Kconfig
Fix some coding style issues in Kconfig: use one tab to indent lines under
a config definition, and use an additional two spaces to indent help text.

Link: https://lkml.kernel.org/r/20250929062434.4114607-1-zhouyuhang1010@163.com
Signed-off-by: Zhou Yuhang <zhouyuhang@kylinos.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-11 16:48:28 -08:00
Dan Carpenter
5a3d530caa remoteproc: mediatek: Change the snprintf() checking
The snprintf() calls here work but they have several minor style issues:

1) It uses ARRAY_SIZE() which is the number of elements in an array.
   Since we're talking about char that works, but it's more common to
   use sizeof() which is the number of bytes.
2) The printf format is "%1d".  The "1" ensures we always print at
   least 1 character but since numbers all have at least 1 digit this
   can be removed.
3) The kernel implementation of snprintf() cannot return negative error
   codes.  Also these particular calls to snprintf() can't return zero
   and the code to handle that zero return is sort of questionable.
4) In the current kernel the only "core_id" we print is "0" but if it
   was more than 9 then the output would be truncated so GCC complains.
   Add an "a >= sizeof(scp_fw_file)" check for output which is too long.

Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/aP8agyKj73bLZrTQ@stanley.mountain
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-11-10 10:40:44 -07:00
Aaron Tomlin
1ddac5cd7f
MAINTAINERS: Add myself as reviewer for module support
Volunteering as a reviewer for Module support.

Suggested-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
2025-11-10 16:20:35 +01:00
Bagas Sanjaya
acd9ea1714 Documentation: btt: Unwrap bit 31-30 nested table
Bit 31-30 usage table is already formatted as reST simple table, but it
is wrapped in literal code block instead. Unwrap it.

Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Link: https://patch.msgid.link/20251105124707.44736-2-bagasdotme@gmail.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
2025-11-07 09:06:51 -06:00
Andy Shevchenko
6f15c3d715 bitops: Update kernel-doc in hweight.c to fix the issues with it
The kernel-doc in lib/hweight.c is global to the file and
currently has issues:

Warning: lib/hweight.c:13 expecting prototype for hweightN(). Prototype was for __sw_hweight32() instead
Warning: lib/hweight.c:13 function parameter 'w' not described in '__sw_hweight32'

Update it accordingly.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-06 11:51:04 -05:00
Andy Shevchenko
0cb302c9c9 bitops: Add missed file to MAINTAINERS
In accordance with the history and nature of the operation
add lib/hweight.c to the BITOPS record in MAINTAINERS.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
2025-11-06 11:51:04 -05:00
Marco Crivellari
7e898a9a99 nvdimm: replace use of system_wq with system_percpu_wq
Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.

This patch continues the effort to refactor workqueue APIs, which has begun
with the change introducing new workqueues and a new alloc_workqueue flag:

commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag")

Replace system_wq with system_percpu_wq, keeping the same old behavior.
The old wq (system_wq) will be kept for a few release cycles.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://patch.msgid.link/20251105150826.248673-1-marco.crivellari@suse.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
2025-11-05 15:48:11 -06:00
Kees Cook
ae83f3b726
module: Add compile-time check for embedded NUL characters
Long ago, the kernel module license checks were bypassed by embedding a
NUL character in the MODULE_LICENSE() string[1]. By using a string like
"GPL\0proprietary text", the kernel would only read "GPL" due to C string
termination at the NUL byte, allowing proprietary modules to avoid kernel
tainting and access GPL-only symbols.

The MODULE_INFO() macro stores these strings in the .modinfo ELF
section, and get_next_modinfo() uses strcmp()-family functions
which stop at the first NUL. This split the embedded string into two
separate .modinfo entries, with only the first part being processed by
license_is_gpl_compatible().

Add a compile-time check using static_assert that compares the full
string length (sizeof - 1) against __builtin_strlen(), which stops at
the first NUL. If they differ, compilation fails with a clear error
message.

While this check can still be circumvented by modifying the ELF binary
post-compilation, it prevents accidental embedded NULs and forces
intentional abuse to require deliberate binary manipulation rather than
simple source-level tricks.

Build tested with test modules containing both valid and invalid license
strings. The check correctly rejects:

    MODULE_LICENSE("GPL\0proprietary")

while accepting normal declarations:

    MODULE_LICENSE("GPL")

Link: https://lwn.net/Articles/82305/ [1]
Suggested-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Aaron Tomlin <atomlin@atomlin.com>
Reviewed-by: Petr Pavlu <petr.pavlu@suse.com>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
2025-11-05 14:08:58 +01:00
Kees Cook
57e9853737
media: radio: si470x: Fix DRIVER_AUTHOR macro definition
The DRIVER_AUTHOR macro incorrectly included a semicolon in its
string literal definition. Right now, this wasn't causing any real
problem, but coming changes to the MODULE_INFO() macro make this more
sensitive. Specifically, when used with MODULE_AUTHOR(), this created
syntax errors during macro expansion:

    MODULE_AUTHOR(DRIVER_AUTHOR);

expands to:

    MODULE_INFO(author, "Joonyoung Shim <jy0922.shim@samsung.com>";)
                                                                  ^
                                                       syntax error

Remove the trailing semicolon from the DRIVER_AUTHOR definition.
Semicolons should only appear at the point of use, not in the macro
definition.

Reviewed-by: Hans Verkuil <hverkuil+cisco@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
2025-11-05 14:08:56 +01:00
Kees Cook
9de2198ab9
media: dvb-usb-v2: lmedm04: Fix firmware macro definitions
The firmware filename macros incorrectly included semicolons in their
string literal definitions. Right now, this wasn't causing any real
problem, but coming changes to the MODULE_INFO() macro make this more
sensitive. Specifically, when used with MODULE_FIRMWARE(), this
created syntax errors during macro expansion:

    MODULE_FIRMWARE(LME2510_C_S7395);

expands to:

    MODULE_INFO(firmware, "dvb-usb-lme2510c-s7395.fw";)
                                                     ^
                                          syntax error

Remove the trailing semicolons from all six firmware filename macro
definitions. Semicolons should only appear at the point of use, not in
the macro definition.

Reviewed-by: Hans Verkuil <hverkuil+cisco@kernel.org>
Signed-off-by: Kees Cook <kees@kernel.org>
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Reviewed-by: Daniel Gomez <da.gomez@samsung.com>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@samsung.com>
2025-11-05 14:08:55 +01:00
Dan Williams
a4438f06b1 PCI/TSM: Report active IDE streams
Given that the platform TSM owns IDE Stream ID allocation, report the
active streams via the TSM class device. Establish a symlink from the
class device to the PCI endpoint device consuming the stream, named by
the Stream ID.

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251031212902.2256310-10-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
9ddaf9c3ed PCI/IDE: Report available IDE streams
The limited number of link-encryption (IDE) streams that a given set of
host bridges supports is a platform specific detail. Provide
pci_ide_init_nr_streams() as a generic facility for either platform TSM
drivers, or PCI core native IDE, to report the number of available streams.
After invoking pci_ide_init_nr_streams() an "available_secure_streams"
attribute appears in PCI host bridge sysfs to convey that count.

Introduce a device-type, @pci_host_bridge_type, now that both a release
method and sysfs attribute groups are being specified for all 'struct
pci_host_bridge' instances.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-9-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
1e4d2ff3ae PCI/IDE: Add IDE establishment helpers
There are two components to establishing an encrypted link, provisioning
the stream in Partner Port config-space, and programming the keys into
the link layer via IDE_KM (IDE Key Management). This new library,
drivers/pci/ide.c, enables the former. IDE_KM, via a TSM low-level
driver, is saved for later.

With the platform TSM implementations of SEV-TIO and TDX Connect in mind
this library abstracts small differences in those implementations. For
example, TDX Connect handles Root Port register setup while SEV-TIO
expects System Software to update the Root Port registers. This is the
rationale for fine-grained 'setup' + 'enable' verbs.

The other design detail for TSM-coordinated IDE establishment is that
the TSM may manage allocation of Stream IDs, this is why the Stream ID
value is passed in to pci_ide_stream_setup().

The flow is:

pci_ide_stream_alloc():
    Allocate a Selective IDE Stream Register Block in each Partner Port
    (Endpoint + Root Port), and reserve a host bridge / platform stream
    slot. Gather Partner Port specific stream settings like Requester ID.

pci_ide_stream_register():
    Publish the stream in sysfs after allocating a Stream ID. In the TSM
    case the TSM allocates the Stream ID for the Partner Port pair.

pci_ide_stream_setup():
    Program the stream settings to a Partner Port. Caller is responsible
    for optionally calling this for the Root Port as well if the TSM
    implementation requires it.

pci_ide_stream_enable():
    Enable the stream after IDE_KM.

In support of system administrators auditing where platform, Root Port,
and Endpoint IDE stream resources are being spent, the allocated stream
is reflected as a symlink from the host bridge to the endpoint with the
name:

    stream%d.%d.%d

Where the tuple of integers reflects the allocated platform, Root Port,
and Endpoint stream index (Selective IDE Stream Register Block) values.

Thanks to Wu Hao for a draft implementation of this infrastructure.

Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-8-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
290b633a7d PCI: Establish document for PCI host bridge sysfs attributes
In preparation for adding more host bridge sysfs attributes, document the
existing naming format and 'firmware_node' attribute.

Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-7-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
c0c1262fbf PCI: Add PCIe Device 3 Extended Capability enumeration
PCIe r7.0 Section 7.7.9 Device 3 Extended Capability Structure, defines the
canonical location for determining the Flit Mode of a device. This status
is a dependency for PCIe IDE enabling. Add a new fm_enabled flag to 'struct
pci_dev'.

Cc: Lukas Wunner <lukas@wunner.de>
Cc: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-6-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
3225f52cde PCI/TSM: Establish Secure Sessions and Link Encryption
The PCIe 7.0 specification, section 11, defines the Trusted Execution
Environment (TEE) Device Interface Security Protocol (TDISP).  This
protocol definition builds upon Component Measurement and Authentication
(CMA), and link Integrity and Data Encryption (IDE). It adds support for
assigning devices (PCI physical or virtual function) to a confidential VM
such that the assigned device is enabled to access guest private memory
protected by technologies like Intel TDX, AMD SEV-SNP, RISCV COVE, or ARM
CCA.

The "TSM" (TEE Security Manager) is a concept in the TDISP specification
of an agent that mediates between a "DSM" (Device Security Manager) and
system software in both a VMM and a confidential VM. A VMM uses TSM ABIs
to setup link security and assign devices. A confidential VM uses TSM
ABIs to transition an assigned device into the TDISP "RUN" state and
validate its configuration. From a Linux perspective the TSM abstracts
many of the details of TDISP, IDE, and CMA. Some of those details leak
through at times, but for the most part TDISP is an internal
implementation detail of the TSM.

CONFIG_PCI_TSM adds an "authenticated" attribute and "tsm/" subdirectory
to pci-sysfs. Consider that the TSM driver may itself be a PCI driver.
Userspace can watch for the arrival of a "TSM" device,
/sys/class/tsm/tsm0/uevent KOBJ_CHANGE, to know when the PCI core has
initialized TSM services.

The operations that can be executed against a PCI device are split into
two mutually exclusive operation sets, "Link" and "Security" (struct
pci_tsm_{link,security}_ops). The "Link" operations manage physical link
security properties and communication with the device's Device Security
Manager firmware. These are the host side operations in TDISP. The
"Security" operations coordinate the security state of the assigned
virtual device (TDI). These are the guest side operations in TDISP.

Only "link" (Secure Session and physical Link Encryption) operations are
defined at this stage. There are placeholders for the device security
(Trusted Computing Base entry / exit) operations.

The locking allows for multiple devices to be executing commands
simultaneously, one outstanding command per-device and an rwsem
synchronizes the implementation relative to TSM registration/unregistration
events.

Thanks to Wu Hao for his work on an early draft of this support.

Cc: Lukas Wunner <lukas@wunner.de>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Link: https://patch.msgid.link/20251031212902.2256310-5-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:41 -08:00
Dan Williams
215afa89d2 PCI: Introduce pci_walk_bus_reverse(), for_each_pci_dev_reverse()
PCI/TSM, the PCI core functionality for the PCIe TEE Device Interface
Security Protocol (TDISP), has a need to walk all subordinate functions of
a Device Security Manager (DSM) to setup a device security context. A DSM
is physical function 0 of a multi-function or SR-IOV device endpoint, or it
is an upstream switch port.

In error scenarios or when a TEE Security Manager (TSM) device is removed
it needs to unwind all established DSM contexts.

Introduce reverse versions of PCI device iteration helpers to mirror the
setup path and ensure that dependent children are handled before parents.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Link: https://patch.msgid.link/20251031212902.2256310-4-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:40 -08:00
Dan Williams
f16469ee73 PCI/IDE: Enumerate Selective Stream IDE capabilities
Link encryption is a new PCIe feature enumerated by "PCIe r7.0 section
7.9.26 IDE Extended Capability".

It is both a standalone port + endpoint capability, and a building block
for the security protocol defined by "PCIe r7.0 section 11 TEE Device
Interface Security Protocol (TDISP)". That protocol coordinates device
security setup between a platform TSM (TEE Security Manager) and a
device DSM (Device Security Manager). While the platform TSM can
allocate resources like Stream ID and manage keys, it still requires
system software to manage the IDE capability register block.

Add register definitions and basic enumeration in preparation for
Selective IDE Stream establishment. A follow on change selects the new
CONFIG_PCI_IDE symbol. Note that while the IDE specification defines
both a point-to-point "Link Stream" and a Root Port to endpoint
"Selective Stream", only "Selective Stream" is considered for Linux as
that is the predominant mode expected by Trusted Execution Environment
Security Managers (TSMs), and it is the security model that limits the
number of PCI components within the TCB in a PCIe topology with
switches.

Co-developed-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Link: https://patch.msgid.link/20251031212902.2256310-3-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:40 -08:00
Dan Williams
603c646f00 coco/tsm: Introduce a core device for TEE Security Managers
A "TSM" is a platform component that provides an API for securely
provisioning resources for a confidential guest (TVM) to consume. The
name originates from the PCI specification for the platform agent that
carries out operations for PCIe TDISP (TEE Device Interface Security
Protocol).

Instances of this core device are parented by a device representing the
platform security function like CONFIG_CRYPTO_DEV_CCP or
CONFIG_INTEL_TDX_HOST.

This device interface is a frontend to the aspects of a TSM and TEE I/O
that are cross-architecture common. This includes mechanisms like
enumerating available platform TEE I/O capabilities and provisioning
connections between the platform TSM and device DSMs (Device Security
Manager (TDISP)).

For now this is just the scaffolding for registering a TSM device sysfs
interface.

Cc: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Co-developed-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alexey Kardashevskiy <aik@amd.com>
Link: https://patch.msgid.link/20251031212902.2256310-2-dan.j.williams@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
2025-11-03 19:27:40 -08:00
Alison Schofield
f59b701b46 tools/testing/nvdimm: Use per-DIMM device handle
KASAN reports a global-out-of-bounds access when running these nfit
tests: clear.sh, pmem-errors.sh, pfn-meta-errors.sh, btt-errors.sh,
daxdev-errors.sh, and inject-error.sh.

[] BUG: KASAN: global-out-of-bounds in nfit_test_ctl+0x769f/0x7840 [nfit_test]
[] Read of size 4 at addr ffffffffc03ea01c by task ndctl/1215
[] The buggy address belongs to the variable:
[] handle+0x1c/0x1df4 [nfit_test]

nfit_test_search_spa() uses handle[nvdimm->id] to retrieve a device
handle and triggers a KASAN error when it reads past the end of the
handle array. It should not be indexing the handle array at all.

The correct device handle is stored in per-DIMM test data. Each DIMM
has a struct nfit_mem that embeds a struct acpi_nfit_memdev that
describes the NFIT device handle. Use that device handle here.

Fixes: 10246dc84dfc ("acpi nfit: nfit_test supports translate SPA")
Cc: stable@vger.kernel.org
Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Link: https://patch.msgid.link/20251031234227.1303113-1-alison.schofield@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
2025-11-03 16:47:13 -06:00
Mike Rapoport (Microsoft)
43bc0aa19a nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices
There are use cases, for example virtual machine hosts, that create
"persistent" memory regions using memmap= option on x86 or dummy
pmem-region device tree nodes on DT based systems.

Both these options are inflexible because they create static regions and
the layout of the "persistent" memory cannot be adjusted without reboot
and sometimes they even require firmware update.

Add a ramdax driver that allows creation of DIMM devices on top of
E820_TYPE_PRAM regions and devicetree pmem-region nodes.

The DIMMs support label space management on the "device" and provide a
flexible way to access RAM using fsdax and devdax.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://patch.msgid.link/20251026153841.752061-2-rppt@kernel.org
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
2025-11-03 14:50:42 -06:00
Andreas Hindborg
ee3b8134b2
modules: add rust modules files to MAINTAINERS
The module subsystem people agreed to maintain rust support for modules
[1]. Thus, add entries for relevant files to modules entry in MAINTAINERS.

Link: https://lore.kernel.org/all/0d9e596a-5316-4e00-862b-fd77552ae4b5@suse.com/ [1]

Acked-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:43:42 +01:00
Andreas Hindborg
e119c2fe8c
rust: samples: add a module parameter to the rust_minimal sample
Showcase the rust module parameter support by adding a module parameter to
the `rust_minimal` sample.

Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:42:34 +01:00
Andreas Hindborg
0b24f9740f
rust: module: update the module macro with module parameter support
Allow module parameters to be declared in the rust `module!` macro.

Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:42:29 +01:00
Andreas Hindborg
3809d7a89f
rust: module: use a reference in macros::module::module
When we add parameter support to the module macro, we want to be able to
pass a reference to `ModuleInfo` to a helper function. That is not possible
when we move out of the local `modinfo`. So change the function to access
the local via reference rather than value.

Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:41:29 +01:00
Andreas Hindborg
0b08fc2928
rust: introduce module_param module
Add types and traits for interfacing the C moduleparam API.

Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:40:57 +01:00
Andreas Hindborg
51d9ee90ea
rust: str: add radix prefixed integer parsing functions
Add the trait `ParseInt` for parsing string representations of integers
where the string representations are optionally prefixed by a radix
specifier. Implement the trait for the primitive integer types.

Suggested-by: Benno Lossin <benno.lossin@proton.me>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:40:45 +01:00
Andreas Hindborg
821fe7bf16
rust: sync: add SetOnce
Introduce the `SetOnce` type, a container that can only be written once.
The container uses an internal atomic to synchronize writes to the internal
value.

Reviewed-by: Alice Ryhl <aliceryhl@google.com>
Reviewed-by: Benno Lossin <lossin@kernel.org>
Signed-off-by: Andreas Hindborg <a.hindborg@kernel.org>
Tested-by: Daniel Gomez <da.gomez@samsung.com>
Signed-off-by: Daniel Gomez <da.gomez@kernel.org>
2025-11-03 14:40:35 +01:00
Leon Romanovsky
131971f67e dma-mapping: remove unused map_page callback
After conversion of arch code to use physical address mapping,
there are no users of .map_page() and .unmap_page() callbacks,
so let's remove them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-14-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:31 +01:00
Leon Romanovsky
936a9f0cb1 xen: swiotlb: Convert mapping routine to rely on physical address
Switch to .map_phys callback instead of .map_page.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-13-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
33d2c5ee10 x86: Use physical address for DMA mapping
Perform mechanical conversion from DMA .map_page to .map_phys.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-12-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
38c0d0ebf5 sparc: Use physical address DMA mapping
Convert sparc architecture DMA code to use .map_phys callback.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-11-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
a10d648d13 powerpc: Convert to physical address DMA mapping
Adapt PowerPC DMA to use physical addresses in order to prepare code
for the removal of .map_page and .unmap_page.

Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-10-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
96ddf2ef58 parisc: Convert DMA map_page to map_phys interface
Perform mechanical conversion from .map_page to .map_phys callback.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-9-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
e4e3fff66a MIPS/jazzdma: Provide physical address directly
MIPS jazz uses physical addresses for mapping pages, so convert
it to get them directly from DMA mapping routine.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-8-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
6aaecdf0d8 alpha: Convert mapping routine to rely on physical address
Alpha doesn't need struct *page and can perform mapping based on
physical addresses. So convert it to implement new .map_phys callback.

As part of this change, remove the useless BUG_ON() as the DMA mapping layer
ensures that the right direction is provided.

Tested-by: Magnus Lindholm <linmag7@gmail.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-7-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
14cb413af0 dma-mapping: remove unused mapping resource callbacks
After ARM and XEN conversions to use physical addresses for the mapping,
there are no in-kernel users for map_resource/unmap_resource callbacks,
so remove them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-6-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
af85de5a9f xen: swiotlb: Switch to physical address mapping callbacks
Combine resource and page mappings routines to one function
and remove .map_resource/.unmap_resource callbacks completely.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-5-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
50b149be07 ARM: dma-mapping: Switch to physical address mapping callbacks
Combine resource and page mappings routines to one function, which
handles both of these flows in the same manner. This conversion allows
us to remove .map_resource/.unmap_resource callbacks completely.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-4-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:30 +01:00
Leon Romanovsky
52c9aa1adc ARM: dma-mapping: Reduce struct page exposure in arch_sync_dma*()
As a preparation to changing from .map_page to use .map_phys DMA
callbacks, convert arch_sync_dma*() functions to use physical addresses
instead of struct page.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-3-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:29 +01:00
Leon Romanovsky
45fa6d190d dma-mapping: convert dummy ops to physical address mapping
Change dma_dummy_map_page and dma_dummy_unmap_page routines
to accept physical address and rename them.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-2-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:29 +01:00
Leon Romanovsky
ed7fc3cbb3 dma-mapping: prepare dma_map_ops to conversion to physical address
Add new .map_phys() and .unmap_phys() callbacks to dma_map_ops as a
preparation to replace .map_page() and .unmap_page() respectively.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-1-3bbfe3a25cdf@kernel.org
2025-10-29 10:27:29 +01:00
Qinxin Xia
f74ee32963 tools/dma: move dma_map_benchmark from selftests to tools/dma
dma_map_benchmark is a standalone developer tool rather than an
automated selftest. It has no pass/fail criteria, expects manual
invocation, and is built as a normal userspace binary. Move it to
tools/dma/ and add a minimal Makefile.

Suggested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Suggested-by: Barry Song <baohua@kernel.org>
Signed-off-by: Qinxin Xia <xiaqinxin@huawei.com>
Acked-by: Barry Song <baohua@kernel.org>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20251028120900.2265511-3-xiaqinxin@huawei.com
2025-10-29 09:41:40 +01:00
Peng Fan
12dc929c6c remoteproc: core: Remove unused export of rproc_va_to_pa
Commit 086d08725d34 ("remoteproc: create vdev subdevice with specific dma
memory pool") added an export for rproc_va_to_pa. However, since its
introduction, this symbol has not been used by any loadable modules. It
remains only referenced within remoteproc_virtio.c, which is always built
together with remoteproc_core.c.

As such, exporting rproc_va_to_pa is unnecessary, so remove the export.

No functional changes.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20251016-rproc-cleanup-v3-v3-4-774083716e8a@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-27 09:26:07 -06:00
Peng Fan
6e863a57dd remoteproc: core: Removed unused headers
There is no user of crc32.h, debugfs.h, of_reserved_mem.h, virtio_ids.h,
so remove from the included headers.

No functional changes.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20251016-rproc-cleanup-v3-v3-3-774083716e8a@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-27 09:26:01 -06:00
Peng Fan
f1b26faafd remoteproc: core: Sort header includes
Reordered the header includes in drivers/remoteproc/remoteproc_core.c
to follow alphabetical order to simplify future maintenance.

No functional changes.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20251016-rproc-cleanup-v3-v3-2-774083716e8a@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-27 09:25:55 -06:00
Peng Fan
4531b6bad5 remoteproc: core: Drop redundant initialization of 'ret' in rproc_shutdown()
The variable ret is immediately assigned the return value of
mutex_lock_interruptible(), making its prior initialization to zero
unnecessary. Remove the redundant assignment.

No functional changes.

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20251016-rproc-cleanup-v3-v3-1-774083716e8a@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-27 09:25:49 -06:00
Peng Fan
5a4d08351b remoteproc: imx_rproc: Remove the assignment to method
'method' is no longer used in imx_rproc.c, so remove the assignment.
But imx_dsp_rproc.c is still using 'method', so still keep the field
in struct imx_rproc_dcfg.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251024-imx_rproc_c4-v4-4-af83ed3fdbba@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-24 08:48:22 -06:00
Peng Fan
b2d66cd137 remoteproc: imx_rproc: Enable PM runtime support unconditionally
PM runtime support is safe and applicable across all i.MX platforms, not
just those using the SCU API. Remove the conditional check and enable PM
runtime unconditionally to simplify the code and ensure consistent power
management behavior.

Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251024-imx_rproc_c4-v4-3-af83ed3fdbba@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-24 08:47:26 -06:00
Peng Fan
016a3d4bcf remoteproc: imx_rproc: Make detach operation platform-specific
Refactor the detach logic to support platform-specific implementations via
the dcfg->ops->detach callback. Allow finer control over detach behavior
depending on the remote processor management method, and make it easier
to add detach support for new SoCs.

The previous hardcoded SCU API detach logic is now moved into a dedicated
imx_rproc_scu_api_detach() function, and registered via the plat ops
structure. The generic imx_rproc_detach() now delegates to the
platform-specific handler if available.

Also, the dcfg->method check with IMX_RPROC_SCU_API is removed.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251024-imx_rproc_c4-v4-2-af83ed3fdbba@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-24 08:46:33 -06:00
Peng Fan
ddbec021a3 remoteproc: imx_rproc: Simplify clock enable logic using dcfg flags
Simplify the clock enable logic by removing the dedicated
imx_rproc_clk_enable() function and integrate the clock handling directly
into the probe function to simplify the code.

Add a new IMX_RPROC_NEED_CLKS flag in dcfg to indicate whether clock
management is required for a given SoC. Update probe logic to conditionally
enable clocks based on the new flag.

Set the flag for applicable SoCs (e.g., i.MX7D, i.MX8MQ, i.MX93, etc.).

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20251024-imx_rproc_c4-v4-1-af83ed3fdbba@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-24 08:45:41 -06:00
Len Brown
6dfb04332f tools/power turbostat: Remove dead code
amperf_group_fd is never used.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:54:16 -03:00
Len Brown
696d15cbd8 tools/power turbostat: Refactor floating point printout code
Too many copies of (usually) the same printf code...

Also, unify code for added-counter FORMAT_AVERAGE,
which was correct where it was tested, but neglected elsewhere.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:54:09 -03:00
Len Brown
64f96057a6 tools/power turbostat.8: Update example
Update the added-counters example to print counters in decimal
rather than hex -- now that it is working...

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:50:32 -03:00
Len Brown
885e822764 tools/power turbostat: Refactor added-counter value printing code
We build up many copies of very similar code...

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:50:25 -03:00
Len Brown
56dbb87850 tools/power turbostat: Refactor added column header printing
Over time, we built up many copies of nearly identical code...

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:44:18 -03:00
Len Brown
4e35847d7b tools/power turbostat: Add Wildcat Lake and Nova Lake support
Treat Wildcat Lake and Nova Lake (and Panther Lake)
the same as Lunar Lake, for now.

Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:41:21 -03:00
Len Brown
92664f2e6a tools/power turbostat: Regression fix Uncore MHz printed in hex
A patch to allow specifying FORMAT_AVERAGE to added counters...
broke the internally added counter for Cluster Uncore MHz -- printing it in HEX.

Fixes: dcd1c379b0f1 ("tools/power turbostat: add format "average" for external attributes")
Reported-by: Andrej Tkalcec <andrej.tkalcec@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
2025-10-24 10:41:17 -03:00
AngeloGioacchino Del Regno
8fd705c5e7 remoteproc: mtk_scp: Construct FW path if firmware-name not present
After a reply on the mailing lists [1] it emerged that the DT
property "firmware-name" should not be relied on because of
possible issues with firmware versions.
For MediaTek SCP, there has never been any firmware version vs
driver version desync issue but, regardless, the firmwares are
always using the same name and they're always located in a path
with a specific pattern.

Instead of unconditionally relying on the firmware-name
devicetree property to get a path to the SCP FW file, the driver
should construct a name based on what firmware it knows and
what hardware it is running on.

In order to do that, add a `scp_get_default_fw_path()` function
that constructs the path and filename based on two pieces of
information that the driver can get:
 1. The compatible string with the highest priority (so, the
    first one at index 0); and
 2. The type of SCP HW - single-core or multi-core.

This means that the default firmware path is generated as:
 - Single core SCP: mediatek/(soc_model)/scp.img
   for example:     mediatek/mt8183/scp.img;

 - Multi core SCP:  mediatek/(soc_model)/scp_c(core_number).img
   for example:     mediatek/mt8188/scp_c0.img for Core 0, and
                    mediatek/mt8188/scp_c1.img for Core 1.

Note that the generated firmware path is being used only if the
"firmware-name" devicetree property is not present in the SCP
node or in the SCP Core node(s).

[1 - Reply regarding firmware-name property]
Link: https://lore.kernel.org/all/7e8718b0-df78-44a6-a102-89529d6abcce@app.fastmail.com/
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20251015084103.10737-1-angelogioacchino.delregno@collabora.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-20 09:16:11 -06:00
Peng Fan
ff7c763b91 remoteproc: imx_rproc: Use devm_rproc_add() helper
Replace manual rproc_add() and cleanup logic with devm_rproc_add(), which
ties the remoteproc lifecycle to the device's lifecycle. This simplifies
error handling and ensures proper cleanup.

With no need to invoke rproc_del(), the remove() ops could be removed.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-6-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:14:32 -06:00
Peng Fan
9b2451658a remoteproc: imx_rproc: Use devm_add_action_or_reset() for scu cleanup
Replace the explicit call to imx_rproc_put_scu() in the remove path with
devm_add_action_or_reset(). Ensure proper cleanup of scu resources and
simplify the code by leveraging the device-managed resource framework.

Additionally:
 - Remove the IMX_RPROC_SCU_API check from imx_rproc_put_scu(), as
   devm_add_action_or_reset() now exclusively handles SCU cleanup.
 - Improve error reporting by using dev_err_probe() for consistency and
   clarity.
 - Drop the err_put_scu label, as it is now redundant due to the updated
   error handling approach.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-5-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:13:36 -06:00
Peng Fan
65af722aa8 remoteproc: imx_rproc: Use devm_clk_get_enabled() and simplify cleanup
Replace separate calls to devm_clk_get() and clk_prepare_enable() with
devm_clk_get_enabled(), which combines clock acquisition and enabling
into a single managed step. Simplify the probe logic and remove the need
for manual clock disable in error and remove paths.

Also, update error handling to eliminate redundant cleanup steps and use
return-based error propagation where appropriate. Improve code clarity and
reduce the chance of resource leaks or incorrect ordering in cleanup paths.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-4-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:12:29 -06:00
Peng Fan
b0106defc0 remoteproc: imx_rproc: Use devm_add_action_or_reset() for mailbox cleanup
Convert imx_rproc_free_mbox() to a devm-managed cleanup action using
devm_add_action_or_reset(). Ensure the mailbox resources are freed
automatically with the device lifecycle, simplifying error handling and
removing the need for manual cleanup in probe and remove paths.

Also improve error reporting by using dev_err_probe() for consistency and
clarity.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-3-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:11:39 -06:00
Peng Fan
6c5c37dc41 remoteproc: imx_rproc: Use devm_add_action_or_reset() for workqueue cleanup
Replace manual destroy_workqueue() calls in error and remove paths with a
devm_add_action_or_reset() helper. Ensure the workqueue is properly
cleaned up with the device lifecycle, and simplify error handling in probe
by removing now-unnecessary labels and cleanup steps.

No functional changes.

Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-2-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:09:40 -06:00
Peng Fan
80405a34e1 remoteproc: imx_rproc: Fix runtime PM cleanup and improve remove path
Proper cleanup should be done when rproc_add() fails by invoking both
pm_runtime_disable() and pm_runtime_put_noidle() to avoid leaving the
device in an inconsistent power state.

Fix it by adding pm_runtime_put_noidle() and pm_runtime_disable()
in the error path.

Also update the remove() callback to use pm_runtime_put_noidle() instead of
pm_runtime_put(), to clearly indicate that it only needs to restore the usage
count.

Fixes: a876a3aacc43 ("remoteproc: imx_rproc: detect and attach to pre-booted remote cores")
Cc: Ulf Hansson <ulf.hansson@linaro.org>
Cc: Hiago De Franco <hiago.franco@toradex.com>
Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20250926-imx_rproc_v3-v3-1-4c0ec279cc5f@nxp.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
2025-10-14 09:08:32 -06:00
714 changed files with 19793 additions and 4662 deletions

View File

@ -303,6 +303,7 @@ Hans de Goede <hansg@kernel.org> <hdegoede@redhat.com>
Hans Verkuil <hverkuil@kernel.org> <hverkuil@xs4all.nl>
Hans Verkuil <hverkuil@kernel.org> <hverkuil-cisco@xs4all.nl>
Hans Verkuil <hverkuil@kernel.org> <hansverk@cisco.com>
Hao Ge <hao.ge@linux.dev> <gehao@kylinos.cn>
Harry Yoo <harry.yoo@oracle.com> <42.hyeyoo@gmail.com>
Heiko Carstens <hca@linux.ibm.com> <h.carstens@de.ibm.com>
Heiko Carstens <hca@linux.ibm.com> <heiko.carstens@de.ibm.com>
@ -503,9 +504,7 @@ Mark Brown <broonie@sirena.org.uk>
Mark Starovoytov <mstarovo@pm.me> <mstarovoitov@marvell.com>
Markus Schneider-Pargmann <msp@baylibre.com> <mpa@pengutronix.de>
Mark Yao <markyao0591@gmail.com> <mark.yao@rock-chips.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@ginzinger.com>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@puri.sm>
Martin Kepplinger <martink@posteo.de> <martin.kepplinger@theobroma-systems.com>
Martin Kepplinger-Novakovic <martink@posteo.de> <martin.kepplinger-novakovic@ginzinger.com>
Martyna Szapar-Mudlaw <martyna.szapar-mudlaw@linux.intel.com> <martyna.szapar-mudlaw@intel.com>
Mathieu Othacehe <othacehe@gnu.org> <m.othacehe@gmail.com>
Mat Martineau <martineau@kernel.org> <mathew.j.martineau@linux.intel.com>
@ -856,6 +855,9 @@ Vivien Didelot <vivien.didelot@gmail.com> <vivien.didelot@savoirfairelinux.com>
Vlad Dogaru <ddvlad@gmail.com> <vlad.dogaru@intel.com>
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@parallels.com>
Vladimir Davydov <vdavydov.dev@gmail.com> <vdavydov@virtuozzo.com>
WangYuli <wangyuli@aosc.io> <wangyl5933@chinaunicom.cn>
WangYuli <wangyuli@aosc.io> <wangyuli@deepin.org>
WangYuli <wangyuli@aosc.io> <wangyuli@uniontech.com>
Weiwen Hu <huweiwen@linux.alibaba.com> <sehuww@mail.scut.edu.cn>
WeiXiong Liao <gmpy.liaowx@gmail.com> <liaoweixiong@allwinnertech.com>
Wen Gong <quic_wgong@quicinc.com> <wgong@codeaurora.org>
@ -867,6 +869,7 @@ Yakir Yang <kuankuan.y@gmail.com> <ykk@rock-chips.com>
Yanteng Si <si.yanteng@linux.dev> <siyanteng@loongson.cn>
Ying Huang <huang.ying.caritas@gmail.com> <ying.huang@intel.com>
Yosry Ahmed <yosry.ahmed@linux.dev> <yosryahmed@google.com>
Yu-Chun Lin <eleanor.lin@realtek.com> <eleanor15x@gmail.com>
Yusuke Goda <goda.yusuke@renesas.com>
Zack Rusin <zack.rusin@broadcom.com> <zackr@vmware.com>
Zhu Yanjun <zyjzyj2000@gmail.com> <yanjunz@nvidia.com>

View File

@ -2056,16 +2056,15 @@ S: Korte Heul 95
S: 1403 ND BUSSUM
S: The Netherlands
N: Martin Kepplinger
N: Martin Kepplinger-Novakovic
E: martink@posteo.de
E: martin.kepplinger@puri.sm
W: http://www.martinkepplinger.com
P: 4096R/5AB387D3 F208 2B88 0F9E 4239 3468 6E3F 5003 98DF 5AB3 87D3
D: mma8452 accelerators iio driver
D: pegasus_notetaker input driver
D: imx8m media and hi846 sensor driver
D: Kernel fixes and cleanups
S: Garnisonstraße 26
S: 4020 Linz
S: Keplerstr. 6
S: 4050 Traun
S: Austria
N: Karl Keyte

View File

@ -0,0 +1,71 @@
NOTE: all the ABIs listed in this file are deprecated and will be removed after 2028.
Here are the alternative ABIs:
+------------------------------------+-----------------------------------------+
| Deprecated | Alternative |
+------------------------------------+-----------------------------------------+
| /sys/kernel/kexec_loaded | /sys/kernel/kexec/loaded |
+------------------------------------+-----------------------------------------+
| /sys/kernel/kexec_crash_loaded | /sys/kernel/kexec/crash_loaded |
+------------------------------------+-----------------------------------------+
| /sys/kernel/kexec_crash_size | /sys/kernel/kexec/crash_size |
+------------------------------------+-----------------------------------------+
| /sys/kernel/crash_elfcorehdr_size | /sys/kernel/kexec/crash_elfcorehdr_size |
+------------------------------------+-----------------------------------------+
| /sys/kernel/kexec_crash_cma_ranges | /sys/kernel/kexec/crash_cma_ranges |
+------------------------------------+-----------------------------------------+
What: /sys/kernel/kexec_loaded
Date: Jun 2006
Contact: kexec@lists.infradead.org
Description: read only
Indicates whether a new kernel image has been loaded
into memory using the kexec system call. It shows 1 if
a kexec image is present and ready to boot, or 0 if none
is loaded.
User: kexec tools, kdump service
What: /sys/kernel/kexec_crash_loaded
Date: Jun 2006
Contact: kexec@lists.infradead.org
Description: read only
Indicates whether a crash (kdump) kernel is currently
loaded into memory. It shows 1 if a crash kernel has been
successfully loaded for panic handling, or 0 if no crash
kernel is present.
User: Kexec tools, Kdump service
What: /sys/kernel/kexec_crash_size
Date: Dec 2009
Contact: kexec@lists.infradead.org
Description: read/write
Shows the amount of memory reserved for loading the crash
(kdump) kernel. It reports the size, in bytes, of the
crash kernel area defined by the crashkernel= parameter.
This interface also allows reducing the crashkernel
reservation by writing a smaller value, and the reclaimed
space is added back to the system RAM.
User: Kdump service
What: /sys/kernel/crash_elfcorehdr_size
Date: Aug 2023
Contact: kexec@lists.infradead.org
Description: read only
Indicates the preferred size of the memory buffer for the
ELF core header used by the crash (kdump) kernel. It defines
how much space is needed to hold metadata about the crashed
system, including CPU and memory information. This information
is used by the user space utility kexec to support updating the
in-kernel kdump image during hotplug operations.
User: Kexec tools
What: /sys/kernel/kexec_crash_cma_ranges
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read only
Provides information about the memory ranges reserved from
the Contiguous Memory Allocator (CMA) area that are allocated
to the crash (kdump) kernel. It lists the start and end physical
addresses of CMA regions assigned for crashkernel use.
User: kdump service

View File

@ -621,3 +621,84 @@ Description:
number extended capability. The file is read only and due to
the possible sensitivity of accessible serial numbers, admin
only.
What: /sys/bus/pci/devices/.../tsm/
Contact: linux-coco@lists.linux.dev
Description:
This directory only appears if a physical device function
supports authentication (PCIe CMA-SPDM), interface security
(PCIe TDISP), and is accepted for secure operation by the
platform TSM driver. This attribute directory appears
dynamically after the platform TSM driver loads. So, only after
the /sys/class/tsm/tsm0 device arrives can tools assume that
devices without a tsm/ attribute directory will never have one;
before that, the security capabilities of the device relative to
the platform TSM are unknown. See
Documentation/ABI/testing/sysfs-class-tsm.
What: /sys/bus/pci/devices/.../tsm/connect
Contact: linux-coco@lists.linux.dev
Description:
(RW) Write the name of a TSM (TEE Security Manager) device from
/sys/class/tsm to this file to establish a connection with the
device. This typically includes an SPDM (DMTF Security
Protocols and Data Models) session over PCIe DOE (Data Object
Exchange) and may also include PCIe IDE (Integrity and Data
Encryption) establishment. Reads from this attribute return the
name of the connected TSM or the empty string if not
connected. A TSM device signals its readiness to accept PCI
connection via a KOBJ_CHANGE event.
What: /sys/bus/pci/devices/.../tsm/disconnect
Contact: linux-coco@lists.linux.dev
Description:
(WO) Write the name of the TSM device that was specified
to 'connect' to tear down the connection.
What: /sys/bus/pci/devices/.../tsm/dsm
Contact: linux-coco@lists.linux.dev
Description: (RO) Return PCI device name of this device's DSM (Device
Security Manager). When a device is in the connected state it
indicates that the platform TSM (TEE Security Manager) has made
a secure-session connection with a device's DSM. A DSM is always
physical function 0 and when the device supports TDISP (TEE
Device Interface Security Protocol) its managed functions also
populate this tsm/dsm attribute. The managed functions of a DSM
are SR-IOV (Single Root I/O Virtualization) virtual functions,
non-zero functions of a multi-function device, or downstream
endpoints depending on whether the DSM is an SR-IOV physical
function, function0 of a multi-function device, or an upstream
PCIe switch port. This is a "link" TSM attribute, see
Documentation/ABI/testing/sysfs-class-tsm.
What: /sys/bus/pci/devices/.../tsm/bound
Contact: linux-coco@lists.linux.dev
Description: (RO) Return the device name of the TSM when the device is in a
TDISP (TEE Device Interface Security Protocol) operational state
(LOCKED, RUN, or ERROR, not UNLOCKED). Bound devices consume
platform TSM resources and depend on the device's configuration
(e.g. BME (Bus Master Enable) and MSE (Memory Space Enable)
among other settings) to remain stable for the duration of the
bound state. This attribute is only visible for devices that
support TDISP operation, and it is only populated after
successful connect and TSM bind. The TSM bind operation is
initiated by VFIO/IOMMUFD. This is a "link" TSM attribute, see
Documentation/ABI/testing/sysfs-class-tsm.
What: /sys/bus/pci/devices/.../authenticated
Contact: linux-pci@vger.kernel.org
Description:
When the device's tsm/ directory is present device
authentication (PCIe CMA-SPDM) and link encryption (PCIe IDE)
are handled by the platform TSM (TEE Security Manager). When the
tsm/ directory is not present this attribute reflects only the
native CMA-SPDM authentication state with the kernel's
certificate store.
If the attribute is not present, it indicates that
authentication is unsupported by the device, or the TSM has no
available authentication methods for the device.
When present and the tsm/ attribute directory is present, the
authenticated attribute is an alias for the device 'connect'
state. See the 'tsm/connect' attribute for more details.

View File

@ -0,0 +1,19 @@
What: /sys/class/tsm/tsmN
Contact: linux-coco@lists.linux.dev
Description:
"tsmN" is a device that represents the generic attributes of a
platform TEE Security Manager. It is typically a child of a
platform enumerated TSM device. /sys/class/tsm/tsmN/uevent
signals when the PCI layer is able to support establishment of
link encryption and other device-security features coordinated
through a platform tsm.
What: /sys/class/tsm/tsmN/streamH.R.E
Contact: linux-pci@vger.kernel.org
Description:
(RO) When a host bridge has established a secure connection via
the platform TSM, this symlink appears. The primary function of this
is to have a system global review of TSM resource consumption
across host bridges. The link points to the endpoint PCI device
and matches the same link published by the host bridge. See
Documentation/ABI/testing/sysfs-devices-pci-host-bridge.

View File

@ -0,0 +1,45 @@
What: /sys/devices/pciDDDD:BB
/sys/devices/.../pciDDDD:BB
Contact: linux-pci@vger.kernel.org
Description:
A PCI host bridge device parents a PCI bus device topology. PCI
controllers may also parent host bridges. The DDDD:BB format
conveys the PCI domain (ACPI segment) number and root bus number
(in hexadecimal) of the host bridge. Note that the domain number
may be larger than the 16-bits that the "DDDD" format implies
for emulated host-bridges.
What: pciDDDD:BB/firmware_node
Contact: linux-pci@vger.kernel.org
Description:
(RO) Symlink to the platform firmware device object "companion"
of the host bridge. For example, an ACPI device with an _HID of
PNP0A08 (/sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0A08:00). See
/sys/devices/pciDDDD:BB entry for details about the DDDD:BB
format.
What: pciDDDD:BB/streamH.R.E
Contact: linux-pci@vger.kernel.org
Description:
(RO) When a platform has established a secure connection, PCIe
IDE, between two Partner Ports, this symlink appears. A stream
consumes a Stream ID slot in each of the Host bridge (H), Root
Port (R) and Endpoint (E). The link points to the Endpoint PCI
device in the Selective IDE Stream pairing. Specifically, "R"
and "E" represent the assigned Selective IDE Stream Register
Block in the Root Port and Endpoint, and "H" represents a
platform specific pool of stream resources shared by the Root
Ports in a host bridge. See /sys/devices/pciDDDD:BB entry for
details about the DDDD:BB format.
What: pciDDDD:BB/available_secure_streams
Contact: linux-pci@vger.kernel.org
Description:
(RO) When a host bridge has Root Ports that support PCIe IDE
(link encryption and integrity protection) there may be a
limited number of Selective IDE Streams that can be used for
establishing new end-to-end secure links. This attribute
decrements upon secure link setup, and increments upon secure
link teardown. The in-use stream count is determined by counting
stream symlinks. See /sys/devices/pciDDDD:BB entry for details
about the DDDD:BB format.

View File

@ -0,0 +1,61 @@
What: /sys/kernel/kexec/*
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description:
The /sys/kernel/kexec/* directory contains sysfs files
that provide information about the configuration status
of kexec and kdump.
What: /sys/kernel/kexec/loaded
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read only
Indicates whether a new kernel image has been loaded
into memory using the kexec system call. It shows 1 if
a kexec image is present and ready to boot, or 0 if none
is loaded.
User: kexec tools, kdump service
What: /sys/kernel/kexec/crash_loaded
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read only
Indicates whether a crash (kdump) kernel is currently
loaded into memory. It shows 1 if a crash kernel has been
successfully loaded for panic handling, or 0 if no crash
kernel is present.
User: Kexec tools, Kdump service
What: /sys/kernel/kexec/crash_size
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read/write
Shows the amount of memory reserved for loading the crash
(kdump) kernel. It reports the size, in bytes, of the
crash kernel area defined by the crashkernel= parameter.
This interface also allows reducing the crashkernel
reservation by writing a smaller value, and the reclaimed
space is added back to the system RAM.
User: Kdump service
What: /sys/kernel/kexec/crash_elfcorehdr_size
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read only
Indicates the preferred size of the memory buffer for the
ELF core header used by the crash (kdump) kernel. It defines
how much space is needed to hold metadata about the crashed
system, including CPU and memory information. This information
is used by the user space utility kexec to support updating the
in-kernel kdump image during hotplug operations.
User: Kexec tools
What: /sys/kernel/kexec/crash_cma_ranges
Date: Nov 2025
Contact: kexec@lists.infradead.org
Description: read only
Provides information about the memory ranges reserved from
the Contiguous Memory Allocator (CMA) area that are allocated
to the crash (kdump) kernel. It lists the start and end physical
addresses of CMA regions assigned for crashkernel use.
User: kdump service

View File

@ -223,12 +223,13 @@ The flags are::
f Include the function name
s Include the source file name
l Include line number
d Include call trace
For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only
the ``p`` flag has meaning, other flags are ignored.
Note the regexp ``^[-+=][fslmpt_]+$`` matches a flags specification.
To clear all flags at once, use ``=_`` or ``-fslmpt``.
Note the regexp ``^[-+=][fslmptd_]+$`` matches a flags specification.
To clear all flags at once, use ``=_`` or ``-fslmptd``.
Debug messages during Boot Process

View File

@ -2114,14 +2114,20 @@ Kernel parameters
the added memory block itself is not affected.
hung_task_panic=
[KNL] Should the hung task detector generate panics.
Format: 0 | 1
[KNL] Number of hung tasks to trigger kernel panic.
Format: <int>
A value of 1 instructs the kernel to panic when a
hung task is detected. The default value is controlled
by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
option. The value selected by this boot parameter can
be changed later by the kernel.hung_task_panic sysctl.
When set to a non-zero value, a kernel panic will be triggered if
the number of detected hung tasks reaches this value.
0: don't panic
1: panic immediately on first hung task
N: panic after N hung tasks are detected in a single scan
The default value is controlled by the
CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value
selected by this boot parameter can be changed later by the
kernel.hung_task_panic sysctl.
hvc_iucv= [S390] Number of z/VM IUCV hypervisor console (HVC)
terminal devices. Valid values: 0..8

View File

@ -397,13 +397,14 @@ a hung task is detected.
hung_task_panic
===============
Controls the kernel's behavior when a hung task is detected.
When set to a non-zero value, a kernel panic will be triggered if the
number of hung tasks found during a single scan reaches this value.
This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
= =================================================
= =======================================================
0 Continue operation. This is the default behavior.
1 Panic immediately.
= =================================================
N Panic when N hung tasks are found during a single scan.
= =======================================================
hung_task_check_count
@ -421,6 +422,11 @@ the system boot.
This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled.
hung_task_sys_info
==================
A comma separated list of extra system information to be dumped when
a hung task is detected, for example, "tasks,mem,timers,locks,...".
Refer to the 'panic_sys_info' section below for more details.
hung_task_timeout_secs
======================
@ -515,6 +521,15 @@ default), only processes with the CAP_SYS_ADMIN capability may create
io_uring instances.
kernel_sys_info
===============
A comma separated list of extra system information to be dumped when
soft/hard lockup is detected, for example, "tasks,mem,timers,locks,...".
Refer to the 'panic_sys_info' section below for more details.
It serves as the default kernel control knob, which will take effect
when a kernel module calls sys_info() with parameter==0.
kexec_load_disabled
===================
@ -576,6 +591,11 @@ if leaking kernel pointer values to unprivileged users is a concern.
When ``kptr_restrict`` is set to 2, kernel pointers printed using
%pK will be replaced with 0s regardless of privileges.
softlockup_sys_info & hardlockup_sys_info
=========================================
A comma separated list of extra system information to be dumped when
soft/hard lockup is detected, for example, "tasks,mem,timers,locks,...".
Refer to the 'panic_sys_info' section below for more details.
modprobe
========
@ -910,8 +930,8 @@ to 'panic_print'. Possible values are:
============= ===================================================
tasks print all tasks info
mem print system memory info
timer print timers info
lock print locks info if CONFIG_LOCKDEP is on
timers print timers info
locks print locks info if CONFIG_LOCKDEP is on
ftrace print ftrace buffer
all_bt print all CPUs backtrace (if available in the arch)
blocked_tasks print only tasks in uninterruptible (blocked) state

View File

@ -416,7 +416,7 @@ Offset/size: 0x210/1
Protocol: 2.00+
============ ==================
If your boot loader has an assigned id (see table below), enter
If your boot loader has an assigned ID (see table below), enter
0xTV here, where T is an identifier for the boot loader and V is
a version number. Otherwise, enter 0xFF here.
@ -431,31 +431,31 @@ Protocol: 2.00+
ext_loader_type <- 0x05
ext_loader_ver <- 0x23
Assigned boot loader ids (hexadecimal):
Assigned boot loader IDs:
== =======================================
0 LILO
(0x00 reserved for pre-2.00 bootloader)
1 Loadlin
2 bootsect-loader
(0x20, all other values reserved)
3 Syslinux
4 Etherboot/gPXE/iPXE
5 ELILO
7 GRUB
8 U-Boot
9 Xen
A Gujin
B Qemu
C Arcturus Networks uCbootloader
D kexec-tools
E Extended (see ext_loader_type)
F Special (0xFF = undefined)
10 Reserved
11 Minimal Linux Bootloader
<http://sebastian-plotz.blogspot.de>
12 OVMF UEFI virtualization stack
13 barebox
0x0 LILO
(0x00 reserved for pre-2.00 bootloader)
0x1 Loadlin
0x2 bootsect-loader
(0x20, all other values reserved)
0x3 Syslinux
0x4 Etherboot/gPXE/iPXE
0x5 ELILO
0x7 GRUB
0x8 U-Boot
0x9 Xen
0xA Gujin
0xB Qemu
0xC Arcturus Networks uCbootloader
0xD kexec-tools
0xE Extended (see ext_loader_type)
0xF Special (0xFF = undefined)
0x10 Reserved
0x11 Minimal Linux Bootloader
<http://sebastian-plotz.blogspot.de>
0x12 OVMF UEFI virtualization stack
0x13 barebox
== =======================================
Please contact <hpa@zytor.com> if you need a bootloader ID value assigned.

View File

@ -138,6 +138,7 @@ Documents that don't fit elsewhere or which have yet to be categorized.
:maxdepth: 1
librs
liveupdate
netlink
.. only:: subproject and html

View File

@ -70,5 +70,5 @@ in the FDT. That state is called the KHO finalization phase.
Public API
==========
.. kernel-doc:: kernel/kexec_handover.c
.. kernel-doc:: kernel/liveupdate/kexec_handover.c
:export:

View File

@ -0,0 +1,61 @@
.. SPDX-License-Identifier: GPL-2.0
========================
Live Update Orchestrator
========================
:Author: Pasha Tatashin <pasha.tatashin@soleen.com>
.. kernel-doc:: kernel/liveupdate/luo_core.c
:doc: Live Update Orchestrator (LUO)
LUO Sessions
============
.. kernel-doc:: kernel/liveupdate/luo_session.c
:doc: LUO Sessions
LUO Preserving File Descriptors
===============================
.. kernel-doc:: kernel/liveupdate/luo_file.c
:doc: LUO File Descriptors
Live Update Orchestrator ABI
============================
.. kernel-doc:: include/linux/kho/abi/luo.h
:doc: Live Update Orchestrator ABI
The following types of file descriptors can be preserved
.. toctree::
:maxdepth: 1
../mm/memfd_preservation
Public API
==========
.. kernel-doc:: include/linux/liveupdate.h
.. kernel-doc:: include/linux/kho/abi/luo.h
:functions:
.. kernel-doc:: kernel/liveupdate/luo_core.c
:export:
.. kernel-doc:: kernel/liveupdate/luo_file.c
:export:
Internal API
============
.. kernel-doc:: kernel/liveupdate/luo_core.c
:internal:
.. kernel-doc:: kernel/liveupdate/luo_session.c
:internal:
.. kernel-doc:: kernel/liveupdate/luo_file.c
:internal:
See Also
========
- :doc:`Live Update uAPI </userspace-api/liveupdate>`
- :doc:`/core-api/kho/concepts`

View File

@ -1238,6 +1238,16 @@ Others
The patch file does not appear to be in unified-diff format. Please
regenerate the patch file before sending it to the maintainer.
**PLACEHOLDER_USE**
Detects unhandled placeholder text left in cover letters or commit headers/logs.
Common placeholders include lines like::
*** SUBJECT HERE ***
*** BLURB HERE ***
These typically come from autogenerated templates. Replace them with a proper
subject and description before sending.
**PRINTF_0XDECIMAL**
Prefixing 0x with decimal output is defective and should be corrected.

View File

@ -57,8 +57,7 @@ properties:
- prstb
- intb-only
timeout-sec:
maxItems: 2
timeout-sec: true
regulators:
$ref: /schemas/regulator/rohm,bd96801-regulator.yaml
@ -72,7 +71,10 @@ required:
- interrupt-names
- regulators
additionalProperties: false
allOf:
- $ref: /schemas/watchdog/watchdog.yaml
unevaluatedProperties: false
examples:
- |

View File

@ -24,6 +24,7 @@ properties:
- qcom,msm8998-adsp-pas
- qcom,msm8998-slpi-pas
- qcom,sdm660-adsp-pas
- qcom,sdm660-cdsp-pas
- qcom,sdm845-adsp-pas
- qcom,sdm845-cdsp-pas
- qcom,sdm845-slpi-pas
@ -31,9 +32,6 @@ properties:
reg:
maxItems: 1
cx-supply:
description: Phandle to the CX regulator
px-supply:
description: Phandle to the PX regulator
@ -69,6 +67,8 @@ allOf:
- qcom,msm8996-slpi-pil
- qcom,msm8998-adsp-pas
- qcom,msm8998-slpi-pas
- qcom,sdm660-adsp-pas
- qcom,sdm660-cdsp-pas
- qcom,sdm845-adsp-pas
- qcom,sdm845-cdsp-pas
- qcom,sdm845-slpi-pas
@ -93,6 +93,8 @@ allOf:
- qcom,msm8996-slpi-pil
- qcom,msm8998-adsp-pas
- qcom,msm8998-slpi-pas
- qcom,sdm660-adsp-pas
- qcom,sdm660-cdsp-pas
- qcom,sdm845-adsp-pas
- qcom,sdm845-cdsp-pas
- qcom,sdm845-slpi-pas
@ -103,16 +105,6 @@ allOf:
interrupt-names:
maxItems: 5
- if:
properties:
compatible:
contains:
enum:
- qcom,msm8974-adsp-pil
then:
required:
- cx-supply
- if:
properties:
compatible:
@ -120,8 +112,11 @@ allOf:
enum:
- qcom,msm8226-adsp-pil
- qcom,msm8953-adsp-pil
- qcom,msm8974-adsp-pil
- qcom,msm8996-adsp-pil
- qcom,msm8998-adsp-pas
- qcom,sdm660-adsp-pas
- qcom,sdm660-cdsp-pas
then:
properties:
power-domains:
@ -178,6 +173,7 @@ allOf:
- qcom,msm8998-adsp-pas
- qcom,msm8998-slpi-pas
- qcom,sdm660-adsp-pas
- qcom,sdm660-cdsp-pas
then:
properties:
qcom,qmp: false
@ -187,6 +183,7 @@ examples:
#include <dt-bindings/clock/qcom,rpmcc.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/power/qcom-rpmpd.h>
adsp {
compatible = "qcom,msm8974-adsp-pil";
@ -204,7 +201,8 @@ examples:
clocks = <&rpmcc RPM_CXO_CLK>;
clock-names = "xo";
cx-supply = <&pm8841_s2>;
power-domains = <&rpmpd MSM8974_VDDCX>;
power-domain-names = "cx";
memory-region = <&adsp_region>;

View File

@ -91,9 +91,13 @@ allOf:
power-domains:
items:
- description: NSP power domain
- description: CX power domain
- description: MXC power domain
power-domain-names:
items:
- const: nsp
- const: cx
- const: mxc
unevaluatedProperties: false

View File

@ -14,7 +14,11 @@ allOf:
properties:
compatible:
const: airoha,en7581-wdt
oneOf:
- items:
- const: airoha,an7583-wdt
- const: airoha,en7581-wdt
- const: airoha,en7581-wdt
reg:
maxItems: 1

View File

@ -15,6 +15,7 @@ properties:
- aspeed,ast2400-wdt
- aspeed,ast2500-wdt
- aspeed,ast2600-wdt
- aspeed,ast2700-wdt
reg:
maxItems: 1
@ -87,13 +88,15 @@ properties:
aspeed,reset-mask:
$ref: /schemas/types.yaml#/definitions/uint32-array
minItems: 1
maxItems: 2
maxItems: 5
description: >
A bitmask indicating which peripherals will be reset if the watchdog
timer expires. On AST2500 SoCs this should be a single word defined using
the AST2500_WDT_RESET_* macros; on AST2600 SoCs this should be a two-word
array with the first word defined using the AST2600_WDT_RESET1_* macros,
and the second word defined using the AST2600_WDT_RESET2_* macros.
and the second word defined using the AST2600_WDT_RESET2_* macros; on
AST2700 SoCs, this should be a five-word array from AST2700_WDT_RESET1_*
macros to AST2700_WDT_RESET5_* macros.
required:
- compatible
@ -114,6 +117,7 @@ allOf:
enum:
- aspeed,ast2500-wdt
- aspeed,ast2600-wdt
- aspeed,ast2700-wdt
- if:
required:
- aspeed,ext-active-high

View File

@ -0,0 +1,57 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/lantiq,wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Lantiq WDT watchdog
maintainers:
- Hauke Mehrtens <hauke@hauke-m.de>
properties:
compatible:
oneOf:
- enum:
- lantiq,falcon-wdt
- lantiq,wdt
- lantiq,xrx100-wdt
- items:
- enum:
- lantiq,xrx200-wdt
- const: lantiq,xrx100-wdt
reg:
maxItems: 1
lantiq,rcu:
$ref: /schemas/types.yaml#/definitions/phandle
description: Phandle to the RCU syscon node
required:
- compatible
- reg
allOf:
- $ref: watchdog.yaml#
- if:
properties:
compatible:
contains:
enum:
- lantiq,xrx100-wdt
- lantiq,falcon-wdt
then:
required:
- lantiq,rcu
unevaluatedProperties: false
examples:
- |
watchdog@803f0 {
compatible = "lantiq,xrx200-wdt", "lantiq,xrx100-wdt";
reg = <0x803f0 0x10>;
lantiq,rcu = <&rcu0>;
};

View File

@ -1,24 +0,0 @@
Lantiq WTD watchdog binding
============================
This describes the binding of the Lantiq watchdog driver.
-------------------------------------------------------------------------------
Required properties:
- compatible : Should be one of
"lantiq,wdt"
"lantiq,xrx100-wdt"
"lantiq,xrx200-wdt", "lantiq,xrx100-wdt"
"lantiq,falcon-wdt"
- reg : Address of the watchdog block
- lantiq,rcu : A phandle to the RCU syscon (required for
"lantiq,falcon-wdt" and "lantiq,xrx100-wdt")
-------------------------------------------------------------------------------
Example for the watchdog on the xRX200 SoCs:
watchdog@803f0 {
compatible = "lantiq,xrx200-wdt", "lantiq,xrx100-wdt";
reg = <0x803f0 0x10>;
lantiq,rcu = <&rcu0>;
};

View File

@ -4,7 +4,7 @@
$id: http://devicetree.org/schemas/watchdog/loongson,ls1x-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Loongson-1 Watchdog Timer
title: Loongson Watchdog Timer
maintainers:
- Keguang Zhang <keguang.zhang@gmail.com>
@ -17,6 +17,7 @@ properties:
enum:
- loongson,ls1b-wdt
- loongson,ls1c-wdt
- loongson,ls2k0300-wdt
reg:
maxItems: 1

View File

@ -1,45 +0,0 @@
* Marvell Orion Watchdog Time
Required Properties:
- Compatibility : "marvell,orion-wdt"
"marvell,armada-370-wdt"
"marvell,armada-xp-wdt"
"marvell,armada-375-wdt"
"marvell,armada-380-wdt"
- reg : Should contain two entries: first one with the
timer control address, second one with the
rstout enable address.
For "marvell,armada-375-wdt" and "marvell,armada-380-wdt":
- reg : A third entry is mandatory and should contain the
shared mask/unmask RSTOUT address.
Clocks required for compatibles = "marvell,orion-wdt",
"marvell,armada-370-wdt":
- clocks : Must contain a single entry describing the clock input
Clocks required for compatibles = "marvell,armada-xp-wdt"
"marvell,armada-375-wdt"
"marvell,armada-380-wdt":
- clocks : Must contain an entry for each entry in clock-names.
- clock-names : Must include the following entries:
"nbclk" (L2/coherency fabric clock),
"fixed" (Reference 25 MHz fixed-clock).
Optional properties:
- interrupts : Contains the IRQ for watchdog expiration
- timeout-sec : Contains the watchdog timeout in seconds
Example:
wdt@20300 {
compatible = "marvell,orion-wdt";
reg = <0x20300 0x28>, <0x20108 0x4>;
interrupts = <3>;
timeout-sec = <10>;
clocks = <&gate_clk 7>;
};

View File

@ -0,0 +1,100 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/marvell,orion-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Marvell Orion Watchdog Timer
maintainers:
- Andrew Lunn <andrew@lunn.ch>
- Gregory Clement <gregory.clement@bootlin.com>
properties:
compatible:
enum:
- marvell,orion-wdt
- marvell,armada-370-wdt
- marvell,armada-xp-wdt
- marvell,armada-375-wdt
- marvell,armada-380-wdt
reg:
minItems: 2
items:
- description: Timer control register address
- description: RSTOUT enable register address
- description: Shared mask/unmask RSTOUT register address
clocks:
minItems: 1
items:
- description: L2/coherency fabric clock input
- description: Reference 25 MHz fixed-clock supply
clock-names:
minItems: 1
items:
- const: nbclk
- const: fixed
interrupts:
minItems: 1
items:
- description: timeout
- description: pre-timeout
allOf:
- $ref: watchdog.yaml#
- if:
properties:
compatible:
contains:
enum:
- marvell,armada-375-wdt
- marvell,armada-380-wdt
then:
properties:
reg:
minItems: 3
else:
properties:
reg:
maxItems: 2
- if:
properties:
compatible:
contains:
enum:
- marvell,armada-xp-wdt
- marvell,armada-375-wdt
- marvell,armada-380-wdt
then:
properties:
clocks:
minItems: 2
clock-names:
minItems: 2
interrupts:
minItems: 2
required:
- clock-names
required:
- compatible
- reg
- clocks
unevaluatedProperties: false
examples:
- |
watchdog@20300 {
compatible = "marvell,orion-wdt";
reg = <0x20300 0x28>, <0x20108 0x4>;
interrupts = <3>;
timeout-sec = <10>;
clocks = <&gate_clk 7>;
};

View File

@ -41,6 +41,8 @@ properties:
- mediatek,mt7623-wdt
- mediatek,mt7629-wdt
- mediatek,mt8173-wdt
- mediatek,mt8188-wdt
- mediatek,mt8189-wdt
- mediatek,mt8365-wdt
- mediatek,mt8516-wdt
- const: mediatek,mt6589-wdt

View File

@ -1,15 +0,0 @@
TI Watchdog Timer (WDT) Controller for OMAP
Required properties:
- compatible : "ti,omap3-wdt" for OMAP3 or "ti,omap4-wdt" for OMAP4
- ti,hwmods : Name of the hwmod associated to the WDT
Optional properties:
- timeout-sec : default watchdog timeout in seconds
Examples:
wdt2: wdt@4a314000 {
compatible = "ti,omap4-wdt", "ti,omap3-wdt";
ti,hwmods = "wd_timer2";
};

View File

@ -22,6 +22,7 @@ properties:
- qcom,apss-wdt-ipq5332
- qcom,apss-wdt-ipq5424
- qcom,apss-wdt-ipq9574
- qcom,apss-wdt-kaanapali
- qcom,apss-wdt-msm8226
- qcom,apss-wdt-msm8974
- qcom,apss-wdt-msm8994

View File

@ -0,0 +1,99 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/renesas,r9a09g057-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas RZ/V2H(P) Watchdog Timer (WDT) Controller
maintainers:
- Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,r9a09g047-wdt # RZ/G3E
- renesas,r9a09g056-wdt # RZ/V2N
- const: renesas,r9a09g057-wdt # RZ/V2H(P)
- items:
- const: renesas,r9a09g087-wdt # RZ/N2H
- const: renesas,r9a09g077-wdt # RZ/T2H
- enum:
- renesas,r9a09g057-wdt # RZ/V2H(P)
- renesas,r9a09g077-wdt # RZ/T2H
reg:
minItems: 1
maxItems: 2
clocks:
minItems: 1
items:
- description: Register access clock
- description: Main clock
clock-names:
minItems: 1
items:
- const: pclk
- const: oscclk
power-domains:
maxItems: 1
resets:
maxItems: 1
timeout-sec: true
required:
- compatible
- reg
- clocks
- clock-names
- power-domains
allOf:
- $ref: watchdog.yaml#
- if:
properties:
compatible:
contains:
const: renesas,r9a09g057-wdt
then:
properties:
reg:
maxItems: 1
clocks:
minItems: 2
clock-names:
minItems: 2
else:
properties:
clocks:
maxItems: 1
clock-names:
maxItems: 1
reg:
minItems: 2
resets: false
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/renesas,r9a09g057-cpg.h>
watchdog@11c00400 {
compatible = "renesas,r9a09g057-wdt";
reg = <0x11c00400 0x400>;
clocks = <&cpg CPG_MOD 0x4b>, <&cpg CPG_MOD 0x4c>;
clock-names = "pclk", "oscclk";
resets = <&cpg 0x75>;
power-domains = <&cpg>;
};

View File

@ -0,0 +1,114 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/renesas,rcar-gen3-wwdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas Window Watchdog Timer (WWDT) Controller
maintainers:
- Wolfram Sang <wsa+renesas@sang-engineering.com>
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,r8a77970-wwdt # R-Car V3M
- renesas,r8a77980-wwdt # R-Car V3H
- const: renesas,rcar-gen3-wwdt
- items:
- enum:
- renesas,r8a779a0-wwdt # R-Car V3U
- renesas,r8a779f0-wwdt # R-Car S4
- renesas,r8a779g0-wwdt # R-Car V4H
- renesas,r8a779h0-wwdt # R-Car V4M
- const: renesas,rcar-gen4-wwdt
reg:
maxItems: 1
interrupts:
items:
- description: Pretimeout, 75% of overflow reached
- description: Error occurred
interrupt-names:
items:
- const: pretimeout
- const: error
clocks:
items:
- description: Counting clock
- description: Bus clock
clock-names:
items:
- const: cnt
- const: bus
resets:
minItems: 1
maxItems: 2
reset-names:
minItems: 1
items:
- const: cnt
- const: bus
power-domains:
maxItems: 1
required:
- compatible
- reg
- interrupts
- interrupt-names
- clocks
- clock-names
- resets
- reset-names
- power-domains
allOf:
- $ref: watchdog.yaml#
- if:
properties:
compatible:
contains:
enum:
- renesas,r8a779a0-wwdt
- renesas,r8a779f0-wwdt
then:
properties:
resets:
minItems: 2
reset-names:
minItems: 2
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/r8a779g0-cpg-mssr.h>
#include <dt-bindings/power/r8a779g0-sysc.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
watchdog@ffc90000 {
compatible = "renesas,r8a779g0-wwdt",
"renesas,rcar-gen4-wwdt";
reg = <0xffc90000 0x10>;
interrupts = <GIC_SPI 310 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 311 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "pretimeout", "error";
clocks = <&cpg CPG_CORE R8A779G0_CLK_R>,
<&cpg CPG_CORE R8A779G0_CLK_SASYNCRT>;
clock-names = "cnt", "bus";
power-domains = <&sysc R8A779G0_PD_ALWAYS_ON>;
resets = <&cpg 1200>;
reset-names = "cnt";
};

View File

@ -0,0 +1,51 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/renesas,rza-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas RZ/A Watchdog Timer (WDT) Controller
maintainers:
- Wolfram Sang <wsa+renesas@sang-engineering.com>
properties:
compatible:
items:
- enum:
- renesas,r7s72100-wdt # RZ/A1
- renesas,r7s9210-wdt # RZ/A2
- const: renesas,rza-wdt # RZ/A
reg:
maxItems: 1
interrupts:
maxItems: 1
clocks:
maxItems: 1
timeout-sec: true
required:
- compatible
- reg
- clocks
allOf:
- $ref: watchdog.yaml#
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/r7s72100-clock.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
watchdog@fcfe0000 {
compatible = "renesas,r7s72100-wdt", "renesas,rza-wdt";
reg = <0xfcfe0000 0x6>;
interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&p0_clk>;
};

View File

@ -0,0 +1,111 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/renesas,rzg2l-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas RZ/G2L Watchdog Timer (WDT) Controller
maintainers:
- Biju Das <biju.das.jz@bp.renesas.com>
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,r9a07g043-wdt # RZ/G2UL and RZ/Five
- renesas,r9a07g044-wdt # RZ/G2{L,LC}
- renesas,r9a07g054-wdt # RZ/V2L
- renesas,r9a08g045-wdt # RZ/G3S
- const: renesas,rzg2l-wdt
- items:
- const: renesas,r9a09g011-wdt # RZ/V2M
- const: renesas,rzv2m-wdt # RZ/V2M
reg:
maxItems: 1
interrupts:
minItems: 1
items:
- description: Timeout
- description: Parity error
interrupt-names:
minItems: 1
items:
- const: wdt
- const: perrout
clocks:
items:
- description: Register access clock
- description: Main clock
clock-names:
items:
- const: pclk
- const: oscclk
power-domains:
maxItems: 1
resets:
maxItems: 1
timeout-sec: true
required:
- compatible
- reg
- interrupts
- clocks
- clock-names
- power-domains
- resets
allOf:
- $ref: watchdog.yaml#
- if:
properties:
compatible:
contains:
const: renesas,rzg2l-wdt
then:
properties:
interrupts:
minItems: 2
interrupt-names:
minItems: 2
required:
- interrupt-names
else:
properties:
interrupts:
maxItems: 1
interrupt-names:
maxItems: 1
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/r9a07g044-cpg.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
watchdog@12800800 {
compatible = "renesas,r9a07g044-wdt",
"renesas,rzg2l-wdt";
reg = <0x12800800 0x400>;
clocks = <&cpg CPG_MOD R9A07G044_WDT0_PCLK>,
<&cpg CPG_MOD R9A07G044_WDT0_CLK>;
clock-names = "pclk", "oscclk";
interrupts = <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>,
<GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "wdt", "perrout";
resets = <&cpg R9A07G044_WDT0_PRESETN>;
power-domains = <&cpg>;
};

View File

@ -0,0 +1,50 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/renesas,rzn1-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: Renesas RZ/N1 Watchdog Timer (WDT) Controller
maintainers:
- Wolfram Sang <wsa+renesas@sang-engineering.com>
properties:
compatible:
items:
- const: renesas,r9a06g032-wdt # RZ/N1D
- const: renesas,rzn1-wdt # RZ/N1
reg:
maxItems: 1
interrupts:
maxItems: 1
clocks:
maxItems: 1
timeout-sec: true
required:
- compatible
- reg
- interrupts
- clocks
allOf:
- $ref: watchdog.yaml#
additionalProperties: false
examples:
- |
#include <dt-bindings/clock/r9a06g032-sysctrl.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
watchdog@40008000 {
compatible = "renesas,r9a06g032-wdt", "renesas,rzn1-wdt";
reg = <0x40008000 0x1000>;
interrupts = <GIC_SPI 73 IRQ_TYPE_EDGE_RISING>;
clocks = <&sysctrl R9A06G032_CLK_WATCHDOG>;
};

View File

@ -13,30 +13,6 @@ maintainers:
properties:
compatible:
oneOf:
- items:
- enum:
- renesas,r7s72100-wdt # RZ/A1
- renesas,r7s9210-wdt # RZ/A2
- const: renesas,rza-wdt # RZ/A
- items:
- enum:
- renesas,r9a06g032-wdt # RZ/N1D
- const: renesas,rzn1-wdt # RZ/N1
- items:
- enum:
- renesas,r9a07g043-wdt # RZ/G2UL and RZ/Five
- renesas,r9a07g044-wdt # RZ/G2{L,LC}
- renesas,r9a07g054-wdt # RZ/V2L
- renesas,r9a08g045-wdt # RZ/G3S
- const: renesas,rzg2l-wdt
- items:
- enum:
- renesas,r9a09g011-wdt # RZ/V2M
- const: renesas,rzv2m-wdt # RZ/V2M
- items:
- enum:
- renesas,r8a7742-wdt # RZ/G1H
@ -75,47 +51,14 @@ properties:
- renesas,r8a779h0-wdt # R-Car V4M
- const: renesas,rcar-gen4-wdt # R-Car Gen4
- items:
- enum:
- renesas,r9a09g047-wdt # RZ/G3E
- renesas,r9a09g056-wdt # RZ/V2N
- const: renesas,r9a09g057-wdt # RZ/V2H(P)
- enum:
- renesas,r9a09g057-wdt # RZ/V2H(P)
- renesas,r9a09g077-wdt # RZ/T2H
- items:
- const: renesas,r9a09g087-wdt # RZ/N2H
- const: renesas,r9a09g077-wdt # RZ/T2H
reg:
minItems: 1
maxItems: 2
maxItems: 1
interrupts:
minItems: 1
items:
- description: Timeout
- description: Parity error
interrupt-names:
minItems: 1
items:
- const: wdt
- const: perrout
maxItems: 1
clocks:
minItems: 1
items:
- description: Register access clock
- description: Main clock
clock-names:
minItems: 1
items:
- const: pclk
- const: oscclk
maxItems: 1
power-domains:
maxItems: 1
@ -129,6 +72,8 @@ required:
- compatible
- reg
- clocks
- interrupts
- power-domains
allOf:
- $ref: watchdog.yaml#
@ -138,90 +83,11 @@ allOf:
properties:
compatible:
contains:
enum:
- renesas,r9a09g077-wdt
- renesas,rza-wdt
- renesas,rzn1-wdt
const: renesas,r8a77980-wdt
then:
required:
- power-domains
- resets
- if:
properties:
compatible:
contains:
enum:
- renesas,r9a09g057-wdt
- renesas,rzg2l-wdt
- renesas,rzv2m-wdt
then:
properties:
clocks:
minItems: 2
clock-names:
minItems: 2
required:
- clock-names
else:
properties:
clocks:
maxItems: 1
- if:
properties:
compatible:
contains:
enum:
- renesas,rzg2l-wdt
then:
properties:
interrupts:
minItems: 2
interrupt-names:
minItems: 2
required:
- interrupt-names
else:
properties:
interrupts:
maxItems: 1
- if:
properties:
compatible:
contains:
enum:
- renesas,r9a09g057-wdt
- renesas,r9a09g077-wdt
then:
properties:
interrupts: false
interrupt-names: false
else:
required:
- interrupts
- if:
properties:
compatible:
contains:
const: renesas,r9a09g077-wdt
then:
properties:
resets: false
clock-names:
maxItems: 1
reg:
minItems: 2
required:
- clock-names
- power-domains
else:
properties:
reg:
maxItems: 1
additionalProperties: false
examples:

View File

@ -28,6 +28,7 @@ properties:
- rockchip,rk3328-wdt
- rockchip,rk3368-wdt
- rockchip,rk3399-wdt
- rockchip,rk3506-wdt
- rockchip,rk3562-wdt
- rockchip,rk3568-wdt
- rockchip,rk3576-wdt

View File

@ -0,0 +1,51 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
$id: http://devicetree.org/schemas/watchdog/ti,omap2-wdt.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
title: TI OMAP Watchdog Timer Controller
maintainers:
- Aaro Koskinen <aaro.koskinen@iki.fi>
allOf:
- $ref: watchdog.yaml#
properties:
compatible:
oneOf:
- enum:
- ti,omap2-wdt
- ti,omap3-wdt
- items:
- enum:
- ti,am4372-wdt
- ti,omap4-wdt
- ti,omap5-wdt
- const: ti,omap3-wdt
reg:
maxItems: 1
interrupts:
maxItems: 1
ti,hwmods:
description: Name of the hardware module associated with the watchdog.
$ref: /schemas/types.yaml#/definitions/string
deprecated: true
required:
- compatible
- reg
unevaluatedProperties: false
examples:
- |
watchdog@48314000 {
compatible = "ti,omap3-wdt";
reg = <0x48314000 0x80>;
ti,hwmods = "wd_timer2";
};

View File

@ -21,9 +21,10 @@ select:
properties:
$nodename:
pattern: "^(timer|watchdog)(@.*|-([0-9]|[1-9][0-9]+))?$"
pattern: "^(pmic|timer|watchdog)(@.*|-([0-9]|[1-9][0-9]+))?$"
timeout-sec:
maxItems: 1
description:
Contains the watchdog timeout in seconds.

View File

@ -0,0 +1,60 @@
.. SPDX-License-Identifier: GPL-2.0
=================================================
Recoverable Hardware Error Tracking in vmcoreinfo
=================================================
Overview
--------
This feature provides a generic infrastructure within the Linux kernel to track
and log recoverable hardware errors. These are recoverable hardware errors
visible to the kernel that might not cause an immediate panic but may influence
system health, mainly because new code paths will be executed in the kernel.
By recording counts and timestamps of recoverable errors into the vmcoreinfo
crash dump notes, this infrastructure aids post-mortem crash analysis tools in
correlating hardware events with kernel failures. This enables faster triage
and better understanding of root causes, especially in large-scale cloud
environments where hardware issues are common.
Benefits
--------
- Facilitates correlation of hardware recoverable errors with kernel panics or
unusual code paths that lead to system crashes.
- Provides operators and cloud providers quick insights, improving reliability
and reducing troubleshooting time.
- Complements existing full hardware diagnostics without replacing them.
Data Exposure and Consumption
-----------------------------
- The tracked error data consists of per-error-type counts and timestamps of
last occurrence.
- This data is stored in the `hwerror_data` array, categorized by error source
types like CPU, memory, PCI, CXL, and others.
- It is exposed via vmcoreinfo crash dump notes and can be read using tools
like `crash`, `drgn`, or other kernel crash analysis utilities.
- There is no way to read this data other than from crash dumps.
- These errors are divided by area, which includes CPU, Memory, PCI, CXL and
others.
Typical usage example (in drgn REPL):
.. code-block:: python
>>> prog['hwerror_data']
(struct hwerror_info[HWERR_RECOV_MAX]){
{
.count = (int)844,
.timestamp = (time64_t)1752852018,
},
...
}
Enabling
--------
- This feature is enabled when CONFIG_VMCORE_INFO is set.

View File

@ -97,6 +97,7 @@ Subsystem-specific APIs
gpio/index
hsi
hte/index
hw-recoverable-errors
i2c
iio/index
infiniband

View File

@ -83,7 +83,7 @@ flags, and the remaining form the internal block number.
======== =============================================================
Bit Description
======== =============================================================
31 - 30 Error and Zero flags - Used in the following way::
31 - 30 Error and Zero flags - Used in the following way:
== == ====================================================
31 30 Description

View File

@ -10,6 +10,7 @@ The Linux PCI driver implementer's API guide
pci
p2pdma
tsm
.. only:: subproject and html

View File

@ -0,0 +1,21 @@
.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>
========================================================
PCI Trusted Execution Environment Security Manager (TSM)
========================================================
Subsystem Interfaces
====================
.. kernel-doc:: include/linux/pci-ide.h
:internal:
.. kernel-doc:: drivers/pci/ide.c
:export:
.. kernel-doc:: include/linux/pci-tsm.h
:internal:
.. kernel-doc:: drivers/pci/tsm.c
:export:

View File

@ -13,5 +13,6 @@ NFS
rpc-cache
rpc-server-gss
nfs41-server
nfsd-io-modes
knfsd-stats
reexport

View File

@ -0,0 +1,153 @@
.. SPDX-License-Identifier: GPL-2.0
=============
NFSD IO MODES
=============
Overview
========
NFSD has historically always used buffered IO when servicing READ and
WRITE operations. BUFFERED is NFSD's default IO mode, but it is possible
to override that default to use either DONTCACHE or DIRECT IO modes.
Experimental NFSD debugfs interfaces are available to allow the NFSD IO
mode used for READ and WRITE to be configured independently. See both:
- /sys/kernel/debug/nfsd/io_cache_read
- /sys/kernel/debug/nfsd/io_cache_write
The default value for both io_cache_read and io_cache_write reflects
NFSD's default IO mode (which is NFSD_IO_BUFFERED=0).
Based on the configured settings, NFSD's IO will either be:
- cached using page cache (NFSD_IO_BUFFERED=0)
- cached but removed from page cache on completion (NFSD_IO_DONTCACHE=1)
- not cached stable_how=NFS_UNSTABLE (NFSD_IO_DIRECT=2)
To set an NFSD IO mode, write a supported value (0 - 2) to the
corresponding IO operation's debugfs interface, e.g.::
echo 2 > /sys/kernel/debug/nfsd/io_cache_read
echo 2 > /sys/kernel/debug/nfsd/io_cache_write
To check which IO mode NFSD is using for READ or WRITE, simply read the
corresponding IO operation's debugfs interface, e.g.::
cat /sys/kernel/debug/nfsd/io_cache_read
cat /sys/kernel/debug/nfsd/io_cache_write
If you experiment with NFSD's IO modes on a recent kernel and have
interesting results, please report them to linux-nfs@vger.kernel.org.
NFSD DONTCACHE
==============
DONTCACHE offers a hybrid approach to servicing IO that aims to offer
the benefits of using DIRECT IO without any of the strict alignment
requirements that DIRECT IO imposes. To achieve this, buffered IO is used
but the IO is flagged to "drop behind" (meaning associated pages are
dropped from the page cache) when IO completes.
DONTCACHE aims to avoid what has proven to be a fairly significant
limitation of Linux's memory management subsystem if/when large amounts of
data are infrequently accessed (e.g. read once _or_ written once but not
read until much later). Such use-cases are particularly problematic
because the page cache will eventually become a bottleneck to servicing
new IO requests.
For more context on DONTCACHE, please see these Linux commit headers:
- Overview: 9ad6344568cc3 ("mm/filemap: change filemap_create_folio()
to take a struct kiocb")
- for READ: 8026e49bff9b1 ("mm/filemap: add read support for
RWF_DONTCACHE")
- for WRITE: 974c5e6139db3 ("xfs: flag as supporting FOP_DONTCACHE")
NFSD_IO_DONTCACHE will fall back to NFSD_IO_BUFFERED if the underlying
filesystem doesn't indicate support by setting FOP_DONTCACHE.
NFSD DIRECT
===========
DIRECT IO doesn't make use of the page cache; as such, it is able to
avoid the Linux memory management's page reclaim scalability problems
without resorting to the hybrid use of page cache that DONTCACHE does.
Some workloads benefit from NFSD avoiding the page cache, particularly
those with a working set that is significantly larger than available
system memory. The pathological worst-case workload that NFSD DIRECT has
proven to help most is: NFS client issuing large sequential IO to a file
that is 2-3 times larger than the NFS server's available system memory.
The reason for such improvement is NFSD DIRECT eliminates a lot of work
that the memory management subsystem would otherwise be required to
perform (e.g. page allocation, dirty writeback, page reclaim). When
using NFSD DIRECT, kswapd and kcompactd are no longer commanding CPU
time trying to find adequate free pages so that forward IO progress can
be made.
The performance win associated with using NFSD DIRECT was previously
discussed on linux-nfs, see:
https://lore.kernel.org/linux-nfs/aEslwqa9iMeZjjlV@kernel.org/
But in summary:
- NFSD DIRECT can significantly reduce memory requirements
- NFSD DIRECT can reduce CPU load by avoiding costly page reclaim work
- NFSD DIRECT can offer more deterministic IO performance
As always, your mileage may vary and so it is important to carefully
consider if/when it is beneficial to make use of NFSD DIRECT. When
assessing comparative performance of your workload please be sure to log
relevant performance metrics during testing (e.g. memory usage, cpu
usage, IO performance). Using perf to collect perf data that may be used
to generate a "flamegraph" for work Linux must perform on behalf of your
test is a really meaningful way to compare the relative health of the
system and how switching NFSD's IO mode changes what is observed.
If NFSD_IO_DIRECT is specified by writing 2 (or 3 and 4 for WRITE) to
NFSD's debugfs interfaces, ideally the IO will be aligned relative to
the underlying block device's logical_block_size. Also the memory buffer
used to store the READ or WRITE payload must be aligned relative to the
underlying block device's dma_alignment.
But NFSD DIRECT does handle misaligned IO in terms of O_DIRECT as best
it can:
Misaligned READ:
If NFSD_IO_DIRECT is used, expand any misaligned READ to the next
DIO-aligned block (on either end of the READ). The expanded READ is
verified to have proper offset/len (logical_block_size) and
dma_alignment checking.
Misaligned WRITE:
If NFSD_IO_DIRECT is used, split any misaligned WRITE into a start,
middle and end as needed. The large middle segment is DIO-aligned
and the start and/or end are misaligned. Buffered IO is used for the
misaligned segments and O_DIRECT is used for the middle DIO-aligned
segment. DONTCACHE buffered IO is _not_ used for the misaligned
segments because using normal buffered IO offers significant RMW
performance benefit when handling streaming misaligned WRITEs.
Tracing:
The nfsd_read_direct trace event shows how NFSD expands any
misaligned READ to the next DIO-aligned block (on either end of the
original READ, as needed).
This combination of trace events is useful for READs::
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_vector/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_direct/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_read_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_read/enable
The nfsd_write_direct trace event shows how NFSD splits a given
misaligned WRITE into a DIO-aligned middle segment.
This combination of trace events is useful for WRITEs::
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_opened/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_direct/enable
echo 1 > /sys/kernel/tracing/events/nfsd/nfsd_write_io_done/enable
echo 1 > /sys/kernel/tracing/events/xfs/xfs_file_direct_write/enable

View File

@ -0,0 +1,547 @@
NFSD Maintainer Entry Profile
=============================
A Maintainer Entry Profile supplements the top-level process
documents (found in Documentation/process/) with customs that are
specific to a subsystem and its maintainers. A contributor may use
this document to set their expectations and avoid common mistakes.
A maintainer may use these profiles to look across subsystems for
opportunities to converge on best common practices.
Overview
--------
The Network File System (NFS) is a standardized family of network
protocols that enable access to files across a set of
network-connected peer hosts. Applications on NFS clients access files that
reside on file systems that are shared by NFS servers. A single
network peer can act as both an NFS client and an NFS server.
NFSD refers to the NFS server implementation included in the Linux
kernel. An in-kernel NFS server has fast access to files stored
in file systems local to that server. NFSD can share files stored
on most of the file system types native to Linux, including xfs,
ext4, btrfs, and tmpfs.
Mailing list
------------
The linux-nfs@vger.kernel.org mailing list is a public list. Its
purpose is to enable collaboration among developers working on the
Linux NFS stack, both client and server. It is not a place for
conversations that are not related directly to the Linux NFS stack.
The linux-nfs mailing list is archived on `lore.kernel.org <https://lore.kernel.org/linux-nfs/>`_.
The Linux NFS community does not have any chat room.
Reporting bugs
--------------
If you experience an NFSD-related bug on a distribution-built
kernel, please start by working with your Linux distributor.
Bug reports against upstream Linux code bases are welcome on the
linux-nfs@vger.kernel.org mailing list, where some active triage
can be done. NFSD bugs may also be reported in the Linux kernel
community's bugzilla at:
https://bugzilla.kernel.org
Please file NFSD-related bugs under the "Filesystems/NFSD"
component. In general, including as much detail as possible is a
good start, including pertinent system log messages from both
the client and server.
User space software related to NFSD, such as mountd or the exportfs
command, is contained in the nfs-utils package. Report problems
with those components to linux-nfs@vger.kernel.org. You might be
directed to move the report to a specific bug tracker.
Contributor's Guide
-------------------
Standards compliance
~~~~~~~~~~~~~~~~~~~~
The priority is for NFSD to interoperate fully with the Linux NFS
client. We also test against other popular NFS client
implementations regularly at NFS bake-a-thon events (also known as
plugfests). Non-Linux NFS clients are not part of upstream NFSD CI/CD.
The NFSD community strives to provide an NFS server implementation
that interoperates with all standards-compliant NFS client
implementations. This is done by staying as close as is sensible to
the normative mandates in the IETF's published NFS, RPC, and GSS-API
standards.
It is always useful to reference an RFC and section number in a code
comment where behavior deviates from the standard (and even when the
behavior is compliant but the implementation is obfuscatory).
On the rare occasion when a deviation from standard-mandated
behavior is needed, brief documentation of the use case or
deficiencies in the standard is a required part of in-code
documentation.
Care must always be taken to avoid leaking local error codes (i.e.,
errnos) to clients of NFSD. A proper NFS status code is always
required in NFS protocol replies.
NFSD administrative interfaces
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
NFSD administrative interfaces include:
- an NFSD or SUNRPC module parameter
- export options in /etc/exports
- files under /proc/fs/nfsd/ or /proc/sys/sunrpc/
- the NFSD netlink protocol
Frequently, a request is made to introduce or modify one of NFSD's
traditional administrative interfaces. Certainly it is technically
easy to introduce a new administrative setting. However, there are
good reasons why the NFSD maintainers prefer to leave that as a last
resort:
- As with any API, administrative interfaces are difficult to get
right.
- Once they are documented and have a legacy of use, administrative
interfaces become difficult to modify or remove.
- Every new administrative setting multiplies the NFSD test matrix.
- The cost of one administrative interface is incremental, but costs
add up across all of the existing interfaces.
It is often better for everyone if effort is made up front to
understand the underlying requirement of the new setting, and
then trying to make it tune itself (or to become otherwise
unnecessary).
If a new setting is indeed necessary, first consider adding it to
the NFSD netlink protocol. Or if it doesn't need to be a reliable
long term user space feature, it can be added to NFSD's menagerie of
experimental settings which reside under /sys/kernel/debug/nfsd/ .
Field observability
~~~~~~~~~~~~~~~~~~~
NFSD employs several different mechanisms for observing operation,
including counters, printks, WARNings, and static trace points. Each
has its strengths and weaknesses. Contributors should select the
most appropriate tool for their task.
- BUG must be avoided if at all possible, as it will frequently
result in a full system crash.
- WARN is appropriate only when a full stack trace is useful.
- printk can show detailed information. These must not be used
in code paths where they can be triggered repeatedly by remote
users.
- dprintk can show detailed information, but can be enabled only
in pre-set groups. The overhead of emitting output makes dprintk
inappropriate for frequent operations like I/O.
- Counters are always on, but provide little information about
individual events other than how frequently they occur.
- static trace points can be enabled individually or in groups
(via a glob). These are generally low overhead, and thus are
favored for use in hot paths.
- dynamic tracing, such as kprobes or eBPF, is quite flexible but
cannot be used in certain environments (e.g., full kernel
lockdown).
Testing
~~~~~~~
The kdevops project
https://github.com/linux-kdevops/kdevops
contains several NFS-specific workflows, as well as the community
standard fstests suite. These workflows are based on open source
testing tools such as ltp and fio. Contributors are encouraged to
use these tools without kdevops, or contributors should install and
use kdevops themselves to verify their patches before submission.
Coding style
~~~~~~~~~~~~
Follow the coding style preferences described in
Documentation/process/coding-style.rst
with the following exceptions:
- Add new local variables to a function in reverse Christmas tree
order
- Use the kdoc comment style for
+ non-static functions
+ static inline functions
+ static functions that are callbacks/virtual functions
- All new function names start with ``nfsd_`` for non-NFS-version-
specific functions.
- New function names that are specific to NFSv2 or NFSv3, or are
used by all minor versions of NFSv4, use ``nfsdN_`` where N is
the version.
- New function names specific to an NFSv4 minor version can be
named with ``nfsd4M_`` where M is the minor version.
Patch preparation
~~~~~~~~~~~~~~~~~
Read and follow all guidelines in
Documentation/process/submitting-patches.rst
Use tagging to identify all patch authors. However, reviewers and
testers should be added by replying to the email patch submission.
Email is extensively used in order to publicly archive review and
testing attributions. These tags are automatically inserted into
your patches when they are applied.
The code in the body of the diff already shows /what/ is being
changed. Thus it is not necessary to repeat that in the patch
description. Instead, the description should contain one or more
of:
- A brief problem statement ("what is this patch trying to fix?")
with a root-cause analysis.
- End-user visible symptoms or items that a support engineer might
use to search for the patch, like stack traces.
- A brief explanation of why the patch is the best way to address
the problem.
- Any context that reviewers might need to understand the changes
made by the patch.
- Any relevant benchmarking results, and/or functional test results.
As detailed in Documentation/process/submitting-patches.rst,
identify the point in history that the issue being addressed was
introduced by using a Fixes: tag.
Mention in the patch description if that point in history cannot be
determined -- that is, no Fixes: tag can be provided. In this case,
please make it clear to maintainers whether an LTS backport is
needed even though there is no Fixes: tag.
The NFSD maintainers prefer to add stable tagging themselves, after
public discussion in response to the patch submission. Contributors
may suggest stable tagging, but be aware that many version
management tools add such stable Cc's when you post your patches.
Don't add "Cc: stable" unless you are absolutely sure the patch
needs to go to stable during the initial submission process.
Patch submission
~~~~~~~~~~~~~~~~
Patches to NFSD are submitted via the kernel's email-based review
process that is common to most other kernel subsystems.
Just before each submission, rebase your patch or series on the
nfsd-testing branch at
https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
The NFSD subsystem is maintained separately from the Linux in-kernel
NFS client. The NFSD maintainers do not normally take submissions
for client changes, nor can they respond authoritatively to bug
reports or feature requests for NFS client code.
This means that contributors might be asked to resubmit patches if
they were emailed to the incorrect set of maintainers and reviewers.
This is not a rejection, but simply a correction of the submission
process.
When in doubt, consult the NFSD entry in the MAINTAINERS file to
see which files and directories fall under the NFSD subsystem.
The proper set of email addresses for NFSD patches is:
To: the NFSD maintainers and reviewers listed in MAINTAINERS
Cc: linux-nfs@vger.kernel.org and optionally linux-kernel@
If there are other subsystems involved in the patches (for example
MM or RDMA) their primary mailing list address can be included in
the Cc: field. Other contributors and interested parties may be
included there as well.
In general we prefer that contributors use common patch email tools
such as "git send-email" or "stg email format/send", which tend to
get the details right without a lot of fuss.
A series consisting of a single patch is not required to have a
cover letter. However, a cover letter can be included if there is
substantial context that is not appropriate to include in the
patch description.
Please note that, with an e-mail based submission process, series
cover letters are not part of the work that is committed to the
kernel source code base or its commit history. Therefore always try
to keep pertinent information in the patch descriptions.
Design documentation is welcome, but as cover letters are not
preserved, a perhaps better option is to include a patch that adds
such documentation under Documentation/filesystems/nfs/.
Reviewers will ask about test coverage and what use cases the
patches are expected to address. Please be prepared to answer these
questions.
Review comments from maintainers might be politely stated, but in
general, these are not optional to address when they are actionable.
If necessary, the maintainers retain the right to not apply patches
when contributors refuse to address reasonable requests.
Post changes to kernel source code and user space source code as
separate series. You can connect the two series with comments in
your cover letters.
Generally the NFSD maintainers ask for a repost even for simple
modifications in order to publicly archive the request and the
resulting repost before it is pulled into the NFSD trees. This
also enables us to rebuild a patch series quickly without missing
changes that might have been discussed via email.
Avoid frequently reposting large series with only small changes. As
a rule of thumb, posting substantial changes more than once a week
will result in reviewer overload.
Remember, there are only a handful of subsystem maintainers and
reviewers, but potentially many sources of contributions. The
maintainers and reviewers, therefore, are always the less scalable
resource. Be kind to your friendly neighborhood maintainer.
Patch Acceptance
~~~~~~~~~~~~~~~~
There isn't a formal review process for NFSD, but we like to see
at least two Reviewed-by: notices for patches that are more than
simple clean-ups. Reviews are done in public on
linux-nfs@vger.kernel.org and are archived on lore.kernel.org.
Currently the NFSD patch queues are maintained in branches here:
https://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
The NFSD maintainers apply patches initially to the nfsd-testing
branch, which is always open to new submissions. Patches can be
applied while review is ongoing. nfsd-testing is a topic branch,
so it can change frequently, it will be rebased, and your patch
might get dropped if there is a problem with it.
Generally a script-generated "thank you" email will indicate when
your patch has been added to the nfsd-testing branch. You can track
the progress of your patch using the linux-nfs patchwork instance:
https://patchwork.kernel.org/project/linux-nfs/list/
While your patch is in nfsd-testing, it is exposed to a variety of
test environments, including community zero-day bots, static
analysis tools, and NFSD continuous integration testing. The soak
period is three to four weeks.
Each patch that survives in nfsd-testing for the soak period without
changes is moved to the nfsd-next branch.
The nfsd-next branch is automatically merged into linux-next and
fs-next on a nightly basis.
Patches that survive in nfsd-next are included in the next NFSD
merge window pull request. These windows typically occur once every
63 days (nine weeks).
When the upstream merge window closes, the nfsd-next branch is
renamed nfsd-fixes, and a new nfsd-next branch is created, based on
the upstream -rc1 tag.
Fixes that are destined for an upstream -rc release also run the
nfsd-testing gauntlet, but are then applied to the nfsd-fixes
branch. That branch is made available for Linus to pull after a
short time. In order to limit the risk of introducing regressions,
we limit such fixes to emergency situations or fixes to breakage
that occurred during the most recent upstream merge.
Please make it clear when submitting an emergency patch that
immediate action (either application to -rc or LTS backport) is
needed.
Sensitive patch submissions and bug reports
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CVEs are generated by specific members of the Linux kernel community
and several external entities. The Linux NFS community does not emit
or assign CVEs. CVEs are assigned after an issue and its fix are
known.
However, the NFSD maintainers sometimes receive sensitive security
reports, and at times these are significant enough to need to be
embargoed. In such rare cases, fixes can be developed and reviewed
out of the public eye.
Please be aware that many version management tools add the stable
Cc's when you post your patches. This is generally a nuisance, but
it can result in outing an embargoed security issue accidentally.
Don't add "Cc: stable" unless you are absolutely sure the patch
needs to go to stable@ during the initial submission process.
Patches that are merged without ever appearing on any list, and
which carry a Reported-by: or Fixes: tag are detected as suspicious
by security-focused people. We encourage that, after any private
review, security-sensitive patches should be posted to linux-nfs@
for the usual public review, archiving, and test period.
LLM-generated submissions
~~~~~~~~~~~~~~~~~~~~~~~~~
The Linux kernel community as a whole is still exploring the new
world of LLM-generated code. The NFSD maintainers will entertain
submission of patches that are partially or wholly generated by
LLM-based development tools. Such submissions are held to the
same standards as submissions created entirely by human authors:
- The human contributor identifies themselves via a Signed-off-by:
tag. This tag counts as a Developer's Certificate of Origin (DCO).
- The human contributor is solely responsible for code provenance
and any contamination by inadvertently-included code with a
conflicting license, as usual.
- The human contributor must be able to answer and address review
questions. A patch description such as "This fixed my problem
but I don't know why" is not acceptable.
- The contribution is subjected to the same test regimen as all
other submissions.
- An indication (via a Generated-by: tag or otherwise) that the
contribution is LLM-generated is not required.
It is easy to address review comments and fix requests in LLM
generated code. So easy, in fact, that it becomes tempting to repost
refreshed code immediately. Please resist that temptation.
As always, please avoid reposting series revisions more than once
every 24 hours.
Clean-up patches
~~~~~~~~~~~~~~~~
The NFSD maintainers discourage patches which perform simple
clean-ups that are not in the context of other work. For example:
* Addressing ``checkpatch.pl`` warnings after merge
* Addressing :ref:`Local variable ordering<rcs>` issues
* Addressing long-standing whitespace damage
This is because it is felt that the churn that such changes produce
comes at a greater cost than the value of such clean-ups.
Conversely, spelling and grammar fixes are encouraged.
Stable and LTS support
----------------------
Upstream NFSD continuous integration testing runs against LTS trees
whenever they are updated.
Please indicate when a patch containing a fix needs to be considered
for LTS kernels, either via a Fixes: tag or explicit mention.
Feature requests
----------------
There is no one way to make an official feature request, but
discussion about the request should eventually make its way to
the linux-nfs@vger.kernel.org mailing list for public review by
the community.
Subsystem boundaries
~~~~~~~~~~~~~~~~~~~~
NFSD itself is not much more than a protocol engine. This means its
primary responsibility is to translate the NFS protocol into API
calls in the Linux kernel. For example, NFSD is not responsible for
knowing exactly how bytes or file attributes are managed on a block
device. It relies on other kernel subsystems for that.
If the subsystems on which NFSD relies do not implement a particular
feature, even if the standard NFS protocols do support that feature,
that usually means NFSD cannot provide that feature without
substantial development work in other areas of the kernel.
Specificity
~~~~~~~~~~~
Feature requests can come from anywhere, and thus can often be
nebulous. A requester might not understand what a "use case" or
"user story" is. These descriptive paradigms are often used by
developers and architects to understand what is required of a
design, but are terms of art in the software trade, not used in
the everyday world.
In order to prevent contributors and maintainers from becoming
overwhelmed, we won't be afraid of saying "no" politely to
underspecified requests.
Community roles and their authority
-----------------------------------
The purpose of Linux subsystem communities is to provide expertise
and active stewardship of a narrow set of source files in the Linux
kernel. This can include managing user space tooling as well.
To contextualize the structure of the Linux NFS community that
is responsible for stewardship of the NFS server code base, we
define the community roles here.
- **Contributor** : Anyone who submits a code change, bug fix,
recommendation, documentation fix, and so on. A contributor can
submit regularly or infrequently.
- **Outside Contributor** : A contributor who is not a regular actor
in the Linux NFS community. This can mean someone who contributes
to other parts of the kernel, or someone who just noticed a
misspelling in a comment and sent a patch.
- **Reviewer** : Someone who is named in the MAINTAINERS file as a
reviewer is an area expert who can request changes to contributed
code, and expects that contributors will address the request.
- **External Reviewer** : Someone who is not named in the
MAINTAINERS file as a reviewer, but who is an area expert.
Examples include Linux kernel contributors with networking,
security, or persistent storage expertise, or developers who
contribute primarily to other NFS implementations.
One or more people will take on the following roles. These people
are often generically referred to as "maintainers", and are
identified in the MAINTAINERS file with the "M:" tag under the NFSD
subsystem.
- **Upstream Release Manager** : This role is responsible for
curating contributions into a branch, reviewing test results, and
then sending a pull request during merge windows. There is a
trust relationship between the release manager and Linus.
- **Bug Triager** : Someone who is a first responder to bug reports
submitted to the linux-nfs mailing list or bug trackers, and helps
troubleshoot and identify next steps.
- **Security Lead** : The security lead handles contacts from the
security community to resolve immediate issues, as well as dealing
with long-term security issues such as supply chain concerns. For
upstream, that's usually whether contributions violate licensing
or other intellectual property agreements.
- **Testing Lead** : The testing lead builds and runs the test
infrastructure for the subsystem. The testing lead may ask for
patches to be dropped because of ongoing high defect rates.
- **LTS Maintainer** : The LTS maintainer is responsible for managing
the Fixes: and Cc: stable annotations on patches, and seeing that
patches that cannot be automatically applied to LTS kernels get
proper manual backports as necessary.
- **Community Manager** : This umpire role can be asked to call balls
and strikes during conflicts, but is also responsible for ensuring
the health of the relationships within the community and for
facilitating discussions on long-term topics such as how to manage
growing technical debt.

View File

@ -110,5 +110,6 @@ to do something different in the near future.
../process/maintainer-netdev
../driver-api/vfio-pci-device-specific-driver-acceptance
../nvme/feature-and-quirk-policy
../filesystems/nfs/nfsd-maintainer-entry-profile
../filesystems/xfs/xfs-maintainer-entry-profile
../mm/damon/maintainer-profile

View File

@ -48,6 +48,7 @@ documentation, or deleted if it has served its purpose.
hugetlbfs_reserv
ksm
memory-model
memfd_preservation
mmu_notifier
multigen_lru
numa

View File

@ -0,0 +1,23 @@
.. SPDX-License-Identifier: GPL-2.0-or-later
==========================
Memfd Preservation via LUO
==========================
.. kernel-doc:: mm/memfd_luo.c
:doc: Memfd Preservation via LUO
Memfd Preservation ABI
======================
.. kernel-doc:: include/linux/kho/abi/memfd.h
:doc: DOC: memfd Live Update ABI
.. kernel-doc:: include/linux/kho/abi/memfd.h
:internal:
See Also
========
- :doc:`/core-api/liveupdate`
- :doc:`/core-api/kho/concepts`

View File

@ -7,7 +7,7 @@ Landlock LSM: kernel documentation
==================================
:Author: Mickaël Salaün
:Date: March 2025
:Date: September 2025
Landlock's goal is to create scoped access-control (i.e. sandboxing). To
harden a whole system, this feature should be available to any process,
@ -110,6 +110,12 @@ Filesystem
.. kernel-doc:: security/landlock/fs.h
:identifiers:
Process credential
------------------
.. kernel-doc:: security/landlock/cred.h
:identifiers:
Ruleset and domain
------------------
@ -128,6 +134,9 @@ makes the reasoning much easier and helps avoid pitfalls.
.. kernel-doc:: security/landlock/ruleset.h
:identifiers:
.. kernel-doc:: security/landlock/domain.h
:identifiers:
Additional documentation
========================

View File

@ -61,6 +61,7 @@ Everything else
:maxdepth: 1
ELF
liveupdate
netlink/index
sysfs-platform_profile
vduse

View File

@ -385,6 +385,8 @@ Code Seq# Include File Comments
0xB8 01-02 uapi/misc/mrvl_cn10k_dpi.h Marvell CN10K DPI driver
0xB8 all uapi/linux/mshv.h Microsoft Hyper-V /dev/mshv driver
<mailto:linux-hyperv@vger.kernel.org>
0xBA 00-0F uapi/linux/liveupdate.h Pasha Tatashin
<mailto:pasha.tatashin@soleen.com>
0xC0 00-0F linux/usb/iowarrior.h
0xCA 00-0F uapi/misc/cxl.h Dead since 6.15
0xCA 10-2F uapi/misc/ocxl.h

View File

@ -0,0 +1,20 @@
.. SPDX-License-Identifier: GPL-2.0
================
Live Update uAPI
================
:Author: Pasha Tatashin <pasha.tatashin@soleen.com>
ioctl interface
===============
.. kernel-doc:: kernel/liveupdate/luo_core.c
:doc: LUO ioctl Interface
ioctl uAPI
===========
.. kernel-doc:: include/uapi/linux/liveupdate.h
See Also
========
- :doc:`Live Update Orchestrator </core-api/liveupdate>`

View File

@ -4432,6 +4432,7 @@ F: arch/*/lib/bitops.c
F: include/asm-generic/bitops
F: include/asm-generic/bitops.h
F: include/linux/bitops.h
F: lib/hweight.c
F: lib/test_bitops.c
F: tools/*/bitops*
@ -11658,7 +11659,7 @@ T: git git://linuxtv.org/media.git
F: drivers/media/i2c/hi556.c
HYNIX HI846 SENSOR DRIVER
M: Martin Kepplinger <martin.kepplinger@puri.sm>
M: Martin Kepplinger-Novakovic <martink@posteo.de>
L: linux-media@vger.kernel.org
S: Maintained
F: drivers/media/i2c/hi846.c
@ -11743,6 +11744,7 @@ HUNG TASK DETECTOR
M: Andrew Morton <akpm@linux-foundation.org>
R: Lance Yang <lance.yang@linux.dev>
R: Masami Hiramatsu <mhiramat@kernel.org>
R: Petr Mladek <pmladek@suse.com>
L: linux-kernel@vger.kernel.org
S: Maintained
F: include/linux/hung_task.h
@ -13653,6 +13655,7 @@ R: Dai Ngo <Dai.Ngo@oracle.com>
R: Tom Talpey <tom@talpey.com>
L: linux-nfs@vger.kernel.org
S: Supported
P: Documentation/filesystems/nfs/nfsd-maintainer-entry-profile.rst
B: https://bugzilla.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux.git
F: Documentation/filesystems/nfs/
@ -13672,6 +13675,10 @@ F: include/uapi/linux/sunrpc/
F: net/sunrpc/
F: tools/net/sunrpc/
KERNEL NFSD BLOCK and SCSI LAYOUT DRIVER
R: Christoph Hellwig <hch@lst.de>
F: fs/nfsd/blocklayout*
KERNEL PACMAN PACKAGING (in addition to generic KERNEL BUILD)
M: Thomas Weißschuh <linux@weissschuh.net>
R: Christian Heusel <christian@heusel.eu>
@ -13885,14 +13892,15 @@ F: kernel/kexec*
KEXEC HANDOVER (KHO)
M: Alexander Graf <graf@amazon.com>
M: Mike Rapoport <rppt@kernel.org>
M: Changyuan Lyu <changyuanl@google.com>
M: Pasha Tatashin <pasha.tatashin@soleen.com>
R: Pratyush Yadav <pratyush@kernel.org>
L: kexec@lists.infradead.org
L: linux-mm@kvack.org
S: Maintained
F: Documentation/admin-guide/mm/kho.rst
F: Documentation/core-api/kho/*
F: include/linux/kexec_handover.h
F: kernel/kexec_handover.c
F: kernel/liveupdate/kexec_handover*
F: lib/test_kho.c
F: tools/testing/selftests/kho/
@ -14561,6 +14569,22 @@ F: samples/livepatch/
F: scripts/livepatch/
F: tools/testing/selftests/livepatch/
LIVE UPDATE
M: Pasha Tatashin <pasha.tatashin@soleen.com>
M: Mike Rapoport <rppt@kernel.org>
R: Pratyush Yadav <pratyush@kernel.org>
L: linux-kernel@vger.kernel.org
S: Maintained
F: Documentation/core-api/liveupdate.rst
F: Documentation/mm/memfd_preservation.rst
F: Documentation/userspace-api/liveupdate.rst
F: include/linux/liveupdate.h
F: include/linux/liveupdate/
F: include/uapi/linux/liveupdate.h
F: kernel/liveupdate/
F: mm/memfd_luo.c
F: tools/testing/selftests/liveupdate/
LLC (802.2)
L: netdev@vger.kernel.org
S: Odd fixes
@ -15662,7 +15686,7 @@ F: include/media/imx.h
MEDIA DRIVERS FOR FREESCALE IMX7/8
M: Rui Miguel Silva <rmfrfs@gmail.com>
M: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
M: Martin Kepplinger <martin.kepplinger@puri.sm>
M: Martin Kepplinger-Novakovic <martink@posteo.de>
R: Purism Kernel Team <kernel@puri.sm>
R: Frank Li <Frank.Li@nxp.com>
L: imx@lists.linux.dev
@ -17522,6 +17546,7 @@ M: Luis Chamberlain <mcgrof@kernel.org>
M: Petr Pavlu <petr.pavlu@suse.com>
M: Daniel Gomez <da.gomez@kernel.org>
R: Sami Tolvanen <samitolvanen@google.com>
R: Aaron Tomlin <atomlin@atomlin.com>
L: linux-modules@vger.kernel.org
L: linux-kernel@vger.kernel.org
S: Maintained
@ -17531,6 +17556,8 @@ F: include/linux/module*.h
F: kernel/module/
F: lib/test_kmod.c
F: lib/tests/module/
F: rust/kernel/module_param.rs
F: rust/macros/module.rs
F: scripts/module*
F: tools/testing/selftests/kmod/
F: tools/testing/selftests/module/
@ -18411,10 +18438,11 @@ F: net/sunrpc/
NILFS2 FILESYSTEM
M: Ryusuke Konishi <konishi.ryusuke@gmail.com>
M: Viacheslav Dubeyko <slava@dubeyko.com>
L: linux-nilfs@vger.kernel.org
S: Supported
S: Maintained
W: https://nilfs.sourceforge.io/
T: git https://github.com/konis/nilfs2.git
T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/nilfs2.git
F: Documentation/filesystems/nilfs2.rst
F: fs/nilfs2/
F: include/trace/events/nilfs2.h
@ -20089,6 +20117,7 @@ Q: https://patchwork.kernel.org/project/linux-pci/list/
B: https://bugzilla.kernel.org
C: irc://irc.oftc.net/linux-pci
T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git
F: Documentation/ABI/testing/sysfs-devices-pci-host-bridge
F: Documentation/PCI/
F: Documentation/devicetree/bindings/pci/
F: arch/x86/kernel/early-quirks.c
@ -25093,7 +25122,6 @@ F: drivers/regulator/sy8106a-regulator.c
SYNC FILE FRAMEWORK
M: Sumit Semwal <sumit.semwal@linaro.org>
R: Gustavo Padovan <gustavo@padovan.org>
L: linux-media@vger.kernel.org
L: dri-devel@lists.freedesktop.org
S: Maintained
@ -26298,7 +26326,7 @@ M: Jarkko Sakkinen <jarkko@kernel.org>
R: Jason Gunthorpe <jgg@ziepe.ca>
L: linux-integrity@vger.kernel.org
S: Maintained
W: https://codeberg.org/jarkko/linux-tpmdd-test
W: https://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd-test.git/about/
Q: https://patchwork.kernel.org/project/linux-integrity/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git
F: Documentation/devicetree/bindings/tpm/
@ -26388,14 +26416,16 @@ M: David Lechner <dlechner@baylibre.com>
S: Maintained
F: Documentation/devicetree/bindings/trigger-source/*
TRUSTED SECURITY MODULE (TSM) INFRASTRUCTURE
TRUSTED EXECUTION ENVIRONMENT SECURITY MANAGER (TSM)
M: Dan Williams <dan.j.williams@intel.com>
L: linux-coco@lists.linux.dev
S: Maintained
F: Documentation/ABI/testing/configfs-tsm-report
F: Documentation/driver-api/coco/
F: Documentation/driver-api/pci/tsm.rst
F: drivers/pci/tsm.c
F: drivers/virt/coco/guest/
F: include/linux/tsm*.h
F: include/linux/*tsm*.h
F: samples/tsm-mr/
TRUSTED SERVICES TEE DRIVER

View File

@ -232,17 +232,14 @@ config HAVE_EFFICIENT_UNALIGNED_ACCESS
config ARCH_USE_BUILTIN_BSWAP
bool
help
Modern versions of GCC (since 4.4) have builtin functions
for handling byte-swapping. Using these, instead of the old
inline assembler that the architecture code provides in the
__arch_bswapXX() macros, allows the compiler to see what's
happening and offers more opportunity for optimisation. In
particular, the compiler will be able to combine the byteswap
with a nearby load or store and use load-and-swap or
store-and-swap instructions if the architecture has them. It
should almost *never* result in code which is worse than the
hand-coded assembler in <asm/swab.h>. But just in case it
does, the use of the builtins is optional.
GCC and Clang have builtin functions for handling byte-swapping.
Using these allows the compiler to see what's happening and
offers more opportunity for optimisation. In particular, the
compiler will be able to combine the byteswap with a nearby load
or store and use load-and-swap or store-and-swap instructions if
the architecture has them. It should almost *never* result in code
which is worse than the hand-coded assembler in <asm/swab.h>.
But just in case it does, the use of the builtins is optional.
Any architecture with load-and-swap or store-and-swap
instructions should set this. And it shouldn't hurt to set it

View File

@ -224,28 +224,26 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask)
until either pci_unmap_single or pci_dma_sync_single is performed. */
static dma_addr_t
pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
pci_map_single_1(struct pci_dev *pdev, phys_addr_t paddr, size_t size,
int dac_allowed)
{
struct pci_controller *hose = pdev ? pdev->sysdata : pci_isa_hose;
dma_addr_t max_dma = pdev ? pdev->dma_mask : ISA_DMA_MASK;
unsigned long offset = offset_in_page(paddr);
struct pci_iommu_arena *arena;
long npages, dma_ofs, i;
unsigned long paddr;
dma_addr_t ret;
unsigned int align = 0;
struct device *dev = pdev ? &pdev->dev : NULL;
paddr = __pa(cpu_addr);
#if !DEBUG_NODIRECT
/* First check to see if we can use the direct map window. */
if (paddr + size + __direct_map_base - 1 <= max_dma
&& paddr + size <= __direct_map_size) {
ret = paddr + __direct_map_base;
DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %ps\n",
cpu_addr, size, ret, __builtin_return_address(0));
DBGA2("pci_map_single: [%pa,%zx] -> direct %llx from %ps\n",
&paddr, size, ret, __builtin_return_address(0));
return ret;
}
@ -255,8 +253,8 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
if (dac_allowed) {
ret = paddr + alpha_mv.pci_dac_offset;
DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %ps\n",
cpu_addr, size, ret, __builtin_return_address(0));
DBGA2("pci_map_single: [%pa,%zx] -> DAC %llx from %ps\n",
&paddr, size, ret, __builtin_return_address(0));
return ret;
}
@ -290,10 +288,10 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size,
arena->ptes[i + dma_ofs] = mk_iommu_pte(paddr);
ret = arena->dma_base + dma_ofs * PAGE_SIZE;
ret += (unsigned long)cpu_addr & ~PAGE_MASK;
ret += offset;
DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %ps\n",
cpu_addr, size, npages, ret, __builtin_return_address(0));
DBGA2("pci_map_single: [%pa,%zx] np %ld -> sg %llx from %ps\n",
&paddr, size, npages, ret, __builtin_return_address(0));
return ret;
}
@ -322,19 +320,18 @@ static struct pci_dev *alpha_gendev_to_pci(struct device *dev)
return NULL;
}
static dma_addr_t alpha_pci_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
static dma_addr_t alpha_pci_map_phys(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
struct pci_dev *pdev = alpha_gendev_to_pci(dev);
int dac_allowed;
BUG_ON(dir == DMA_NONE);
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
return pci_map_single_1(pdev, (char *)page_address(page) + offset,
size, dac_allowed);
dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
return pci_map_single_1(pdev, phys, size, dac_allowed);
}
/* Unmap a single streaming mode DMA translation. The DMA_ADDR and
@ -343,7 +340,7 @@ static dma_addr_t alpha_pci_map_page(struct device *dev, struct page *page,
the cpu to the buffer are guaranteed to see whatever the device
wrote there. */
static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr,
static void alpha_pci_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@ -353,8 +350,6 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr,
struct pci_iommu_arena *arena;
long dma_ofs, npages;
BUG_ON(dir == DMA_NONE);
if (dma_addr >= __direct_map_base
&& dma_addr < __direct_map_base + __direct_map_size) {
/* Nothing to do. */
@ -429,7 +424,7 @@ try_again:
}
memset(cpu_addr, 0, size);
*dma_addrp = pci_map_single_1(pdev, cpu_addr, size, 0);
*dma_addrp = pci_map_single_1(pdev, virt_to_phys(cpu_addr), size, 0);
if (*dma_addrp == DMA_MAPPING_ERROR) {
free_pages((unsigned long)cpu_addr, order);
if (alpha_mv.mv_pci_tbi || (gfp & GFP_DMA))
@ -643,9 +638,8 @@ static int alpha_pci_map_sg(struct device *dev, struct scatterlist *sg,
/* Fast path single entry scatterlists. */
if (nents == 1) {
sg->dma_length = sg->length;
sg->dma_address
= pci_map_single_1(pdev, SG_ENT_VIRT_ADDRESS(sg),
sg->length, dac_allowed);
sg->dma_address = pci_map_single_1(pdev, sg_phys(sg),
sg->length, dac_allowed);
if (sg->dma_address == DMA_MAPPING_ERROR)
return -EIO;
return 1;
@ -917,8 +911,8 @@ iommu_unbind(struct pci_iommu_arena *arena, long pg_start, long pg_count)
const struct dma_map_ops alpha_pci_ops = {
.alloc = alpha_pci_alloc_coherent,
.free = alpha_pci_free_coherent,
.map_page = alpha_pci_map_page,
.unmap_page = alpha_pci_unmap_page,
.map_phys = alpha_pci_map_phys,
.unmap_phys = alpha_pci_unmap_phys,
.map_sg = alpha_pci_map_sg,
.unmap_sg = alpha_pci_unmap_sg,
.dma_supported = alpha_pci_supported,

View File

@ -1161,8 +1161,6 @@ config AEABI
disambiguate both ABIs and allow for backward compatibility support
(selected with CONFIG_OABI_COMPAT).
To use this you need GCC version 4.0.0 or later.
config OABI_COMPAT
bool "Allow old ABI binaries to run with this kernel (EXPERIMENTAL)"
depends on AEABI && !THUMB2_KERNEL

View File

@ -308,7 +308,7 @@ CONFIG_PANIC_ON_OOPS=y
CONFIG_PANIC_TIMEOUT=-1
CONFIG_SOFTLOCKUP_DETECTOR=y
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1
CONFIG_WQ_WATCHDOG=y
# CONFIG_SCHED_DEBUG is not set
CONFIG_FUNCTION_TRACER=y

View File

@ -624,16 +624,14 @@ static void __arm_dma_free(struct device *dev, size_t size, void *cpu_addr,
kfree(buf);
}
static void dma_cache_maint_page(struct page *page, unsigned long offset,
size_t size, enum dma_data_direction dir,
static void dma_cache_maint_page(phys_addr_t phys, size_t size,
enum dma_data_direction dir,
void (*op)(const void *, size_t, int))
{
unsigned long pfn;
unsigned long offset = offset_in_page(phys);
unsigned long pfn = __phys_to_pfn(phys);
size_t left = size;
pfn = page_to_pfn(page) + offset / PAGE_SIZE;
offset %= PAGE_SIZE;
/*
* A single sg entry may refer to multiple physically contiguous
* pages. But we still need to process highmem pages individually.
@ -644,17 +642,18 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
size_t len = left;
void *vaddr;
page = pfn_to_page(pfn);
if (PageHighMem(page)) {
phys = __pfn_to_phys(pfn);
if (PhysHighMem(phys)) {
if (len + offset > PAGE_SIZE)
len = PAGE_SIZE - offset;
if (cache_is_vipt_nonaliasing()) {
vaddr = kmap_atomic(page);
vaddr = kmap_atomic_pfn(pfn);
op(vaddr + offset, len, dir);
kunmap_atomic(vaddr);
} else {
struct page *page = phys_to_page(phys);
vaddr = kmap_high_get(page);
if (vaddr) {
op(vaddr + offset, len, dir);
@ -662,7 +661,8 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
}
}
} else {
vaddr = page_address(page) + offset;
phys += offset;
vaddr = phys_to_virt(phys);
op(vaddr, len, dir);
}
offset = 0;
@ -676,14 +676,11 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
* Note: Drivers should NOT use this function directly.
* Use the driver DMA support - see dma-mapping.h (dma_sync_*)
*/
static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
size_t size, enum dma_data_direction dir)
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
phys_addr_t paddr;
dma_cache_maint_page(paddr, size, dir, dmac_map_area);
dma_cache_maint_page(page, off, size, dir, dmac_map_area);
paddr = page_to_phys(page) + off;
if (dir == DMA_FROM_DEVICE) {
outer_inv_range(paddr, paddr + size);
} else {
@ -692,17 +689,15 @@ static void __dma_page_cpu_to_dev(struct page *page, unsigned long off,
/* FIXME: non-speculating: flush on bidirectional mappings? */
}
static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
size_t size, enum dma_data_direction dir)
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
phys_addr_t paddr = page_to_phys(page) + off;
/* FIXME: non-speculating: not required */
/* in any case, don't bother invalidating if DMA to device */
if (dir != DMA_TO_DEVICE) {
outer_inv_range(paddr, paddr + size);
dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
dma_cache_maint_page(paddr, size, dir, dmac_unmap_area);
}
/*
@ -737,6 +732,9 @@ static int __dma_info_to_prot(enum dma_data_direction dir, unsigned long attrs)
if (attrs & DMA_ATTR_PRIVILEGED)
prot |= IOMMU_PRIV;
if (attrs & DMA_ATTR_MMIO)
prot |= IOMMU_MMIO;
switch (dir) {
case DMA_BIDIRECTIONAL:
return prot | IOMMU_READ | IOMMU_WRITE;
@ -1205,7 +1203,7 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg,
unsigned int len = PAGE_ALIGN(s->offset + s->length);
if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
__dma_page_cpu_to_dev(sg_page(s), s->offset, s->length, dir);
arch_sync_dma_for_device(sg_phys(s), s->length, dir);
prot = __dma_info_to_prot(dir, attrs);
@ -1307,8 +1305,7 @@ static void arm_iommu_unmap_sg(struct device *dev,
__iommu_remove_mapping(dev, sg_dma_address(s),
sg_dma_len(s));
if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
__dma_page_dev_to_cpu(sg_page(s), s->offset,
s->length, dir);
arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
}
}
@ -1330,7 +1327,7 @@ static void arm_iommu_sync_sg_for_cpu(struct device *dev,
return;
for_each_sg(sg, s, nents, i)
__dma_page_dev_to_cpu(sg_page(s), s->offset, s->length, dir);
arch_sync_dma_for_cpu(sg_phys(s), s->length, dir);
}
@ -1352,29 +1349,31 @@ static void arm_iommu_sync_sg_for_device(struct device *dev,
return;
for_each_sg(sg, s, nents, i)
__dma_page_cpu_to_dev(sg_page(s), s->offset, s->length, dir);
arch_sync_dma_for_device(sg_phys(s), s->length, dir);
}
/**
* arm_iommu_map_page
* arm_iommu_map_phys
* @dev: valid struct device pointer
* @page: page that buffer resides in
* @offset: offset into page for start of buffer
* @phys: physical address that buffer resides in
* @size: size of buffer to map
* @dir: DMA transfer direction
* @attrs: DMA mapping attributes
*
* IOMMU aware version of arm_dma_map_page()
*/
static dma_addr_t arm_iommu_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
static dma_addr_t arm_iommu_map_phys(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
int len = PAGE_ALIGN(size + offset_in_page(phys));
phys_addr_t addr = phys & PAGE_MASK;
dma_addr_t dma_addr;
int ret, prot, len = PAGE_ALIGN(size + offset);
int ret, prot;
if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
__dma_page_cpu_to_dev(page, offset, size, dir);
if (!dev->dma_coherent &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
arch_sync_dma_for_device(phys, size, dir);
dma_addr = __alloc_iova(mapping, len);
if (dma_addr == DMA_MAPPING_ERROR)
@ -1382,12 +1381,11 @@ static dma_addr_t arm_iommu_map_page(struct device *dev, struct page *page,
prot = __dma_info_to_prot(dir, attrs);
ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len,
prot, GFP_KERNEL);
ret = iommu_map(mapping->domain, dma_addr, addr, len, prot, GFP_KERNEL);
if (ret < 0)
goto fail;
return dma_addr + offset;
return dma_addr + offset_in_page(phys);
fail:
__free_iova(mapping, dma_addr, len);
return DMA_MAPPING_ERROR;
@ -1399,100 +1397,45 @@ fail:
* @handle: DMA address of buffer
* @size: size of buffer (same as passed to dma_map_page)
* @dir: DMA transfer direction (same as passed to dma_map_page)
* @attrs: DMA mapping attributes
*
* IOMMU aware version of arm_dma_unmap_page()
* IOMMU aware version of arm_dma_unmap_phys()
*/
static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle,
static void arm_iommu_unmap_phys(struct device *dev, dma_addr_t handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
dma_addr_t iova = handle & PAGE_MASK;
struct page *page;
int offset = handle & ~PAGE_MASK;
int len = PAGE_ALIGN(size + offset);
if (!iova)
return;
if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
page = phys_to_page(iommu_iova_to_phys(mapping->domain, iova));
__dma_page_dev_to_cpu(page, offset, size, dir);
if (!dev->dma_coherent &&
!(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) {
phys_addr_t phys = iommu_iova_to_phys(mapping->domain, iova);
arch_sync_dma_for_cpu(phys + offset, size, dir);
}
iommu_unmap(mapping->domain, iova, len);
__free_iova(mapping, iova, len);
}
/**
* arm_iommu_map_resource - map a device resource for DMA
* @dev: valid struct device pointer
* @phys_addr: physical address of resource
* @size: size of resource to map
* @dir: DMA transfer direction
*/
static dma_addr_t arm_iommu_map_resource(struct device *dev,
phys_addr_t phys_addr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
dma_addr_t dma_addr;
int ret, prot;
phys_addr_t addr = phys_addr & PAGE_MASK;
unsigned int offset = phys_addr & ~PAGE_MASK;
size_t len = PAGE_ALIGN(size + offset);
dma_addr = __alloc_iova(mapping, len);
if (dma_addr == DMA_MAPPING_ERROR)
return dma_addr;
prot = __dma_info_to_prot(dir, attrs) | IOMMU_MMIO;
ret = iommu_map(mapping->domain, dma_addr, addr, len, prot, GFP_KERNEL);
if (ret < 0)
goto fail;
return dma_addr + offset;
fail:
__free_iova(mapping, dma_addr, len);
return DMA_MAPPING_ERROR;
}
/**
* arm_iommu_unmap_resource - unmap a device DMA resource
* @dev: valid struct device pointer
* @dma_handle: DMA address to resource
* @size: size of resource to map
* @dir: DMA transfer direction
*/
static void arm_iommu_unmap_resource(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
dma_addr_t iova = dma_handle & PAGE_MASK;
unsigned int offset = dma_handle & ~PAGE_MASK;
size_t len = PAGE_ALIGN(size + offset);
if (!iova)
return;
iommu_unmap(mapping->domain, iova, len);
__free_iova(mapping, iova, len);
}
static void arm_iommu_sync_single_for_cpu(struct device *dev,
dma_addr_t handle, size_t size, enum dma_data_direction dir)
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
dma_addr_t iova = handle & PAGE_MASK;
struct page *page;
unsigned int offset = handle & ~PAGE_MASK;
phys_addr_t phys;
if (dev->dma_coherent || !iova)
return;
page = phys_to_page(iommu_iova_to_phys(mapping->domain, iova));
__dma_page_dev_to_cpu(page, offset, size, dir);
phys = iommu_iova_to_phys(mapping->domain, iova);
arch_sync_dma_for_cpu(phys + offset, size, dir);
}
static void arm_iommu_sync_single_for_device(struct device *dev,
@ -1500,14 +1443,14 @@ static void arm_iommu_sync_single_for_device(struct device *dev,
{
struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev);
dma_addr_t iova = handle & PAGE_MASK;
struct page *page;
unsigned int offset = handle & ~PAGE_MASK;
phys_addr_t phys;
if (dev->dma_coherent || !iova)
return;
page = phys_to_page(iommu_iova_to_phys(mapping->domain, iova));
__dma_page_cpu_to_dev(page, offset, size, dir);
phys = iommu_iova_to_phys(mapping->domain, iova);
arch_sync_dma_for_device(phys + offset, size, dir);
}
static const struct dma_map_ops iommu_ops = {
@ -1516,8 +1459,8 @@ static const struct dma_map_ops iommu_ops = {
.mmap = arm_iommu_mmap_attrs,
.get_sgtable = arm_iommu_get_sgtable,
.map_page = arm_iommu_map_page,
.unmap_page = arm_iommu_unmap_page,
.map_phys = arm_iommu_map_phys,
.unmap_phys = arm_iommu_unmap_phys,
.sync_single_for_cpu = arm_iommu_sync_single_for_cpu,
.sync_single_for_device = arm_iommu_sync_single_for_device,
@ -1525,9 +1468,6 @@ static const struct dma_map_ops iommu_ops = {
.unmap_sg = arm_iommu_unmap_sg,
.sync_sg_for_cpu = arm_iommu_sync_sg_for_cpu,
.sync_sg_for_device = arm_iommu_sync_sg_for_device,
.map_resource = arm_iommu_map_resource,
.unmap_resource = arm_iommu_unmap_resource,
};
/**
@ -1794,20 +1734,6 @@ void arch_teardown_dma_ops(struct device *dev)
set_dma_ops(dev, NULL);
}
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
__dma_page_cpu_to_dev(phys_to_page(paddr), paddr & (PAGE_SIZE - 1),
size, dir);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
__dma_page_dev_to_cpu(phys_to_page(paddr), paddr & (PAGE_SIZE - 1),
size, dir);
}
void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs)
{

View File

@ -521,18 +521,24 @@ static void jazz_dma_free(struct device *dev, size_t size, void *vaddr,
__free_pages(virt_to_page(vaddr), get_order(size));
}
static dma_addr_t jazz_dma_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
static dma_addr_t jazz_dma_map_phys(struct device *dev, phys_addr_t phys,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
phys_addr_t phys = page_to_phys(page) + offset;
if (unlikely(attrs & DMA_ATTR_MMIO))
/*
* This check is included because older versions of the code lacked
* MMIO path support, and my ability to test this path is limited.
* However, from a software technical standpoint, there is no restriction,
* as the following code operates solely on physical addresses.
*/
return DMA_MAPPING_ERROR;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
arch_sync_dma_for_device(phys, size, dir);
return vdma_alloc(phys, size);
}
static void jazz_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
static void jazz_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
@ -607,8 +613,8 @@ static void jazz_dma_sync_sg_for_cpu(struct device *dev,
const struct dma_map_ops jazz_dma_ops = {
.alloc = jazz_dma_alloc,
.free = jazz_dma_free,
.map_page = jazz_dma_map_page,
.unmap_page = jazz_dma_unmap_page,
.map_phys = jazz_dma_map_phys,
.unmap_phys = jazz_dma_unmap_phys,
.map_sg = jazz_dma_map_sg,
.unmap_sg = jazz_dma_unmap_sg,
.sync_single_for_cpu = jazz_dma_sync_single_for_cpu,

View File

@ -5,4 +5,12 @@
/* crash kernel regions are page size aligned */
#define CRASH_ALIGN PAGE_SIZE
#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
static inline bool arch_add_crash_res_to_iomem(void)
{
return false;
}
#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem
#endif
#endif /* _ASM_POWERPC_CRASH_RESERVE_H */

View File

@ -274,12 +274,12 @@ extern void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
unsigned long mask, gfp_t flag, int node);
extern void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle);
extern dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
struct page *page, unsigned long offset,
size_t size, unsigned long mask,
extern dma_addr_t iommu_map_phys(struct device *dev, struct iommu_table *tbl,
phys_addr_t phys, size_t size,
unsigned long mask,
enum dma_data_direction direction,
unsigned long attrs);
extern void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
extern void iommu_unmap_phys(struct iommu_table *tbl, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction,
unsigned long attrs);

View File

@ -93,28 +93,26 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size,
/* Creates TCEs for a user provided buffer. The user buffer must be
* contiguous real kernel storage (not vmalloc). The address passed here
* comprises a page address and offset into that page. The dma_addr_t
* returned will point to the same byte within the page as was passed in.
* is a physical address to that page. The dma_addr_t returned will point
* to the same byte within the page as was passed in.
*/
static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
static dma_addr_t dma_iommu_map_phys(struct device *dev, phys_addr_t phys,
size_t size,
enum dma_data_direction direction,
unsigned long attrs)
{
return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
size, dma_get_mask(dev), direction, attrs);
return iommu_map_phys(dev, get_iommu_table_base(dev), phys, size,
dma_get_mask(dev), direction, attrs);
}
static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
static void dma_iommu_unmap_phys(struct device *dev, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction,
unsigned long attrs)
{
iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
iommu_unmap_phys(get_iommu_table_base(dev), dma_handle, size, direction,
attrs);
}
static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
int nelems, enum dma_data_direction direction,
unsigned long attrs)
@ -211,8 +209,8 @@ const struct dma_map_ops dma_iommu_ops = {
.map_sg = dma_iommu_map_sg,
.unmap_sg = dma_iommu_unmap_sg,
.dma_supported = dma_iommu_dma_supported,
.map_page = dma_iommu_map_page,
.unmap_page = dma_iommu_unmap_page,
.map_phys = dma_iommu_map_phys,
.unmap_phys = dma_iommu_unmap_phys,
.get_required_mask = dma_iommu_get_required_mask,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,

View File

@ -848,12 +848,12 @@ EXPORT_SYMBOL_GPL(iommu_tce_table_put);
/* Creates TCEs for a user provided buffer. The user buffer must be
* contiguous real kernel storage (not vmalloc). The address passed here
* comprises a page address and offset into that page. The dma_addr_t
* returned will point to the same byte within the page as was passed in.
* is a physical address within that page. The dma_addr_t returned will point
* to the same byte within the page as was passed in.
*/
dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
struct page *page, unsigned long offset, size_t size,
unsigned long mask, enum dma_data_direction direction,
dma_addr_t iommu_map_phys(struct device *dev, struct iommu_table *tbl,
phys_addr_t phys, size_t size, unsigned long mask,
enum dma_data_direction direction,
unsigned long attrs)
{
dma_addr_t dma_handle = DMA_MAPPING_ERROR;
@ -863,7 +863,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
BUG_ON(direction == DMA_NONE);
vaddr = page_address(page) + offset;
vaddr = phys_to_virt(phys);
uaddr = (unsigned long)vaddr;
if (tbl) {
@ -890,7 +890,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
return dma_handle;
}
void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
void iommu_unmap_phys(struct iommu_table *tbl, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction,
unsigned long attrs)
{

View File

@ -551,18 +551,20 @@ static void ps3_free_coherent(struct device *_dev, size_t size, void *vaddr,
/* Creates TCEs for a user provided buffer. The user buffer must be
* contiguous real kernel storage (not vmalloc). The address passed here
* comprises a page address and offset into that page. The dma_addr_t
* returned will point to the same byte within the page as was passed in.
* is a physical address within that page. The dma_addr_t returned will point
* to the same byte within the page as was passed in.
*/
static dma_addr_t ps3_sb_map_page(struct device *_dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction direction,
unsigned long attrs)
static dma_addr_t ps3_sb_map_phys(struct device *_dev, phys_addr_t phys,
size_t size, enum dma_data_direction direction, unsigned long attrs)
{
struct ps3_system_bus_device *dev = ps3_dev_to_system_bus_dev(_dev);
int result;
dma_addr_t bus_addr;
void *ptr = page_address(page) + offset;
void *ptr = phys_to_virt(phys);
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
result = ps3_dma_map(dev->d_region, (unsigned long)ptr, size,
&bus_addr,
@ -577,8 +579,8 @@ static dma_addr_t ps3_sb_map_page(struct device *_dev, struct page *page,
return bus_addr;
}
static dma_addr_t ps3_ioc0_map_page(struct device *_dev, struct page *page,
unsigned long offset, size_t size,
static dma_addr_t ps3_ioc0_map_phys(struct device *_dev, phys_addr_t phys,
size_t size,
enum dma_data_direction direction,
unsigned long attrs)
{
@ -586,7 +588,10 @@ static dma_addr_t ps3_ioc0_map_page(struct device *_dev, struct page *page,
int result;
dma_addr_t bus_addr;
u64 iopte_flag;
void *ptr = page_address(page) + offset;
void *ptr = phys_to_virt(phys);
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
iopte_flag = CBE_IOPTE_M;
switch (direction) {
@ -613,7 +618,7 @@ static dma_addr_t ps3_ioc0_map_page(struct device *_dev, struct page *page,
return bus_addr;
}
static void ps3_unmap_page(struct device *_dev, dma_addr_t dma_addr,
static void ps3_unmap_phys(struct device *_dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction direction, unsigned long attrs)
{
struct ps3_system_bus_device *dev = ps3_dev_to_system_bus_dev(_dev);
@ -690,8 +695,8 @@ static const struct dma_map_ops ps3_sb_dma_ops = {
.map_sg = ps3_sb_map_sg,
.unmap_sg = ps3_sb_unmap_sg,
.dma_supported = ps3_dma_supported,
.map_page = ps3_sb_map_page,
.unmap_page = ps3_unmap_page,
.map_phys = ps3_sb_map_phys,
.unmap_phys = ps3_unmap_phys,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
.alloc_pages_op = dma_common_alloc_pages,
@ -704,8 +709,8 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = {
.map_sg = ps3_ioc0_map_sg,
.unmap_sg = ps3_ioc0_unmap_sg,
.dma_supported = ps3_dma_supported,
.map_page = ps3_ioc0_map_page,
.unmap_page = ps3_unmap_page,
.map_phys = ps3_ioc0_map_phys,
.unmap_phys = ps3_unmap_phys,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
.alloc_pages_op = dma_common_alloc_pages,

View File

@ -86,17 +86,18 @@ static void ibmebus_free_coherent(struct device *dev,
kfree(vaddr);
}
static dma_addr_t ibmebus_map_page(struct device *dev,
struct page *page,
unsigned long offset,
static dma_addr_t ibmebus_map_phys(struct device *dev, phys_addr_t phys,
size_t size,
enum dma_data_direction direction,
unsigned long attrs)
{
return (dma_addr_t)(page_address(page) + offset);
if (attrs & DMA_ATTR_MMIO)
return DMA_MAPPING_ERROR;
return (dma_addr_t)(phys_to_virt(phys));
}
static void ibmebus_unmap_page(struct device *dev,
static void ibmebus_unmap_phys(struct device *dev,
dma_addr_t dma_addr,
size_t size,
enum dma_data_direction direction,
@ -146,8 +147,8 @@ static const struct dma_map_ops ibmebus_dma_ops = {
.unmap_sg = ibmebus_unmap_sg,
.dma_supported = ibmebus_dma_supported,
.get_required_mask = ibmebus_dma_get_required_mask,
.map_page = ibmebus_map_page,
.unmap_page = ibmebus_unmap_page,
.map_phys = ibmebus_map_phys,
.unmap_phys = ibmebus_unmap_phys,
};
static int ibmebus_match_path(struct device *dev, const void *data)

View File

@ -512,18 +512,21 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
}
static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction direction,
unsigned long attrs)
static dma_addr_t vio_dma_iommu_map_phys(struct device *dev, phys_addr_t phys,
size_t size,
enum dma_data_direction direction,
unsigned long attrs)
{
struct vio_dev *viodev = to_vio_dev(dev);
struct iommu_table *tbl = get_iommu_table_base(dev);
dma_addr_t ret = DMA_MAPPING_ERROR;
if (unlikely(attrs & DMA_ATTR_MMIO))
return ret;
if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))))
goto out_fail;
ret = iommu_map_page(dev, tbl, page, offset, size, dma_get_mask(dev),
ret = iommu_map_phys(dev, tbl, phys, size, dma_get_mask(dev),
direction, attrs);
if (unlikely(ret == DMA_MAPPING_ERROR))
goto out_deallocate;
@ -536,7 +539,7 @@ out_fail:
return DMA_MAPPING_ERROR;
}
static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
static void vio_dma_iommu_unmap_phys(struct device *dev, dma_addr_t dma_handle,
size_t size,
enum dma_data_direction direction,
unsigned long attrs)
@ -544,7 +547,7 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
struct vio_dev *viodev = to_vio_dev(dev);
struct iommu_table *tbl = get_iommu_table_base(dev);
iommu_unmap_page(tbl, dma_handle, size, direction, attrs);
iommu_unmap_phys(tbl, dma_handle, size, direction, attrs);
vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
}
@ -605,8 +608,8 @@ static const struct dma_map_ops vio_dma_mapping_ops = {
.free = vio_dma_iommu_free_coherent,
.map_sg = vio_dma_iommu_map_sg,
.unmap_sg = vio_dma_iommu_unmap_sg,
.map_page = vio_dma_iommu_map_page,
.unmap_page = vio_dma_iommu_unmap_page,
.map_phys = vio_dma_iommu_map_phys,
.unmap_phys = vio_dma_iommu_unmap_phys,
.dma_supported = dma_iommu_dma_supported,
.get_required_mask = dma_iommu_get_required_mask,
.mmap = dma_common_mmap,

View File

@ -260,26 +260,35 @@ static void dma_4u_free_coherent(struct device *dev, size_t size,
free_pages((unsigned long)cpu, order);
}
static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t sz,
enum dma_data_direction direction,
static dma_addr_t dma_4u_map_phys(struct device *dev, phys_addr_t phys,
size_t sz, enum dma_data_direction direction,
unsigned long attrs)
{
struct iommu *iommu;
struct strbuf *strbuf;
iopte_t *base;
unsigned long flags, npages, oaddr;
unsigned long i, base_paddr, ctx;
unsigned long i, ctx;
u32 bus_addr, ret;
unsigned long iopte_protection;
if (unlikely(attrs & DMA_ATTR_MMIO))
/*
* This check is included because older versions of the code
* lacked MMIO path support, and my ability to test this path
* is limited. However, from a software technical standpoint,
* there is no restriction, as the following code operates
* solely on physical addresses.
*/
goto bad_no_ctx;
iommu = dev->archdata.iommu;
strbuf = dev->archdata.stc;
if (unlikely(direction == DMA_NONE))
goto bad_no_ctx;
oaddr = (unsigned long)(page_address(page) + offset);
oaddr = (unsigned long)(phys_to_virt(phys));
npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
npages >>= IO_PAGE_SHIFT;
@ -296,7 +305,6 @@ static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
bus_addr = (iommu->tbl.table_map_base +
((base - iommu->page_table) << IO_PAGE_SHIFT));
ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
base_paddr = __pa(oaddr & IO_PAGE_MASK);
if (strbuf->strbuf_enabled)
iopte_protection = IOPTE_STREAMING(ctx);
else
@ -304,8 +312,8 @@ static dma_addr_t dma_4u_map_page(struct device *dev, struct page *page,
if (direction != DMA_TO_DEVICE)
iopte_protection |= IOPTE_WRITE;
for (i = 0; i < npages; i++, base++, base_paddr += IO_PAGE_SIZE)
iopte_val(*base) = iopte_protection | base_paddr;
for (i = 0; i < npages; i++, base++, phys += IO_PAGE_SIZE)
iopte_val(*base) = iopte_protection | phys;
return ret;
@ -383,7 +391,7 @@ do_flush_sync:
vaddr, ctx, npages);
}
static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
static void dma_4u_unmap_phys(struct device *dev, dma_addr_t bus_addr,
size_t sz, enum dma_data_direction direction,
unsigned long attrs)
{
@ -753,8 +761,8 @@ static int dma_4u_supported(struct device *dev, u64 device_mask)
static const struct dma_map_ops sun4u_dma_ops = {
.alloc = dma_4u_alloc_coherent,
.free = dma_4u_free_coherent,
.map_page = dma_4u_map_page,
.unmap_page = dma_4u_unmap_page,
.map_phys = dma_4u_map_phys,
.unmap_phys = dma_4u_unmap_phys,
.map_sg = dma_4u_map_sg,
.unmap_sg = dma_4u_unmap_sg,
.sync_single_for_cpu = dma_4u_sync_single_for_cpu,

View File

@ -352,9 +352,8 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu,
free_pages((unsigned long)cpu, order);
}
static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t sz,
enum dma_data_direction direction,
static dma_addr_t dma_4v_map_phys(struct device *dev, phys_addr_t phys,
size_t sz, enum dma_data_direction direction,
unsigned long attrs)
{
struct iommu *iommu;
@ -362,18 +361,27 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
struct iommu_map_table *tbl;
u64 mask;
unsigned long flags, npages, oaddr;
unsigned long i, base_paddr;
unsigned long prot;
unsigned long i, prot;
dma_addr_t bus_addr, ret;
long entry;
if (unlikely(attrs & DMA_ATTR_MMIO))
/*
* This check is included because older versions of the code
* lacked MMIO path support, and my ability to test this path
* is limited. However, from a software technical standpoint,
* there is no restriction, as the following code operates
* solely on physical addresses.
*/
goto bad;
iommu = dev->archdata.iommu;
atu = iommu->atu;
if (unlikely(direction == DMA_NONE))
goto bad;
oaddr = (unsigned long)(page_address(page) + offset);
oaddr = (unsigned long)(phys_to_virt(phys));
npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
npages >>= IO_PAGE_SHIFT;
@ -391,7 +399,6 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
bus_addr = (tbl->table_map_base + (entry << IO_PAGE_SHIFT));
ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
base_paddr = __pa(oaddr & IO_PAGE_MASK);
prot = HV_PCI_MAP_ATTR_READ;
if (direction != DMA_TO_DEVICE)
prot |= HV_PCI_MAP_ATTR_WRITE;
@ -403,8 +410,8 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page,
iommu_batch_start(dev, prot, entry);
for (i = 0; i < npages; i++, base_paddr += IO_PAGE_SIZE) {
long err = iommu_batch_add(base_paddr, mask);
for (i = 0; i < npages; i++, phys += IO_PAGE_SIZE) {
long err = iommu_batch_add(phys, mask);
if (unlikely(err < 0L))
goto iommu_map_fail;
}
@ -426,7 +433,7 @@ iommu_map_fail:
return DMA_MAPPING_ERROR;
}
static void dma_4v_unmap_page(struct device *dev, dma_addr_t bus_addr,
static void dma_4v_unmap_phys(struct device *dev, dma_addr_t bus_addr,
size_t sz, enum dma_data_direction direction,
unsigned long attrs)
{
@ -686,8 +693,8 @@ static int dma_4v_supported(struct device *dev, u64 device_mask)
static const struct dma_map_ops sun4v_dma_ops = {
.alloc = dma_4v_alloc_coherent,
.free = dma_4v_free_coherent,
.map_page = dma_4v_map_page,
.unmap_page = dma_4v_unmap_page,
.map_phys = dma_4v_map_phys,
.unmap_phys = dma_4v_unmap_phys,
.map_sg = dma_4v_map_sg,
.unmap_sg = dma_4v_unmap_sg,
.dma_supported = dma_4v_supported,

View File

@ -94,13 +94,14 @@ static int __init iounit_init(void)
subsys_initcall(iounit_init);
/* One has to hold iounit->lock to call this */
static unsigned long iounit_get_area(struct iounit_struct *iounit, unsigned long vaddr, int size)
static dma_addr_t iounit_get_area(struct iounit_struct *iounit,
phys_addr_t phys, int size)
{
int i, j, k, npages;
unsigned long rotor, scan, limit;
iopte_t iopte;
npages = ((vaddr & ~PAGE_MASK) + size + (PAGE_SIZE-1)) >> PAGE_SHIFT;
npages = (offset_in_page(phys) + size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
/* A tiny bit of magic ingredience :) */
switch (npages) {
@ -109,7 +110,7 @@ static unsigned long iounit_get_area(struct iounit_struct *iounit, unsigned long
default: i = 0x0213; break;
}
IOD(("iounit_get_area(%08lx,%d[%d])=", vaddr, size, npages));
IOD(("%s(%pa,%d[%d])=", __func__, &phys, size, npages));
next: j = (i & 15);
rotor = iounit->rotor[j - 1];
@ -124,7 +125,8 @@ nexti: scan = find_next_zero_bit(iounit->bmap, limit, scan);
}
i >>= 4;
if (!(i & 15))
panic("iounit_get_area: Couldn't find free iopte slots for (%08lx,%d)\n", vaddr, size);
panic("iounit_get_area: Couldn't find free iopte slots for (%pa,%d)\n",
&phys, size);
goto next;
}
for (k = 1, scan++; k < npages; k++)
@ -132,30 +134,29 @@ nexti: scan = find_next_zero_bit(iounit->bmap, limit, scan);
goto nexti;
iounit->rotor[j - 1] = (scan < limit) ? scan : iounit->limit[j - 1];
scan -= npages;
iopte = MKIOPTE(__pa(vaddr & PAGE_MASK));
vaddr = IOUNIT_DMA_BASE + (scan << PAGE_SHIFT) + (vaddr & ~PAGE_MASK);
iopte = MKIOPTE(phys & PAGE_MASK);
phys = IOUNIT_DMA_BASE + (scan << PAGE_SHIFT) + offset_in_page(phys);
for (k = 0; k < npages; k++, iopte = __iopte(iopte_val(iopte) + 0x100), scan++) {
set_bit(scan, iounit->bmap);
sbus_writel(iopte_val(iopte), &iounit->page_table[scan]);
}
IOD(("%08lx\n", vaddr));
return vaddr;
IOD(("%pa\n", &phys));
return phys;
}
static dma_addr_t iounit_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t len, enum dma_data_direction dir,
unsigned long attrs)
static dma_addr_t iounit_map_phys(struct device *dev, phys_addr_t phys,
size_t len, enum dma_data_direction dir, unsigned long attrs)
{
void *vaddr = page_address(page) + offset;
struct iounit_struct *iounit = dev->archdata.iommu;
unsigned long ret, flags;
unsigned long flags;
dma_addr_t ret;
/* XXX So what is maxphys for us and how do drivers know it? */
if (!len || len > 256 * 1024)
return DMA_MAPPING_ERROR;
spin_lock_irqsave(&iounit->lock, flags);
ret = iounit_get_area(iounit, (unsigned long)vaddr, len);
ret = iounit_get_area(iounit, phys, len);
spin_unlock_irqrestore(&iounit->lock, flags);
return ret;
}
@ -171,14 +172,15 @@ static int iounit_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
/* FIXME: Cache some resolved pages - often several sg entries are to the same page */
spin_lock_irqsave(&iounit->lock, flags);
for_each_sg(sgl, sg, nents, i) {
sg->dma_address = iounit_get_area(iounit, (unsigned long) sg_virt(sg), sg->length);
sg->dma_address =
iounit_get_area(iounit, sg_phys(sg), sg->length);
sg->dma_length = sg->length;
}
spin_unlock_irqrestore(&iounit->lock, flags);
return nents;
}
static void iounit_unmap_page(struct device *dev, dma_addr_t vaddr, size_t len,
static void iounit_unmap_phys(struct device *dev, dma_addr_t vaddr, size_t len,
enum dma_data_direction dir, unsigned long attrs)
{
struct iounit_struct *iounit = dev->archdata.iommu;
@ -279,8 +281,8 @@ static const struct dma_map_ops iounit_dma_ops = {
.alloc = iounit_alloc,
.free = iounit_free,
#endif
.map_page = iounit_map_page,
.unmap_page = iounit_unmap_page,
.map_phys = iounit_map_phys,
.unmap_phys = iounit_unmap_phys,
.map_sg = iounit_map_sg,
.unmap_sg = iounit_unmap_sg,
};

View File

@ -181,18 +181,20 @@ static void iommu_flush_iotlb(iopte_t *iopte, unsigned int niopte)
}
}
static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t len, bool per_page_flush)
static dma_addr_t __sbus_iommu_map_phys(struct device *dev, phys_addr_t paddr,
size_t len, bool per_page_flush, unsigned long attrs)
{
struct iommu_struct *iommu = dev->archdata.iommu;
phys_addr_t paddr = page_to_phys(page) + offset;
unsigned long off = paddr & ~PAGE_MASK;
unsigned long off = offset_in_page(paddr);
unsigned long npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long pfn = __phys_to_pfn(paddr);
unsigned int busa, busa0;
iopte_t *iopte, *iopte0;
int ioptex, i;
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
/* XXX So what is maxphys for us and how do drivers know it? */
if (!len || len > 256 * 1024)
return DMA_MAPPING_ERROR;
@ -202,10 +204,10 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page,
* XXX Is this a good assumption?
* XXX What if someone else unmaps it here and races us?
*/
if (per_page_flush && !PageHighMem(page)) {
if (per_page_flush && !PhysHighMem(paddr)) {
unsigned long vaddr, p;
vaddr = (unsigned long)page_address(page) + offset;
vaddr = (unsigned long)phys_to_virt(paddr);
for (p = vaddr & PAGE_MASK; p < vaddr + len; p += PAGE_SIZE)
flush_page_for_dma(p);
}
@ -231,19 +233,19 @@ static dma_addr_t __sbus_iommu_map_page(struct device *dev, struct page *page,
return busa0 + off;
}
static dma_addr_t sbus_iommu_map_page_gflush(struct device *dev,
struct page *page, unsigned long offset, size_t len,
enum dma_data_direction dir, unsigned long attrs)
static dma_addr_t sbus_iommu_map_phys_gflush(struct device *dev,
phys_addr_t phys, size_t len, enum dma_data_direction dir,
unsigned long attrs)
{
flush_page_for_dma(0);
return __sbus_iommu_map_page(dev, page, offset, len, false);
return __sbus_iommu_map_phys(dev, phys, len, false, attrs);
}
static dma_addr_t sbus_iommu_map_page_pflush(struct device *dev,
struct page *page, unsigned long offset, size_t len,
enum dma_data_direction dir, unsigned long attrs)
static dma_addr_t sbus_iommu_map_phys_pflush(struct device *dev,
phys_addr_t phys, size_t len, enum dma_data_direction dir,
unsigned long attrs)
{
return __sbus_iommu_map_page(dev, page, offset, len, true);
return __sbus_iommu_map_phys(dev, phys, len, true, attrs);
}
static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl,
@ -254,8 +256,8 @@ static int __sbus_iommu_map_sg(struct device *dev, struct scatterlist *sgl,
int j;
for_each_sg(sgl, sg, nents, j) {
sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg),
sg->offset, sg->length, per_page_flush);
sg->dma_address = __sbus_iommu_map_phys(dev, sg_phys(sg),
sg->length, per_page_flush, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
return -EIO;
sg->dma_length = sg->length;
@ -277,7 +279,7 @@ static int sbus_iommu_map_sg_pflush(struct device *dev, struct scatterlist *sgl,
return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, true);
}
static void sbus_iommu_unmap_page(struct device *dev, dma_addr_t dma_addr,
static void sbus_iommu_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t len, enum dma_data_direction dir, unsigned long attrs)
{
struct iommu_struct *iommu = dev->archdata.iommu;
@ -303,7 +305,7 @@ static void sbus_iommu_unmap_sg(struct device *dev, struct scatterlist *sgl,
int i;
for_each_sg(sgl, sg, nents, i) {
sbus_iommu_unmap_page(dev, sg->dma_address, sg->length, dir,
sbus_iommu_unmap_phys(dev, sg->dma_address, sg->length, dir,
attrs);
sg->dma_address = 0x21212121;
}
@ -426,8 +428,8 @@ static const struct dma_map_ops sbus_iommu_dma_gflush_ops = {
.alloc = sbus_iommu_alloc,
.free = sbus_iommu_free,
#endif
.map_page = sbus_iommu_map_page_gflush,
.unmap_page = sbus_iommu_unmap_page,
.map_phys = sbus_iommu_map_phys_gflush,
.unmap_phys = sbus_iommu_unmap_phys,
.map_sg = sbus_iommu_map_sg_gflush,
.unmap_sg = sbus_iommu_unmap_sg,
};
@ -437,8 +439,8 @@ static const struct dma_map_ops sbus_iommu_dma_pflush_ops = {
.alloc = sbus_iommu_alloc,
.free = sbus_iommu_free,
#endif
.map_page = sbus_iommu_map_page_pflush,
.unmap_page = sbus_iommu_unmap_page,
.map_phys = sbus_iommu_map_phys_pflush,
.unmap_phys = sbus_iommu_unmap_phys,
.map_sg = sbus_iommu_map_sg_pflush,
.unmap_sg = sbus_iommu_unmap_sg,
};

View File

@ -5,7 +5,6 @@ generic-y += device.h
generic-y += dma-mapping.h
generic-y += emergency-restart.h
generic-y += exec.h
generic-y += extable.h
generic-y += ftrace.h
generic-y += hw_irq.h
generic-y += irq_regs.h

View File

@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <asm/mman.h>
#include <asm/seccomp.h>
#include <asm/extable.h>
/* workaround for a warning with -Wmissing-prototypes */
void foo(void);
@ -42,4 +43,7 @@ void foo(void)
DEFINE(HOSTFS_ATTR_CTIME, ATTR_CTIME);
DEFINE(HOSTFS_ATTR_ATIME_SET, ATTR_ATIME_SET);
DEFINE(HOSTFS_ATTR_MTIME_SET, ATTR_MTIME_SET);
DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
}

View File

@ -197,8 +197,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
"773:\n"
#define ALTINSTR_ENTRY(ft_flags) \
".pushsection .altinstructions,\"a\"\n" \
ANNOTATE_DATA_SPECIAL \
".pushsection .altinstructions, \"aM\", @progbits, " \
__stringify(ALT_INSTR_SIZE) "\n" \
" .long 771b - .\n" /* label */ \
" .long 774f - .\n" /* new instruction */ \
" .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
@ -208,7 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
".pushsection .altinstr_replacement, \"ax\"\n" \
ANNOTATE_DATA_SPECIAL \
ANNOTATE_DATA_SPECIAL "\n" \
"# ALT: replacement\n" \
"774:\n\t" newinstr "\n775:\n" \
".popsection\n"
@ -339,7 +339,6 @@ void nop_func(void);
* instruction. See apply_alternatives().
*/
.macro altinstr_entry orig alt ft_flags orig_len alt_len
ANNOTATE_DATA_SPECIAL
.long \orig - .
.long \alt - .
.4byte \ft_flags
@ -363,7 +362,7 @@ void nop_func(void);
741: \
.skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\
742: \
.pushsection .altinstructions,"a" ; \
.pushsection .altinstructions, "aM", @progbits, ALT_INSTR_SIZE ;\
altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \
.popsection ; \
.pushsection .altinstr_replacement,"ax" ; \

View File

@ -126,18 +126,21 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
#ifdef __KERNEL__
#ifndef COMPILE_OFFSETS
#include <asm/asm-offsets.h>
#endif
# include <asm/extable_fixup_types.h>
/* Exception table entry */
#ifdef __ASSEMBLER__
# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table","a" ; \
.balign 4 ; \
ANNOTATE_DATA_SPECIAL ; \
.long (from) - . ; \
.long (to) - . ; \
.long type ; \
# define _ASM_EXTABLE_TYPE(from, to, type) \
.pushsection "__ex_table", "aM", @progbits, EXTABLE_SIZE ; \
.balign 4 ; \
.long (from) - . ; \
.long (to) - . ; \
.long type ; \
.popsection
# ifdef CONFIG_KPROBES
@ -180,18 +183,18 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
".purgem extable_type_reg\n"
# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .pushsection __ex_table, \"aM\", @progbits, " \
__stringify(EXTABLE_SIZE) "\n" \
" .balign 4\n" \
ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
" .long " __stringify(type) " \n" \
" .popsection\n"
# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .pushsection __ex_table, \"aM\", @progbits, " \
__stringify(EXTABLE_SIZE) "\n" \
" .balign 4\n" \
ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
DEFINE_EXTABLE_TYPE_REG \

View File

@ -70,7 +70,7 @@ extern void __WARN_trap(struct bug_entry *bug, ...);
#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \
".pushsection __bug_table,\"aw\"\n\t" \
ANNOTATE_DATA_SPECIAL \
ANNOTATE_DATA_SPECIAL "\n\t" \
"2:\n\t" \
__BUG_ENTRY(format, file, line, flags) \
"\t.org 2b + " size "\n" \

View File

@ -101,7 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
ANNOTATE_DATA_SPECIAL
ANNOTATE_DATA_SPECIAL "\n"
" testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"

View File

@ -2,6 +2,8 @@
#ifndef _ASM_X86_CPUMASK_H
#define _ASM_X86_CPUMASK_H
#ifndef __ASSEMBLER__
#include <linux/compiler.h>
#include <linux/cpumask.h>
extern void setup_cpu_local_masks(void);

View File

@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
}
#define div_u64_rem div_u64_rem
/*
* gcc tends to zero extend 32bit values and do full 64bit maths.
* Define asm functions that avoid this.
* (clang generates better code for the C versions.)
*/
#ifndef __clang__
static inline u64 mul_u32_u32(u32 a, u32 b)
{
u32 high, low;
@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
}
#define mul_u32_u32 mul_u32_u32
/*
 * add_u64_u32 - add a 32-bit value to a 64-bit value.
 * @a: 64-bit addend, handled as two 32-bit halves
 * @b: 32-bit addend
 *
 * Returns the two halves recombined into a u64 after the addition.
 * Sits inside the surrounding #ifndef __clang__ section, so it is only
 * built for non-clang compilers.
 */
static inline u64 add_u64_u32(u64 a, u32 b)
{
u32 high = a >> 32, low = a;
/* addl adds b into the low half; adcl $0 folds the resulting carry into the high half. */
asm ("addl %[b], %[low]; adcl $0, %[high]"
: [low] "+r" (low), [high] "+r" (high)
: [b] "rm" (b) );
return low | (u64)high << 32;
}
/* NOTE(review): self-define presumably lets generic headers detect this override - confirm. */
#define add_u64_u32 add_u64_u32
#endif
/*
* __div64_32() is never called on x86, so prevent the
* generic definition from getting built.
@ -84,21 +103,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
* Will generate an #DE when the result doesn't fit u64, could fix with an
* __ex_table[] entry when it becomes an issue.
*/
static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
{
u64 q;
u64 rdx;
asm ("mulq %2; divq %3" : "=a" (q)
: "a" (a), "rm" (mul), "rm" (div)
: "rdx");
asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
return q;
if (!statically_true(!add))
asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
[lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
return rax;
}
#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
return mul_u64_u64_div_u64(a, mul, div);
return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div

View File

@ -101,7 +101,7 @@
#define ASM_CALL_ARG0 \
"1: call %c[__func] \n" \
ANNOTATE_REACHABLE(1b)
ANNOTATE_REACHABLE(1b) " \n"
#define ASM_CALL_ARG1 \
"movq %[arg1], %%rdi \n" \

View File

@ -15,7 +15,7 @@
#define JUMP_TABLE_ENTRY(key, label) \
".pushsection __jump_table, \"aw\" \n\t" \
_ASM_ALIGN "\n\t" \
ANNOTATE_DATA_SPECIAL \
ANNOTATE_DATA_SPECIAL "\n" \
".long 1b - . \n\t" \
".long " label " - . \n\t" \
_ASM_PTR " " key " - . \n\t" \

View File

@ -466,7 +466,7 @@ static inline void call_depth_return_thunk(void) {}
*/
# define CALL_NOSPEC \
ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
ANNOTATE_RETPOLINE_SAFE "\n" \
"call *%[thunk_target]\n", \
" jmp 904f;\n" \
" .align 16\n" \
@ -482,7 +482,7 @@ static inline void call_depth_return_thunk(void) {}
"904: call 901b;\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
ANNOTATE_RETPOLINE_SAFE "\n" \
"call *%[thunk_target]\n", \
X86_FEATURE_RETPOLINE_LFENCE)

View File

@ -249,7 +249,7 @@ extern struct paravirt_patch_template pv_ops;
* don't need to bother with CFI prefixes.
*/
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
ANNOTATE_RETPOLINE_SAFE "\n\t" \
"call *%[paravirt_opptr];"
/*

View File

@ -77,7 +77,7 @@ static __always_inline unsigned long smap_save(void)
unsigned long flags;
asm volatile ("# smap_save\n\t"
ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
"", "pushf; pop %0; clac",
X86_FEATURE_SMAP)
: "=rm" (flags) : : "memory", "cc");
@ -88,7 +88,7 @@ static __always_inline unsigned long smap_save(void)
static __always_inline void smap_restore(unsigned long flags)
{
asm volatile ("# smap_restore\n\t"
ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
"", "push %0; popf",
X86_FEATURE_SMAP)
: : "g" (flags) : "memory", "cc");
@ -101,9 +101,9 @@ static __always_inline void smap_restore(unsigned long flags)
ALTERNATIVE("", "stac", X86_FEATURE_SMAP)
#define ASM_CLAC_UNSAFE \
ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP)
ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "clac", X86_FEATURE_SMAP)
#define ASM_STAC_UNSAFE \
ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP)
ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "stac", X86_FEATURE_SMAP)
#endif /* __ASSEMBLER__ */

View File

@ -36,7 +36,7 @@
".align 4 \n" \
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
ANNOTATE_NOENDBR \
ANNOTATE_NOENDBR " \n" \
insns " \n" \
".byte 0x0f, 0xb9, 0xcc \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \

View File

@ -79,7 +79,7 @@ struct x86_init_paging {
/**
* struct x86_init_timers - platform specific timer setup
* @setup_perpcu_clockev: set up the per cpu clock event device for the
* @setup_percpu_clockev: set up the per cpu clock event device for the
* boot cpu
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
@ -132,7 +132,7 @@ struct x86_hyper_init {
/**
* struct x86_init_acpi - x86 ACPI init functions
* @set_root_poitner: set RSDP address
* @set_root_pointer: set RSDP address
* @get_root_pointer: get RSDP address
* @reduced_hw_early_init: hardware reduced platform early init
*/
@ -145,14 +145,14 @@ struct x86_init_acpi {
/**
* struct x86_guest - Functions used by misc guest incarnations like SEV, TDX, etc.
*
* @enc_status_change_prepare Notify HV before the encryption status of a range is changed
* @enc_status_change_finish Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
* @enc_kexec_begin Begin the two-step process of converting shared memory back
* @enc_status_change_prepare: Notify HV before the encryption status of a range is changed
* @enc_status_change_finish: Notify HV after the encryption status of a range is changed
* @enc_tlb_flush_required: Returns true if a TLB flush is needed before changing page encryption status
* @enc_cache_flush_required: Returns true if a cache flush is needed before changing page encryption status
* @enc_kexec_begin: Begin the two-step process of converting shared memory back
* to private. It stops new conversions from being started
* and waits for in-flight conversions to finish, if possible.
* @enc_kexec_finish Finish the two-step process of converting shared memory to
* @enc_kexec_finish: Finish the two-step process of converting shared memory to
* private. All memory is private after the call when
* the function returns.
* It is called on only one CPU while the others are shut down
@ -229,7 +229,7 @@ struct x86_legacy_devices {
* given platform/subarch.
* @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller
* is absent.
* @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be
* @X86_LEGACY_I8042_EXPECTED_PRESENT: the controller is likely to be
* present, the i8042 driver should probe for controller existence.
*/
enum x86_legacy_i8042_state {
@ -244,6 +244,8 @@ enum x86_legacy_i8042_state {
* @i8042: indicates whether we expect the device to have an i8042 controller
* present.
* @rtc: this device has a CMOS real-time clock present
* @warm_reset: 1 if platform allows warm reset, else 0
* @no_vga: 1 if (FADT.boot_flags & ACPI_FADT_NO_VGA) is set, else 0
* @reserve_bios_regions: boot code will search for the EBDA address and the
* start of the 640k - 1M BIOS region. If false, the platform must
* ensure that its memory map correctly reserves sub-1MB regions as needed.
@ -290,9 +292,10 @@ struct x86_hyper_runtime {
* @calibrate_tsc: calibrate TSC, if different from CPU
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
* @is_untracked_pat_range exclude from PAT logic
* @nmi_init enable NMI on cpus
* @get_nmi_reason get the reason an NMI was received
* @iommu_shutdown: set by an IOMMU driver for shutdown if necessary
* @is_untracked_pat_range: exclude from PAT logic
* @nmi_init: enable NMI on cpus
* @get_nmi_reason: get the reason an NMI was received
* @save_sched_clock_state: save state for sched_clock() on suspend
* @restore_sched_clock_state: restore state for sched_clock() on resume
* @apic_post_init: adjust apic if needed
@ -307,6 +310,7 @@ struct x86_hyper_runtime {
* @realmode_reserve: reserve memory for realmode trampoline
* @realmode_init: initialize realmode trampoline
* @hyper: x86 hypervisor specific runtime callbacks
* @guest: guest incarnations callbacks
*/
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);

View File

@ -2229,7 +2229,7 @@ asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_selftest_asm, @function\n"
"int3_selftest_asm:\n"
ANNOTATE_NOENDBR
ANNOTATE_NOENDBR "\n"
/*
* INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
* exception, then the int3_exception_nb notifier emulates a call to
@ -2247,7 +2247,7 @@ asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_selftest_callee, @function\n"
"int3_selftest_callee:\n"
ANNOTATE_NOENDBR
ANNOTATE_NOENDBR "\n"
" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
" .size int3_selftest_callee, . - int3_selftest_callee\n"

View File

@ -222,13 +222,14 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
}
/* Map a single area into the IOMMU */
static dma_addr_t gart_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction dir,
static dma_addr_t gart_map_phys(struct device *dev, phys_addr_t paddr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
unsigned long bus;
phys_addr_t paddr = page_to_phys(page) + offset;
if (unlikely(attrs & DMA_ATTR_MMIO))
return DMA_MAPPING_ERROR;
if (!need_iommu(dev, paddr, size))
return paddr;
@ -242,7 +243,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
/*
* Free a DMA mapping.
*/
static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
static void gart_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@ -282,7 +283,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0);
gart_unmap_phys(dev, s->dma_address, s->dma_length, dir, 0);
}
}
@ -487,7 +488,7 @@ static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, unsigned long attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
gart_unmap_phys(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
dma_direct_free(dev, size, vaddr, dma_addr, attrs);
}
@ -672,8 +673,8 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
static const struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
.map_page = gart_map_page,
.unmap_page = gart_unmap_page,
.map_phys = gart_map_phys,
.unmap_phys = gart_unmap_phys,
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
.mmap = dma_common_mmap,

View File

@ -124,4 +124,7 @@ static void __used common(void)
OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
#endif
BLANK();
DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
}

View File

@ -45,6 +45,7 @@
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>
#include <linux/vmcore_info.h>
#include <asm/fred.h>
#include <asm/cpu_device_id.h>
@ -1729,6 +1730,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
out:
/* Given it didn't panic, mark it as recoverable */
hwerr_log_error_type(HWERR_RECOV_OTHERS);
instrumentation_end();
clear:

View File

@ -25,7 +25,7 @@ asm(
".type arch_rethook_trampoline, @function\n"
"arch_rethook_trampoline:\n"
#ifdef CONFIG_X86_64
ANNOTATE_NOENDBR /* This is only jumped from ret instruction */
ANNOTATE_NOENDBR "\n" /* This is only jumped from ret instruction */
/* Push a fake return address to tell the unwinder it's a rethook. */
" pushq $arch_rethook_trampoline\n"
UNWIND_HINT_FUNC

View File

@ -50,8 +50,8 @@ asm (".global __static_call_return\n\t"
".type __static_call_return, @function\n\t"
ASM_FUNC_ALIGN "\n\t"
"__static_call_return:\n\t"
ANNOTATE_NOENDBR
ANNOTATE_RETPOLINE_SAFE
ANNOTATE_NOENDBR "\n\t"
ANNOTATE_RETPOLINE_SAFE "\n\t"
"ret; int3\n\t"
".size __static_call_return, . - __static_call_return \n\t");

View File

@ -13,7 +13,7 @@ asm(
".globl just_return_func\n"
ASM_FUNC_ALIGN
"just_return_func:\n"
ANNOTATE_NOENDBR
ANNOTATE_NOENDBR "\n"
ASM_RET
".size just_return_func, .-just_return_func\n"
);

View File

@ -160,7 +160,7 @@ obj-$(CONFIG_RPMSG) += rpmsg/
obj-$(CONFIG_SOUNDWIRE) += soundwire/
# Virtualization drivers
obj-$(CONFIG_VIRT_DRIVERS) += virt/
obj-y += virt/
obj-$(CONFIG_HYPERV) += hv/
obj-$(CONFIG_PM_DEVFREQ) += devfreq/

View File

@ -44,6 +44,7 @@
#include <linux/uuid.h>
#include <linux/ras.h>
#include <linux/task_work.h>
#include <linux/vmcore_info.h>
#include <acpi/actbl1.h>
#include <acpi/ghes.h>
@ -864,6 +865,40 @@ int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
}
EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, "CXL");
/*
 * Report a recoverable hardware error through hwerr_log_error_type(),
 * choosing the HWERR_RECOV_* class from the CPER section type GUID.
 *
 * @sev:      CPER severity; anything other than CPER_SEV_RECOVERABLE is ignored.
 * @sec_type: section type GUID that is matched against the known CPER sections.
 *
 * Section types that match none of the CPU, CXL, PCI or memory groups are
 * reported as HWERR_RECOV_OTHERS.
 */
static void ghes_log_hwerr(int sev, guid_t *sec_type)
{
	if (sev != CPER_SEV_RECOVERABLE)
		return;

	if (guid_equal(sec_type, &CPER_SEC_PROC_ARM) ||
	    guid_equal(sec_type, &CPER_SEC_PROC_GENERIC) ||
	    guid_equal(sec_type, &CPER_SEC_PROC_IA))
		/* Processor error sections. */
		hwerr_log_error_type(HWERR_RECOV_CPU);
	else if (guid_equal(sec_type, &CPER_SEC_CXL_PROT_ERR) ||
		 guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID) ||
		 guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID) ||
		 guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID))
		/* CXL protocol and CXL memory event sections. */
		hwerr_log_error_type(HWERR_RECOV_CXL);
	else if (guid_equal(sec_type, &CPER_SEC_PCIE) ||
		 guid_equal(sec_type, &CPER_SEC_PCI_X_BUS))
		/* PCIe and PCI-X bus sections. */
		hwerr_log_error_type(HWERR_RECOV_PCI);
	else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM))
		/* Platform memory section. */
		hwerr_log_error_type(HWERR_RECOV_MEMORY);
	else
		hwerr_log_error_type(HWERR_RECOV_OTHERS);
}
static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
@ -885,6 +920,7 @@ static void ghes_do_proc(struct ghes *ghes,
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
fru_text = gdata->fru_text;
ghes_log_hwerr(sev, sec_type);
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);

View File

@ -19,6 +19,7 @@ use kernel::{
cred::Credential,
error::Error,
fs::file::{self, File},
id_pool::IdPool,
list::{List, ListArc, ListArcField, ListLinks},
mm,
prelude::*,
@ -394,6 +395,8 @@ kernel::list::impl_list_item! {
struct ProcessNodeRefs {
/// Used to look up nodes using the 32-bit id that this process knows it by.
by_handle: RBTree<u32, ListArc<NodeRefInfo, { NodeRefInfo::LIST_PROC }>>,
/// Used to quickly find unused ids in `by_handle`.
handle_is_present: IdPool,
/// Used to look up nodes without knowing their local 32-bit id. The usize is the address of
/// the underlying `Node` struct as returned by `Node::global_id`.
by_node: RBTree<usize, u32>,
@ -408,6 +411,7 @@ impl ProcessNodeRefs {
fn new() -> Self {
Self {
by_handle: RBTree::new(),
handle_is_present: IdPool::new(),
by_node: RBTree::new(),
freeze_listeners: RBTree::new(),
}
@ -802,7 +806,7 @@ impl Process {
pub(crate) fn insert_or_update_handle(
self: ArcBorrow<'_, Process>,
node_ref: NodeRef,
is_mananger: bool,
is_manager: bool,
) -> Result<u32> {
{
let mut refs = self.node_refs.lock();
@ -821,7 +825,33 @@ impl Process {
let reserve2 = RBTreeNodeReservation::new(GFP_KERNEL)?;
let info = UniqueArc::new_uninit(GFP_KERNEL)?;
let mut refs = self.node_refs.lock();
let mut refs_lock = self.node_refs.lock();
let mut refs = &mut *refs_lock;
let (unused_id, by_handle_slot) = loop {
// ID 0 may only be used by the manager.
let start = if is_manager { 0 } else { 1 };
if let Some(res) = refs.handle_is_present.find_unused_id(start) {
match refs.by_handle.entry(res.as_u32()) {
rbtree::Entry::Vacant(entry) => break (res, entry),
rbtree::Entry::Occupied(_) => {
pr_err!("Detected mismatch between handle_is_present and by_handle");
res.acquire();
kernel::warn_on!(true);
return Err(EINVAL);
}
}
}
let grow_request = refs.handle_is_present.grow_request().ok_or(ENOMEM)?;
drop(refs_lock);
let resizer = grow_request.realloc(GFP_KERNEL)?;
refs_lock = self.node_refs.lock();
refs = &mut *refs_lock;
refs.handle_is_present.grow(resizer);
};
let handle = unused_id.as_u32();
// Do a lookup again as node may have been inserted before the lock was reacquired.
if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) {
@ -831,20 +861,9 @@ impl Process {
return Ok(handle);
}
// Find id.
let mut target: u32 = if is_mananger { 0 } else { 1 };
for handle in refs.by_handle.keys() {
if *handle > target {
break;
}
if *handle == target {
target = target.checked_add(1).ok_or(ENOMEM)?;
}
}
let gid = node_ref.node.global_id();
let (info_proc, info_node) = {
let info_init = NodeRefInfo::new(node_ref, target, self.into());
let info_init = NodeRefInfo::new(node_ref, handle, self.into());
match info.pin_init_with(info_init) {
Ok(info) => ListArc::pair_from_pin_unique(info),
// error is infallible
@ -865,9 +884,10 @@ impl Process {
// `info_node` into the right node's `refs` list.
unsafe { info_proc.node_ref2().node.insert_node_info(info_node) };
refs.by_node.insert(reserve1.into_node(gid, target));
refs.by_handle.insert(reserve2.into_node(target, info_proc));
Ok(target)
refs.by_node.insert(reserve1.into_node(gid, handle));
by_handle_slot.insert(info_proc, reserve2);
unused_id.acquire();
Ok(handle)
}
pub(crate) fn get_transaction_node(&self, handle: u32) -> BinderResult<NodeRef> {
@ -932,6 +952,16 @@ impl Process {
let id = info.node_ref().node.global_id();
refs.by_handle.remove(&handle);
refs.by_node.remove(&id);
refs.handle_is_present.release_id(handle as usize);
if let Some(shrink) = refs.handle_is_present.shrink_request() {
drop(refs);
// This intentionally ignores allocation failures.
if let Ok(new_bitmap) = shrink.realloc(GFP_KERNEL) {
refs = self.node_refs.lock();
refs.handle_is_present.shrink(new_bitmap);
}
}
}
} else {
// All refs are cleared in process exit, so this warning is expected in that case.

View File

@ -334,6 +334,19 @@ static struct device *next_device(struct klist_iter *i)
return dev;
}
/*
 * prev_device - step a bus device iterator one entry backwards.
 * @i: klist iterator to move with klist_prev().
 *
 * Returns the struct device that owns the klist node produced by
 * klist_prev(), or NULL when klist_prev() yields no node.
 */
static struct device *prev_device(struct klist_iter *i)
{
	struct klist_node *knode = klist_prev(i);
	struct device_private *priv;

	/* Nothing left in this direction. */
	if (!knode)
		return NULL;

	/* Map the bus klist node back to its device_private, then the device. */
	priv = to_device_private_bus(knode);
	return priv->device;
}
/**
* bus_for_each_dev - device iterator.
* @bus: bus type.
@ -414,6 +427,31 @@ struct device *bus_find_device(const struct bus_type *bus,
}
EXPORT_SYMBOL_GPL(bus_find_device);
/**
 * bus_find_device_reverse - find a device on a bus, walking backwards.
 * @bus: bus type whose device list is searched.
 * @start: device whose bus klist node seeds the iterator, or NULL.
 * @data: opaque data handed to @match on every call.
 * @match: callback deciding whether a device is the one being looked for.
 *
 * Steps through the bus's device klist with prev_device() and calls
 * @match for each device visited. The walk stops at the first device for
 * which @match returns non-zero; get_device() is called on that device
 * before it is returned, so the caller presumably has to drop that
 * reference (e.g. with put_device()) when done.
 *
 * Return: the matching device, or NULL if the bus has no subsystem
 * private data or no device matched.
 */
struct device *bus_find_device_reverse(const struct bus_type *bus,
				       struct device *start, const void *data,
				       device_match_t match)
{
	struct subsys_private *sp = bus_to_subsys(bus);
	struct klist_node *from = NULL;
	struct device *found = NULL;
	struct device *cur;
	struct klist_iter iter;

	if (!sp)
		return NULL;

	/* Seed the iterator at @start's bus node when one was supplied. */
	if (start)
		from = &start->p->knode_bus;

	klist_iter_init_node(&sp->klist_devices, &iter, from);
	for (cur = prev_device(&iter); cur; cur = prev_device(&iter)) {
		if (!match(cur, data))
			continue;

		/* Take the reference before leaving the iterator. */
		get_device(cur);
		found = cur;
		break;
	}
	klist_iter_exit(&iter);
	subsys_put(sp);

	return found;
}
EXPORT_SYMBOL_GPL(bus_find_device_reverse);
static struct device_driver *next_driver(struct klist_iter *i)
{
struct klist_node *n = klist_next(i);

View File

@ -3,6 +3,7 @@
* Copyright (C) 2013 Boris BREZILLON <b.brezillon@overkiz.com>
*/
#include <linux/bitfield.h>
#include <linux/bitops.h>
#include <linux/clk-provider.h>
#include <linux/clkdev.h>

View File

@ -117,9 +117,6 @@ struct at91_clk_pms {
unsigned int parent;
};
/*
 * field_get - extract the field selected by @_mask from @_reg.
 * Masks @_reg and shifts the result right by (ffs(_mask) - 1).
 * Note: @_mask is expanded twice, so it must be free of side effects.
 */
#define field_get(_mask, _reg) (((_reg) & (_mask)) >> (ffs(_mask) - 1))
/*
 * field_prep - place @_val into the field selected by @_mask.
 * Shifts @_val left by (ffs(_mask) - 1) and masks the result with @_mask.
 * Note: @_mask is expanded twice, so it must be free of side effects.
 */
#define field_prep(_mask, _val) (((_val) << (ffs(_mask) - 1)) & (_mask))
/* ndck - .id of element (s - 1) of array @a, plus one. */
#define ndck(a, s) (a[s - 1].id + 1)
/* nck - .id of the last element of array @a (per ARRAY_SIZE), plus one. */
#define nck(a) (a[ARRAY_SIZE(a) - 1].id + 1)

View File

@ -7,6 +7,7 @@
* Contact: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
*/
#include <linux/bitfield.h>
#include <linux/clk-provider.h>
#include <linux/init.h>
#include <linux/io.h>
@ -171,8 +172,7 @@ static u8 cpg_div6_clock_get_parent(struct clk_hw *hw)
if (clock->src_mask == 0)
return 0;
hw_index = (readl(clock->reg) & clock->src_mask) >>
__ffs(clock->src_mask);
hw_index = field_get(clock->src_mask, readl(clock->reg));
for (i = 0; i < clk_hw_get_num_parents(hw); i++) {
if (clock->parents[i] == hw_index)
return i;
@ -191,7 +191,7 @@ static int cpg_div6_clock_set_parent(struct clk_hw *hw, u8 index)
if (index >= clk_hw_get_num_parents(hw))
return -EINVAL;
src = clock->parents[index] << __ffs(clock->src_mask);
src = field_prep(clock->src_mask, clock->parents[index]);
writel((readl(clock->reg) & ~clock->src_mask) | src, clock->reg);
return 0;
}

View File

@ -54,10 +54,8 @@ static unsigned long cpg_pll_clk_recalc_rate(struct clk_hw *hw,
{
struct cpg_pll_clk *pll_clk = to_pll_clk(hw);
unsigned int mult;
u32 val;
val = readl(pll_clk->pllcr_reg) & CPG_PLLnCR_STC_MASK;
mult = (val >> __ffs(CPG_PLLnCR_STC_MASK)) + 1;
mult = FIELD_GET(CPG_PLLnCR_STC_MASK, readl(pll_clk->pllcr_reg)) + 1;
return parent_rate * mult * pll_clk->fixed_mult;
}
@ -94,7 +92,7 @@ static int cpg_pll_clk_set_rate(struct clk_hw *hw, unsigned long rate,
val = readl(pll_clk->pllcr_reg);
val &= ~CPG_PLLnCR_STC_MASK;
val |= (mult - 1) << __ffs(CPG_PLLnCR_STC_MASK);
val |= FIELD_PREP(CPG_PLLnCR_STC_MASK, mult - 1);
writel(val, pll_clk->pllcr_reg);
for (i = 1000; i; i--) {
@ -176,11 +174,7 @@ static unsigned long cpg_z_clk_recalc_rate(struct clk_hw *hw,
unsigned long parent_rate)
{
struct cpg_z_clk *zclk = to_z_clk(hw);
unsigned int mult;
u32 val;
val = readl(zclk->reg) & zclk->mask;
mult = 32 - (val >> __ffs(zclk->mask));
unsigned int mult = 32 - field_get(zclk->mask, readl(zclk->reg));
return DIV_ROUND_CLOSEST_ULL((u64)parent_rate * mult,
32 * zclk->fixed_div);
@ -231,7 +225,8 @@ static int cpg_z_clk_set_rate(struct clk_hw *hw, unsigned long rate,
if (readl(zclk->kick_reg) & CPG_FRQCRB_KICK)
return -EBUSY;
cpg_reg_modify(zclk->reg, zclk->mask, (32 - mult) << __ffs(zclk->mask));
cpg_reg_modify(zclk->reg, zclk->mask,
field_prep(zclk->mask, 32 - mult));
/*
* Set KICK bit in FRQCRB to update hardware setting and wait for

Some files were not shown because too many files have changed in this diff Show More