From a2f54ff15c3bdc0132e20aae041607e2320dbd73 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 28 Jul 2025 13:17:00 +0900 Subject: [PATCH 001/246] scsi: core: sysfs: Correct sysfs attributes access rights The SCSI sysfs attributes "supported_mode" and "active_mode" do not define a store method and thus cannot be modified. Correct the DEVICE_ATTR() call for these two attributes to not include S_IWUSR to allow write access as they are read-only. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20250728041700.76660-1-dlemoal@kernel.org Reviewed-by: John Garry Reviewed-by: Johannes Thumshin Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index d772258e29ad..e6464b998960 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -265,7 +265,7 @@ show_shost_supported_mode(struct device *dev, struct device_attribute *attr, return show_shost_mode(supported_mode, buf); } -static DEVICE_ATTR(supported_mode, S_IRUGO | S_IWUSR, show_shost_supported_mode, NULL); +static DEVICE_ATTR(supported_mode, S_IRUGO, show_shost_supported_mode, NULL); static ssize_t show_shost_active_mode(struct device *dev, @@ -279,7 +279,7 @@ show_shost_active_mode(struct device *dev, return show_shost_mode(shost->active_mode, buf); } -static DEVICE_ATTR(active_mode, S_IRUGO | S_IWUSR, show_shost_active_mode, NULL); +static DEVICE_ATTR(active_mode, S_IRUGO, show_shost_active_mode, NULL); static int check_reset_type(const char *str) { From 383cd6d879a18acdaa84c29330b25c49cbc0b490 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 29 Jul 2025 07:49:30 +0100 Subject: [PATCH 002/246] scsi: scsi_debug: Make read-only arrays static const Don't populate the read-only arrays on the stack at run time, instead make them static const. Also reduces overall size. before: text data bss dec hex filename 367439 89582 5952 462973 7107d drivers/scsi/scsi_debug.o after: text data bss dec hex filename 365847 90702 5952 462501 70ea5 drivers/scsi/scsi_debug.o (gcc 14.2.0, x86-64) Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20250729064930.1659007-1-colin.i.king@gmail.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 91 ++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 0847767d4d43..353cb60e1abe 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2674,8 +2674,10 @@ static int resp_rsup_tmfs(struct scsi_cmnd *scp, static int resp_err_recov_pg(unsigned char *p, int pcontrol, int target) { /* Read-Write Error Recovery page for mode_sense */ - unsigned char err_recov_pg[] = {0x1, 0xa, 0xc0, 11, 240, 0, 0, 0, - 5, 0, 0xff, 0xff}; + static const unsigned char err_recov_pg[] = { + 0x1, 0xa, 0xc0, 11, 240, 0, 0, 0, + 5, 0, 0xff, 0xff + }; memcpy(p, err_recov_pg, sizeof(err_recov_pg)); if (1 == pcontrol) @@ -2685,8 +2687,10 @@ static int resp_err_recov_pg(unsigned char *p, int pcontrol, int target) static int resp_disconnect_pg(unsigned char *p, int pcontrol, int target) { /* Disconnect-Reconnect page for mode_sense */ - unsigned char disconnect_pg[] = {0x2, 0xe, 128, 128, 0, 10, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0}; + static const unsigned char disconnect_pg[] = { + 0x2, 0xe, 128, 128, 0, 10, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; memcpy(p, disconnect_pg, sizeof(disconnect_pg)); if (1 == pcontrol) @@ -2696,9 +2700,11 @@ static int resp_disconnect_pg(unsigned char *p, int pcontrol, int target) static int resp_format_pg(unsigned char *p, int pcontrol, int target) { /* Format device page for mode_sense */ - unsigned char format_pg[] = {0x3, 0x16, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0x40, 0, 0, 0}; + static const unsigned char format_pg[] = { + 0x3, 0x16, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0x40, 0, 0, 0 + }; memcpy(p, format_pg, sizeof(format_pg)); put_unaligned_be16(sdebug_sectors_per, p + 10); @@ -2716,10 +2722,14 @@ static unsigned char caching_pg[] = {0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0, static int resp_caching_pg(unsigned char *p, int pcontrol, int target) { /* Caching page for mode_sense */ - unsigned char ch_caching_pg[] = {/* 0x8, 18, */ 0x4, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - unsigned char d_caching_pg[] = {0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0, - 0xff, 0xff, 0xff, 0xff, 0x80, 0x14, 0, 0, 0, 0, 0, 0}; + static const unsigned char ch_caching_pg[] = { + /* 0x8, 18, */ 0x4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + static const unsigned char d_caching_pg[] = { + 0x8, 18, 0x14, 0, 0xff, 0xff, 0, 0, + 0xff, 0xff, 0xff, 0xff, 0x80, 0x14, 0, 0, 0, 0, 0, 0 + }; if (SDEBUG_OPT_N_WCE & sdebug_opts) caching_pg[2] &= ~0x4; /* set WCE=0 (default WCE=1) */ @@ -2738,8 +2748,10 @@ static int resp_ctrl_m_pg(unsigned char *p, int pcontrol, int target) { /* Control mode page for mode_sense */ unsigned char ch_ctrl_m_pg[] = {/* 0xa, 10, */ 0x6, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - unsigned char d_ctrl_m_pg[] = {0xa, 10, 2, 0, 0, 0, 0, 0, - 0, 0, 0x2, 0x4b}; + static const unsigned char d_ctrl_m_pg[] = { + 0xa, 10, 2, 0, 0, 0, 0, 0, + 0, 0, 0x2, 0x4b + }; if (sdebug_dsense) ctrl_m_pg[2] |= 0x4; @@ -2794,10 +2806,14 @@ static int resp_grouping_m_pg(unsigned char *p, int pcontrol, int target) static int resp_iec_m_pg(unsigned char *p, int pcontrol, int target) { /* Informational Exceptions control mode page for mode_sense */ - unsigned char ch_iec_m_pg[] = {/* 0x1c, 0xa, */ 0x4, 0xf, 0, 0, 0, 0, - 0, 0, 0x0, 0x0}; - unsigned char d_iec_m_pg[] = {0x1c, 0xa, 0x08, 0, 0, 0, 0, 0, - 0, 0, 0x0, 0x0}; + static const unsigned char ch_iec_m_pg[] = { + /* 0x1c, 0xa, */ 0x4, 0xf, 0, 0, 0, 0, + 0, 0, 0x0, 0x0 + }; + static const unsigned char d_iec_m_pg[] = { + 0x1c, 0xa, 0x08, 0, 0, 0, 0, 0, + 0, 0, 0x0, 0x0 + }; memcpy(p, iec_m_pg, sizeof(iec_m_pg)); if (1 == pcontrol) @@ -2809,8 +2825,9 @@ static int resp_iec_m_pg(unsigned char *p, int pcontrol, int target) static int resp_sas_sf_m_pg(unsigned char *p, int pcontrol, int target) { /* SAS SSP mode page - short format for mode_sense */ - unsigned char sas_sf_m_pg[] = {0x19, 0x6, - 0x6, 0x0, 0x7, 0xd0, 0x0, 0x0}; + static const unsigned char sas_sf_m_pg[] = { + 0x19, 0x6, 0x6, 0x0, 0x7, 0xd0, 0x0, 0x0 + }; memcpy(p, sas_sf_m_pg, sizeof(sas_sf_m_pg)); if (1 == pcontrol) @@ -2854,9 +2871,10 @@ static int resp_sas_pcd_m_spg(unsigned char *p, int pcontrol, int target, static int resp_sas_sha_m_spg(unsigned char *p, int pcontrol) { /* SAS SSP shared protocol specific port mode subpage */ - unsigned char sas_sha_m_pg[] = {0x59, 0x2, 0, 0xc, 0, 0x6, 0x10, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - }; + static const unsigned char sas_sha_m_pg[] = { + 0x59, 0x2, 0, 0xc, 0, 0x6, 0x10, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }; memcpy(p, sas_sha_m_pg, sizeof(sas_sha_m_pg)); if (1 == pcontrol) @@ -2923,8 +2941,10 @@ static int process_medium_part_m_pg(struct sdebug_dev_info *devip, static int resp_compression_m_pg(unsigned char *p, int pcontrol, int target, unsigned char dce) { /* Compression page for mode_sense (tape) */ - unsigned char compression_pg[] = {0x0f, 14, 0x40, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 00, 00}; + static const unsigned char compression_pg[] = { + 0x0f, 14, 0x40, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0 + }; memcpy(p, compression_pg, sizeof(compression_pg)); if (dce) @@ -3282,9 +3302,10 @@ bad_pcode: static int resp_temp_l_pg(unsigned char *arr) { - unsigned char temp_l_pg[] = {0x0, 0x0, 0x3, 0x2, 0x0, 38, - 0x0, 0x1, 0x3, 0x2, 0x0, 65, - }; + static const unsigned char temp_l_pg[] = { + 0x0, 0x0, 0x3, 0x2, 0x0, 38, + 0x0, 0x1, 0x3, 0x2, 0x0, 65, + }; memcpy(arr, temp_l_pg, sizeof(temp_l_pg)); return sizeof(temp_l_pg); @@ -3292,8 +3313,9 @@ static int resp_temp_l_pg(unsigned char *arr) static int resp_ie_l_pg(unsigned char *arr) { - unsigned char ie_l_pg[] = {0x0, 0x0, 0x3, 0x3, 0x0, 0x0, 38, - }; + static const unsigned char ie_l_pg[] = { + 0x0, 0x0, 0x3, 0x3, 0x0, 0x0, 38, + }; memcpy(arr, ie_l_pg, sizeof(ie_l_pg)); if (iec_m_pg[2] & 0x4) { /* TEST bit set */ @@ -3305,11 +3327,12 @@ static int resp_ie_l_pg(unsigned char *arr) static int resp_env_rep_l_spg(unsigned char *arr) { - unsigned char env_rep_l_spg[] = {0x0, 0x0, 0x23, 0x8, - 0x0, 40, 72, 0xff, 45, 18, 0, 0, - 0x1, 0x0, 0x23, 0x8, - 0x0, 55, 72, 35, 55, 45, 0, 0, - }; + static const unsigned char env_rep_l_spg[] = { + 0x0, 0x0, 0x23, 0x8, + 0x0, 40, 72, 0xff, 45, 18, 0, 0, + 0x1, 0x0, 0x23, 0x8, + 0x0, 55, 72, 35, 55, 45, 0, 0, + }; memcpy(arr, env_rep_l_spg, sizeof(env_rep_l_spg)); return sizeof(env_rep_l_spg); From be71ce9796c36517c677ab1d3c6691423dd0bdec Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 8 Jul 2025 10:51:23 +0200 Subject: [PATCH 003/246] drm/bridge: fix OF node leak Make sure to drop the OF node reference taken when creating the aux bridge device when the device is later released. Fixes: 6914968a0b52 ("drm/bridge: properly refcount DT nodes in aux bridge drivers") Cc: Dmitry Baryshkov Signed-off-by: Johan Hovold Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250708085124.15445-2-johan@kernel.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/bridge/aux-bridge.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/bridge/aux-bridge.c b/drivers/gpu/drm/bridge/aux-bridge.c index b63304d3a80f..b3e4cdff61d6 100644 --- a/drivers/gpu/drm/bridge/aux-bridge.c +++ b/drivers/gpu/drm/bridge/aux-bridge.c @@ -18,6 +18,7 @@ static void drm_aux_bridge_release(struct device *dev) { struct auxiliary_device *adev = to_auxiliary_dev(dev); + of_node_put(dev->of_node); ida_free(&drm_aux_bridge_ida, adev->id); kfree(adev); @@ -65,6 +66,7 @@ int drm_aux_bridge_register(struct device *parent) ret = auxiliary_device_init(adev); if (ret) { + of_node_put(adev->dev.of_node); ida_free(&drm_aux_bridge_ida, adev->id); kfree(adev); return ret; From 7c527c15cdda2e0a26a05ac15a44d3e14738fc55 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 3 Aug 2025 21:20:12 +0900 Subject: [PATCH 004/246] firewire: core: use reference counting to invoke address handlers safely The lifetime of address handler has been managed by linked list and RCU. This approach was introduced in commit 35202f7d8420 ("firewire: remove global lock around address handlers, convert to RCU"). The invocations of address handler are performed within RCU read-side critical sections. In commit 57e6d9f85fff ("firewire: ohci: use workqueue to handle events of AR request/response contexts"), the invocations are in a workqueue context. The approach still imposes limitation that sleeping is not allowed within RCU read-side critical sections. However, since sleeping is not permitted within RCU read-side critical sections, this approach still has a limitation. This commit adds reference counting to decouple handler invocation from handler discovery. The linked list and RCU is used to discover the handlers, while the reference counting is used to invoke them safely. Link: https://lore.kernel.org/r/20250803122015.236493-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 32 +++++++++++++++++++++++++++-- include/linux/firewire.h | 4 ++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index d28477d84697..29ca9f3f14ce 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -550,6 +550,23 @@ const struct fw_address_region fw_unit_space_region = { .start = 0xfffff0000900ULL, .end = 0x1000000000000ULL, }; #endif /* 0 */ +static void complete_address_handler(struct kref *kref) +{ + struct fw_address_handler *handler = container_of(kref, struct fw_address_handler, kref); + + complete(&handler->done); +} + +static void get_address_handler(struct fw_address_handler *handler) +{ + kref_get(&handler->kref); +} + +static int put_address_handler(struct fw_address_handler *handler) +{ + return kref_put(&handler->kref, complete_address_handler); +} + /** * fw_core_add_address_handler() - register for incoming requests * @handler: callback @@ -596,6 +613,8 @@ int fw_core_add_address_handler(struct fw_address_handler *handler, if (other != NULL) { handler->offset += other->length; } else { + init_completion(&handler->done); + kref_init(&handler->kref); list_add_tail_rcu(&handler->link, &address_handler_list); ret = 0; break; @@ -621,6 +640,9 @@ void fw_core_remove_address_handler(struct fw_address_handler *handler) list_del_rcu(&handler->link); synchronize_rcu(); + + if (!put_address_handler(handler)) + wait_for_completion(&handler->done); } EXPORT_SYMBOL(fw_core_remove_address_handler); @@ -913,10 +935,13 @@ static void handle_exclusive_region_request(struct fw_card *card, scoped_guard(rcu) { handler = lookup_enclosing_address_handler(&address_handler_list, offset, request->length); - if (handler) + if (handler) { + get_address_handler(handler); handler->address_callback(card, request, tcode, destination, source, p->generation, offset, request->data, request->length, handler->callback_data); + put_address_handler(handler); + } } if (!handler) @@ -952,10 +977,13 @@ static void handle_fcp_region_request(struct fw_card *card, scoped_guard(rcu) { list_for_each_entry_rcu(handler, &address_handler_list, link) { - if (is_enclosing_handler(handler, offset, request->length)) + if (is_enclosing_handler(handler, offset, request->length)) { + get_address_handler(handler); handler->address_callback(card, request, tcode, destination, source, p->generation, offset, request->data, request->length, handler->callback_data); + put_address_handler(handler); + } } } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index cceb70415ed2..d38c6e538e5c 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -341,7 +341,11 @@ struct fw_address_handler { u64 length; fw_address_callback_t address_callback; void *callback_data; + + // Only for core functions. struct list_head link; + struct kref kref; + struct completion done; }; struct fw_address_region { From e8cf6875005b017c293bf1b9be707c43f3eff9f4 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 3 Aug 2025 21:20:13 +0900 Subject: [PATCH 005/246] firewire: core: call handler for exclusive regions outside RCU read-side critical section The previous commit added reference counting to ensure safe invocations of address handlers. This commit moves the invocation of handlers for exclusive regions outside of the RCU read-side critical section. The address handler for the requested region is selected within the critical section, then invoked outside of it. Link: https://lore.kernel.org/r/20250803122015.236493-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 29ca9f3f14ce..a742971c65fa 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -935,17 +935,19 @@ static void handle_exclusive_region_request(struct fw_card *card, scoped_guard(rcu) { handler = lookup_enclosing_address_handler(&address_handler_list, offset, request->length); - if (handler) { + if (handler) get_address_handler(handler); - handler->address_callback(card, request, tcode, destination, source, - p->generation, offset, request->data, - request->length, handler->callback_data); - put_address_handler(handler); - } } - if (!handler) + if (!handler) { fw_send_response(card, request, RCODE_ADDRESS_ERROR); + return; + } + + // Outside the RCU read-side critical section. Without spinlock. With reference count. + handler->address_callback(card, request, tcode, destination, source, p->generation, offset, + request->data, request->length, handler->callback_data); + put_address_handler(handler); } static void handle_fcp_region_request(struct fw_card *card, From e884a8a0c573ca5c191b269f31993733ecb6250e Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 3 Aug 2025 21:20:14 +0900 Subject: [PATCH 006/246] firewire: core: call FCP address handlers outside RCU read-side critical section The former commit added reference counting to ensure safe invocations of address handlers. Unlike the exclusive-region address handlers, all FCP address handlers should be called on receiving an FCP request. This commit uses the part of kernel stack to collect address handlers up to 4 within the section, then invoke them outside of the section. Reference counting ensures that each handler remains valid and safe to call. Lifting the limitation of supporting only 4 handlers is left for next work. Link: https://lore.kernel.org/r/20250803122015.236493-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index a742971c65fa..7a62c660e912 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -950,13 +950,17 @@ static void handle_exclusive_region_request(struct fw_card *card, put_address_handler(handler); } +// To use kmalloc allocator efficiently, this should be power of two. +#define BUFFER_ON_KERNEL_STACK_SIZE 4 + static void handle_fcp_region_request(struct fw_card *card, struct fw_packet *p, struct fw_request *request, unsigned long long offset) { - struct fw_address_handler *handler; - int tcode, destination, source; + struct fw_address_handler *buffer_on_kernel_stack[BUFFER_ON_KERNEL_STACK_SIZE]; + struct fw_address_handler *handler, **handlers; + int tcode, destination, source, i, count; if ((offset != (CSR_REGISTER_BASE | CSR_FCP_COMMAND) && offset != (CSR_REGISTER_BASE | CSR_FCP_RESPONSE)) || @@ -977,18 +981,27 @@ static void handle_fcp_region_request(struct fw_card *card, return; } + count = 0; + handlers = buffer_on_kernel_stack; scoped_guard(rcu) { list_for_each_entry_rcu(handler, &address_handler_list, link) { if (is_enclosing_handler(handler, offset, request->length)) { get_address_handler(handler); - handler->address_callback(card, request, tcode, destination, source, - p->generation, offset, request->data, - request->length, handler->callback_data); - put_address_handler(handler); + handlers[count] = handler; + if (++count >= ARRAY_SIZE(buffer_on_kernel_stack)) + break; } } } + for (i = 0; i < count; ++i) { + handler = handlers[i]; + handler->address_callback(card, request, tcode, destination, source, + p->generation, offset, request->data, + request->length, handler->callback_data); + put_address_handler(handler); + } + fw_send_response(card, request, RCODE_COMPLETE); } From 0342273e14c25971f2916de2b598db2e9cfeec15 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 3 Aug 2025 21:20:15 +0900 Subject: [PATCH 007/246] firewire: core: reallocate buffer for FCP address handlers when more than 4 are registered The former commit has a limitation that only up to 4 FCP address handlers could be processed per request. Although it suffices for most use cases, it is technically a regression. This commit lifts the restriction by reallocating the buffer from kernel heap when more than 4 handlers are registered. The allocation is performed within RCU read-side critical section, thus it uses GCP_ATOMIC flag. The buffer size is rounded up to the next power of two to align with kmalloc allocation units. Link: https://lore.kernel.org/r/20250803122015.236493-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-transaction.c | 36 +++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 7a62c660e912..1d1c2d8f85ae 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -960,7 +960,7 @@ static void handle_fcp_region_request(struct fw_card *card, { struct fw_address_handler *buffer_on_kernel_stack[BUFFER_ON_KERNEL_STACK_SIZE]; struct fw_address_handler *handler, **handlers; - int tcode, destination, source, i, count; + int tcode, destination, source, i, count, buffer_size; if ((offset != (CSR_REGISTER_BASE | CSR_FCP_COMMAND) && offset != (CSR_REGISTER_BASE | CSR_FCP_RESPONSE)) || @@ -983,13 +983,38 @@ static void handle_fcp_region_request(struct fw_card *card, count = 0; handlers = buffer_on_kernel_stack; + buffer_size = ARRAY_SIZE(buffer_on_kernel_stack); scoped_guard(rcu) { list_for_each_entry_rcu(handler, &address_handler_list, link) { if (is_enclosing_handler(handler, offset, request->length)) { + if (count >= buffer_size) { + int next_size = buffer_size * 2; + struct fw_address_handler **buffer_on_kernel_heap; + + if (handlers == buffer_on_kernel_stack) + buffer_on_kernel_heap = NULL; + else + buffer_on_kernel_heap = handlers; + + buffer_on_kernel_heap = + krealloc_array(buffer_on_kernel_heap, next_size, + sizeof(*buffer_on_kernel_heap), GFP_ATOMIC); + // FCP is used for purposes unrelated to significant system + // resources (e.g. storage or networking), so allocation + // failures are not considered so critical. + if (!buffer_on_kernel_heap) + break; + + if (handlers == buffer_on_kernel_stack) { + memcpy(buffer_on_kernel_heap, buffer_on_kernel_stack, + sizeof(buffer_on_kernel_stack)); + } + + handlers = buffer_on_kernel_heap; + buffer_size = next_size; + } get_address_handler(handler); - handlers[count] = handler; - if (++count >= ARRAY_SIZE(buffer_on_kernel_stack)) - break; + handlers[count++] = handler; } } } @@ -1002,6 +1027,9 @@ static void handle_fcp_region_request(struct fw_card *card, put_address_handler(handler); } + if (handlers != buffer_on_kernel_stack) + kfree(handlers); + fw_send_response(card, request, RCODE_COMPLETE); } From 584460393efbcccb6388b1cd5d37284b5326709c Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Wed, 16 Jul 2025 20:55:55 +0800 Subject: [PATCH 008/246] drm/bridge: Describe the newly introduced drm_connector parameter for drm_bridge_detect This fix the make htmldocs warnings: drivers/gpu/drm/drm_bridge.c:1242: warning: Function parameter or struct member 'connector' not described in 'drm_bridge_detect' Fixes: 5d156a9c3d5e ("drm/bridge: Pass down connector to drm bridge detect hook") Signed-off-by: Andy Yan Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250716125602.3166573-1-andyshrk@163.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/drm_bridge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_bridge.c b/drivers/gpu/drm/drm_bridge.c index dd45d9b504d8..4bde00083047 100644 --- a/drivers/gpu/drm/drm_bridge.c +++ b/drivers/gpu/drm/drm_bridge.c @@ -1227,6 +1227,7 @@ EXPORT_SYMBOL(drm_atomic_bridge_chain_check); /** * drm_bridge_detect - check if anything is attached to the bridge output * @bridge: bridge control structure + * @connector: attached connector * * If the bridge supports output detection, as reported by the * DRM_BRIDGE_OP_DETECT bridge ops flag, call &drm_bridge_funcs.detect for the From eec8e8c048caa826ecbde7bf40f0ac2d11eef99d Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 1 Aug 2025 13:46:41 +0300 Subject: [PATCH 009/246] drm/bridge: document HDMI CEC callbacks Provide documentation for the drm_bridge callbacks related to the DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag. Fixes: a74288c8ded7 ("drm/display: bridge-connector: handle CEC adapters") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20250611140933.1429a1b8@canb.auug.org.au Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250801-drm-hdmi-cec-docs-v1-1-be63e6008d0e@oss.qualcomm.com Signed-off-by: Dmitry Baryshkov --- include/drm/drm_bridge.h | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index 8ed80cad77ec..b0e6653ee42e 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -866,13 +866,61 @@ struct drm_bridge_funcs { struct drm_connector *connector, bool enable, int direction); + /** + * @hdmi_cec_init: + * + * Initialize CEC part of the bridge. + * + * This callback is optional, it can be implemented by bridges that + * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their + * &drm_bridge->ops. + * + * Returns: + * 0 on success, a negative error code otherwise + */ int (*hdmi_cec_init)(struct drm_bridge *bridge, struct drm_connector *connector); + /** + * @hdmi_cec_enable: + * + * Enable or disable the CEC adapter inside the bridge. + * + * This callback is optional, it can be implemented by bridges that + * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their + * &drm_bridge->ops. + * + * Returns: + * 0 on success, a negative error code otherwise + */ int (*hdmi_cec_enable)(struct drm_bridge *bridge, bool enable); + /** + * @hdmi_cec_log_addr: + * + * Set the logical address of the CEC adapter inside the bridge. + * + * This callback is optional, it can be implemented by bridges that + * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their + * &drm_bridge->ops. + * + * Returns: + * 0 on success, a negative error code otherwise + */ int (*hdmi_cec_log_addr)(struct drm_bridge *bridge, u8 logical_addr); + /** + * @hdmi_cec_transmit: + * + * Transmit the message using the CEC adapter inside the bridge. + * + * This callback is optional, it can be implemented by bridges that + * set the @DRM_BRIDGE_OP_HDMI_CEC_ADAPTER flag in their + * &drm_bridge->ops. + * + * Returns: + * 0 on success, a negative error code otherwise + */ int (*hdmi_cec_transmit)(struct drm_bridge *bridge, u8 attempts, u32 signal_free_time, struct cec_msg *msg); From 034d319c8899e8c5c0a35c6692c7fc7e8c12c374 Mon Sep 17 00:00:00 2001 From: Nitin Rawat Date: Tue, 29 Jul 2025 04:27:11 +0530 Subject: [PATCH 010/246] scsi: ufs: core: Fix interrupt handling for MCQ Mode Commit 3c7ac40d7322 ("scsi: ufs: core: Delegate the interrupt service routine to a threaded IRQ handler") introduced a regression where the UFS interrupt status register (IS) was not cleared in ufshcd_intr() when operating in MCQ mode. As a result, the IS register remained uncleared. This led to a persistent issue during UIC interrupts: ufshcd_is_auto_hibern8_error() consistently returned true because the UFSHCD_UIC_HIBERN8_MASK bit was set, while the active command was neither UIC_CMD_DME_HIBER_ENTER nor UIC_CMD_DME_HIBER_EXIT. This caused continuous auto hibern8 enter errors and device failed to boot. To fix this, ensure that the interrupt status register is properly cleared in the ufshcd_intr() function for both MCQ mode with ESI enabled. [ 4.553226] ufshcd-qcom 1d84000.ufs: ufshcd_check_errors: Auto Hibern8 Enter failed - status: 0x00000040, upmcrs: 0x00000001 [ 4.553229] ufshcd-qcom 1d84000.ufs: ufshcd_check_errors: saved_err 0x40 saved_uic_err 0x0 [ 4.553311] host_regs: 00000000: d5c7033f 20e0071f 00000400 00000000 [ 4.553312] host_regs: 00000010: 01000000 00010217 00000c96 00000000 [ 4.553314] host_regs: 00000020: 00000440 00170ef5 00000000 00000000 [ 4.553316] host_regs: 00000030: 0000010f 00000001 00000000 00000000 [ 4.553317] host_regs: 00000040: 00000000 00000000 00000000 00000000 [ 4.553319] host_regs: 00000050: fffdf000 0000000f 00000000 00000000 [ 4.553320] host_regs: 00000060: 00000001 80000000 00000000 00000000 [ 4.553322] host_regs: 00000070: fffde000 0000000f 00000000 00000000 [ 4.553323] host_regs: 00000080: 00000001 00000000 00000000 00000000 [ 4.553325] host_regs: 00000090: 00000002 d0020000 00000000 01930200 Fixes: 3c7ac40d7322 ("scsi: ufs: core: Delegate the interrupt service routine to a threaded IRQ handler") Co-developed-by: Palash Kambar Signed-off-by: Palash Kambar Signed-off-by: Nitin Rawat Link: https://lore.kernel.org/r/20250728225711.29273-1-quic_nitirawa@quicinc.com Tested-by: Neil Armstrong # on SM8650-QRD Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 5442bb8540b5..baf28bd748cc 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -7133,14 +7133,19 @@ static irqreturn_t ufshcd_threaded_intr(int irq, void *__hba) static irqreturn_t ufshcd_intr(int irq, void *__hba) { struct ufs_hba *hba = __hba; + u32 intr_status, enabled_intr_status; /* Move interrupt handling to thread when MCQ & ESI are not enabled */ if (!hba->mcq_enabled || !hba->mcq_esi_enabled) return IRQ_WAKE_THREAD; + intr_status = ufshcd_readl(hba, REG_INTERRUPT_STATUS); + enabled_intr_status = intr_status & ufshcd_readl(hba, REG_INTERRUPT_ENABLE); + + ufshcd_writel(hba, intr_status, REG_INTERRUPT_STATUS); + /* Directly handle interrupts since MCQ ESI handlers does the hard job */ - return ufshcd_sl_intr(hba, ufshcd_readl(hba, REG_INTERRUPT_STATUS) & - ufshcd_readl(hba, REG_INTERRUPT_ENABLE)); + return ufshcd_sl_intr(hba, enabled_intr_status); } static int ufshcd_clear_tm_cmd(struct ufs_hba *hba, int tag) From a59976116a01dad1c72460f9ed700bf4b3fdbebd Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 31 Jul 2025 13:33:11 +0200 Subject: [PATCH 011/246] scsi: lpfc: Fix wrong function reference in a comment Function scsi_host_remove() doesn't exist, the actual function name is scsi_remove_host(). Signed-off-by: Jean Delvare Link: https://lore.kernel.org/r/20250731133311.52034cc4@endymion Reviewed-by: Justin Tee Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_vport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c index 2797aa75a689..aff6c9d5e7c2 100644 --- a/drivers/scsi/lpfc/lpfc_vport.c +++ b/drivers/scsi/lpfc/lpfc_vport.c @@ -666,7 +666,7 @@ lpfc_vport_delete(struct fc_vport *fc_vport) * Take early refcount for outstanding I/O requests we schedule during * delete processing for unreg_vpi. Always keep this before * scsi_remove_host() as we can no longer obtain a reference through - * scsi_host_get() after scsi_host_remove as shost is set to SHOST_DEL. + * scsi_host_get() after scsi_remove_host as shost is set to SHOST_DEL. */ if (!scsi_host_get(shost)) return VPORT_INVAL; From eea6cafb5890db488fce1c69d05464214616d800 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Fri, 1 Aug 2025 18:52:02 +0000 Subject: [PATCH 012/246] scsi: lpfc: Remove redundant assignment to avoid memory leak Remove the redundant assignment if kzalloc() succeeds to avoid memory leak. Fixes: bd2cdd5e400f ("scsi: lpfc: NVME Initiator: Add debugfs support") Signed-off-by: Jiasheng Jiang Link: https://lore.kernel.org/r/20250801185202.42631-1-jiashengjiangcool@gmail.com Reviewed-by: Justin Tee Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_debugfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_debugfs.c b/drivers/scsi/lpfc/lpfc_debugfs.c index 2c7d876c64c7..47cffdf2a8ac 100644 --- a/drivers/scsi/lpfc/lpfc_debugfs.c +++ b/drivers/scsi/lpfc/lpfc_debugfs.c @@ -6289,7 +6289,6 @@ lpfc_debugfs_initialize(struct lpfc_vport *vport) } phba->nvmeio_trc_on = 1; phba->nvmeio_trc_output_idx = 0; - phba->nvmeio_trc = NULL; } else { nvmeio_off: phba->nvmeio_trc_size = 0; From 7ec2bd6cd2d0ce6d6224519f895cb932ed5af667 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Mon, 4 Aug 2025 14:01:54 +0800 Subject: [PATCH 013/246] scsi: ufs: mediatek: Fix out-of-bounds access in MCQ IRQ mapping Address a potential out-of-bounds access issue when accessing 'host->mcq_intr_info[q_index]'. The value of 'q_index' might exceed the valid array bounds if 'q_index == nr'. Correct condition to 'q_index >= nr' to prevent accessing invalid memory. Fixes: 66e26a4b8a77 ("scsi: ufs: host: mediatek: Set IRQ affinity policy for MCQ mode") Cc: stable@vger.kernel.org Reported-by: Dan Carpenter Signed-off-by: Peter Wang Link: https://lore.kernel.org/r/20250804060249.1387057-1-peter.wang@mediatek.com Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-mediatek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ufs/host/ufs-mediatek.c b/drivers/ufs/host/ufs-mediatek.c index 86ae73b89d4d..f902ce08c95a 100644 --- a/drivers/ufs/host/ufs-mediatek.c +++ b/drivers/ufs/host/ufs-mediatek.c @@ -818,7 +818,7 @@ static u32 ufs_mtk_mcq_get_irq(struct ufs_hba *hba, unsigned int cpu) unsigned int q_index; q_index = map->mq_map[cpu]; - if (q_index > nr) { + if (q_index >= nr) { dev_err(hba->dev, "hwq index %d exceed %d\n", q_index, nr); return MTK_MCQ_INVALID_IRQ; From 72fc388d8bc0b49fd038477b74618cc15ce18b56 Mon Sep 17 00:00:00 2001 From: Waqar Hameed Date: Tue, 5 Aug 2025 11:33:36 +0200 Subject: [PATCH 014/246] scsi: ufs: core: Remove error print for devm_add_action_or_reset() When devm_add_action_or_reset() fails, it is due to a failed memory allocation and will thus return -ENOMEM. dev_err_probe() doesn't do anything when error is -ENOMEM. Therefore, remove the useless call to dev_err_probe() when devm_add_action_or_reset() fails, and just return the value instead. Signed-off-by: Waqar Hameed Link: https://lore.kernel.org/r/pndtt2mkt8v.a.out@axis.com Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index baf28bd748cc..2e1fa8cf83f5 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10515,8 +10515,7 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle) err = devm_add_action_or_reset(dev, ufshcd_devres_release, host); if (err) - return dev_err_probe(dev, err, - "failed to add ufshcd dealloc action\n"); + return err; host->nr_maps = HCTX_TYPE_POLL + 1; hba = shost_priv(host); From f8f6e72fe28595969829d63db93ecaa56a0c2811 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 5 Aug 2025 20:57:50 +0300 Subject: [PATCH 015/246] drm/omap: Pass along the format info from .fb_create() to drm_helper_mode_fill_fb_struct() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumb the format info from .fb_create() all the way to drm_helper_mode_fill_fb_struct() to avoid the redundant lookup. For the fbdev case a manual drm_get_format_info() lookup is needed. The patch is based on the driver parts of the patchset at Link: below, which missed converting the omap driver. Due to the absence of this change in the patchset at Link:, after the Fixed: commit below, omap_framebuffer_init() -> drm_helper_mode_fill_fb_struct() set drm_framebuffer::format incorrectly to NULL, which lead to the !fb->format WARN() in drm_framebuffer_init() and causing framebuffer creation to fail. This patch fixes both of these issues. v2: Amend the commit log mentioning the functional issues the patch fixes. (Tomi) Cc: Ville Syrjälä Cc: Tomi Valkeinen Cc: Thomas Zimmermann Cc: Maarten Lankhorst Cc: Maxime Ripard Fixes: 41ab92d35ccd ("drm: Make passing of format info to drm_helper_mode_fill_fb_struct() mandatory") Reported-by: Mark Brown Closes: https://lore.kernel.org/all/98b3a62c-91ff-4f91-a58b-e1265f84180b@sirena.org.uk Link: https://lore.kernel.org/all/20250701090722.13645-1-ville.syrjala@linux.intel.com Tested-by: Mark Brown Tested-by: Linux Kernel Functional Testing Acked-by: Alex Deucher Reviewed-by: Tomi Valkeinen Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20250805175752.690504-2-imre.deak@intel.com --- drivers/gpu/drm/omapdrm/omap_fb.c | 23 ++++++++++------------- drivers/gpu/drm/omapdrm/omap_fb.h | 2 ++ drivers/gpu/drm/omapdrm/omap_fbdev.c | 5 ++++- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/omapdrm/omap_fb.c b/drivers/gpu/drm/omapdrm/omap_fb.c index 30c81e2e5d6b..bb3105556f19 100644 --- a/drivers/gpu/drm/omapdrm/omap_fb.c +++ b/drivers/gpu/drm/omapdrm/omap_fb.c @@ -351,7 +351,7 @@ struct drm_framebuffer *omap_framebuffer_create(struct drm_device *dev, } } - fb = omap_framebuffer_init(dev, mode_cmd, bos); + fb = omap_framebuffer_init(dev, info, mode_cmd, bos); if (IS_ERR(fb)) goto error; @@ -365,9 +365,9 @@ error: } struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **bos) { - const struct drm_format_info *format = NULL; struct omap_framebuffer *omap_fb = NULL; struct drm_framebuffer *fb = NULL; unsigned int pitch = mode_cmd->pitches[0]; @@ -377,15 +377,12 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, dev, mode_cmd, mode_cmd->width, mode_cmd->height, (char *)&mode_cmd->pixel_format); - format = drm_get_format_info(dev, mode_cmd->pixel_format, - mode_cmd->modifier[0]); - for (i = 0; i < ARRAY_SIZE(formats); i++) { if (formats[i] == mode_cmd->pixel_format) break; } - if (!format || i == ARRAY_SIZE(formats)) { + if (i == ARRAY_SIZE(formats)) { dev_dbg(dev->dev, "unsupported pixel format: %4.4s\n", (char *)&mode_cmd->pixel_format); ret = -EINVAL; @@ -399,7 +396,7 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, } fb = &omap_fb->base; - omap_fb->format = format; + omap_fb->format = info; mutex_init(&omap_fb->lock); /* @@ -407,23 +404,23 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, * that the two planes of multiplane formats need the same number of * bytes per pixel. */ - if (format->num_planes == 2 && pitch != mode_cmd->pitches[1]) { + if (info->num_planes == 2 && pitch != mode_cmd->pitches[1]) { dev_dbg(dev->dev, "pitches differ between planes 0 and 1\n"); ret = -EINVAL; goto fail; } - if (pitch % format->cpp[0]) { + if (pitch % info->cpp[0]) { dev_dbg(dev->dev, "buffer pitch (%u bytes) is not a multiple of pixel size (%u bytes)\n", - pitch, format->cpp[0]); + pitch, info->cpp[0]); ret = -EINVAL; goto fail; } - for (i = 0; i < format->num_planes; i++) { + for (i = 0; i < info->num_planes; i++) { struct plane *plane = &omap_fb->planes[i]; - unsigned int vsub = i == 0 ? 1 : format->vsub; + unsigned int vsub = i == 0 ? 1 : info->vsub; unsigned int size; size = pitch * mode_cmd->height / vsub; @@ -440,7 +437,7 @@ struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, plane->dma_addr = 0; } - drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd); + drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd); ret = drm_framebuffer_init(dev, fb, &omap_framebuffer_funcs); if (ret) { diff --git a/drivers/gpu/drm/omapdrm/omap_fb.h b/drivers/gpu/drm/omapdrm/omap_fb.h index 0873f953cf1d..e6010302a22b 100644 --- a/drivers/gpu/drm/omapdrm/omap_fb.h +++ b/drivers/gpu/drm/omapdrm/omap_fb.h @@ -13,6 +13,7 @@ struct drm_connector; struct drm_device; struct drm_file; struct drm_framebuffer; +struct drm_format_info; struct drm_gem_object; struct drm_mode_fb_cmd2; struct drm_plane_state; @@ -23,6 +24,7 @@ struct drm_framebuffer *omap_framebuffer_create(struct drm_device *dev, struct drm_file *file, const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd); struct drm_framebuffer *omap_framebuffer_init(struct drm_device *dev, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **bos); int omap_framebuffer_pin(struct drm_framebuffer *fb); void omap_framebuffer_unpin(struct drm_framebuffer *fb); diff --git a/drivers/gpu/drm/omapdrm/omap_fbdev.c b/drivers/gpu/drm/omapdrm/omap_fbdev.c index 7b6396890681..948af7ec1130 100644 --- a/drivers/gpu/drm/omapdrm/omap_fbdev.c +++ b/drivers/gpu/drm/omapdrm/omap_fbdev.c @@ -197,7 +197,10 @@ int omap_fbdev_driver_fbdev_probe(struct drm_fb_helper *helper, goto fail; } - fb = omap_framebuffer_init(dev, &mode_cmd, &bo); + fb = omap_framebuffer_init(dev, + drm_get_format_info(dev, mode_cmd.pixel_format, + mode_cmd.modifier[0]), + &mode_cmd, &bo); if (IS_ERR(fb)) { dev_err(dev->dev, "failed to allocate fb\n"); /* note: if fb creation failed, we can't rely on fb destroy From d2b524c9064301471e8ffe4ffd85ab8870966aa4 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 5 Aug 2025 20:57:51 +0300 Subject: [PATCH 016/246] drm/nouveau: Pass along the format info from .fb_create() to drm_helper_mode_fill_fb_struct() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumb the format info from .fb_create() all the way to drm_helper_mode_fill_fb_struct() to avoid the redundant lookup. The patch is based on the driver parts of the patchset at Link: below, which missed converting the nouveau driver. Due to the absence of this change in the patchset at Link:, after the Fixed: commit below, nouveau_framebuffer_new() -> drm_helper_mode_fill_fb_struct() set drm_framebuffer::format incorrectly to NULL, which lead to the !fb->format WARN() in drm_framebuffer_init() and causing framebuffer creation to fail. This patch fixes both of these issues. v2: Amend the commit log mentioning the functional issues the patch fixes. (Tomi) Cc: Ville Syrjälä Cc: Lyude Paul Cc: Danilo Krummrich Cc: Thomas Zimmermann Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Tomi Valkeinen Cc: nouveau@lists.freedesktop.org Fixes: 41ab92d35ccd ("drm: Make passing of format info to drm_helper_mode_fill_fb_struct() mandatory") Link: https://lore.kernel.org/all/20250701090722.13645-1-ville.syrjala@linux.intel.com Acked-by: Alex Deucher Acked-by: Danilo Krummrich Reviewed-by: James Jones Tested-by: Linux Kernel Functional Testing Tested-by: James Jones Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20250805175752.690504-3-imre.deak@intel.com --- drivers/gpu/drm/nouveau/nouveau_display.c | 9 +++------ drivers/gpu/drm/nouveau/nouveau_display.h | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c b/drivers/gpu/drm/nouveau/nouveau_display.c index e1e542126310..805d0a87aa54 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.c +++ b/drivers/gpu/drm/nouveau/nouveau_display.c @@ -253,6 +253,7 @@ nouveau_check_bl_size(struct nouveau_drm *drm, struct nouveau_bo *nvbo, int nouveau_framebuffer_new(struct drm_device *dev, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object *gem, struct drm_framebuffer **pfb) @@ -260,7 +261,6 @@ nouveau_framebuffer_new(struct drm_device *dev, struct nouveau_drm *drm = nouveau_drm(dev); struct nouveau_bo *nvbo = nouveau_gem_object(gem); struct drm_framebuffer *fb; - const struct drm_format_info *info; unsigned int height, i; uint32_t tile_mode; uint8_t kind; @@ -295,9 +295,6 @@ nouveau_framebuffer_new(struct drm_device *dev, kind = nvbo->kind; } - info = drm_get_format_info(dev, mode_cmd->pixel_format, - mode_cmd->modifier[0]); - for (i = 0; i < info->num_planes; i++) { height = drm_format_info_plane_height(info, mode_cmd->height, @@ -321,7 +318,7 @@ nouveau_framebuffer_new(struct drm_device *dev, if (!(fb = *pfb = kzalloc(sizeof(*fb), GFP_KERNEL))) return -ENOMEM; - drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd); + drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd); fb->obj[0] = gem; ret = drm_framebuffer_init(dev, fb, &nouveau_framebuffer_funcs); @@ -344,7 +341,7 @@ nouveau_user_framebuffer_create(struct drm_device *dev, if (!gem) return ERR_PTR(-ENOENT); - ret = nouveau_framebuffer_new(dev, mode_cmd, gem, &fb); + ret = nouveau_framebuffer_new(dev, info, mode_cmd, gem, &fb); if (ret == 0) return fb; diff --git a/drivers/gpu/drm/nouveau/nouveau_display.h b/drivers/gpu/drm/nouveau/nouveau_display.h index e45f211501f6..470e0910d484 100644 --- a/drivers/gpu/drm/nouveau/nouveau_display.h +++ b/drivers/gpu/drm/nouveau/nouveau_display.h @@ -8,8 +8,11 @@ #include +struct drm_format_info; + int nouveau_framebuffer_new(struct drm_device *dev, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object *gem, struct drm_framebuffer **pfb); From c0a8e4443d768e5c86ddb52a3a744a151e7b72b0 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 5 Aug 2025 20:57:52 +0300 Subject: [PATCH 017/246] drm/radeon: Pass along the format info from .fb_create() to drm_helper_mode_fill_fb_struct() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plumb the format info from .fb_create() all the way to drm_helper_mode_fill_fb_struct() to avoid the redundant lookup. For the fbdev case a manual drm_get_format_info() lookup is needed. The patch is based on the driver parts of the patchset at Link: below, which missed converting the radeon driver. Due to the absence of this change in the patchset at Link:, after the Fixed: commit below, radeon_framebuffer_init() -> drm_helper_mode_fill_fb_struct() set drm_framebuffer::format incorrectly to NULL, which lead to the !fb->format WARN() in drm_framebuffer_init() and causing framebuffer creation to fail. This patch fixes both of these issues. v2: Amend the commit log mentioning the functional issues the patch fixes. (Tomi) Cc: Ville Syrjälä Cc: Alex Deucher Cc: Christian König Cc: Thomas Zimmermann Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: amd-gfx@lists.freedesktop.org Cc: Tomi Valkeinen Fixes: 41ab92d35ccd ("drm: Make passing of format info to drm_helper_mode_fill_fb_struct() mandatory") Link: https://lore.kernel.org/all/20250701090722.13645-1-ville.syrjala@linux.intel.com Acked-by: Alex Deucher Tested-by: Linux Kernel Functional Testing Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20250805175752.690504-4-imre.deak@intel.com --- drivers/gpu/drm/radeon/radeon_display.c | 5 +++-- drivers/gpu/drm/radeon/radeon_fbdev.c | 11 ++++++----- drivers/gpu/drm/radeon/radeon_mode.h | 2 ++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c index b4bf5dfeea2d..4dc77c398617 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c @@ -1297,12 +1297,13 @@ static const struct drm_framebuffer_funcs radeon_fb_funcs = { int radeon_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object *obj) { int ret; fb->obj[0] = obj; - drm_helper_mode_fill_fb_struct(dev, fb, NULL, mode_cmd); + drm_helper_mode_fill_fb_struct(dev, fb, info, mode_cmd); ret = drm_framebuffer_init(dev, fb, &radeon_fb_funcs); if (ret) { fb->obj[0] = NULL; @@ -1341,7 +1342,7 @@ radeon_user_framebuffer_create(struct drm_device *dev, return ERR_PTR(-ENOMEM); } - ret = radeon_framebuffer_init(dev, fb, mode_cmd, obj); + ret = radeon_framebuffer_init(dev, fb, info, mode_cmd, obj); if (ret) { kfree(fb); drm_gem_object_put(obj); diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c b/drivers/gpu/drm/radeon/radeon_fbdev.c index e3a481bbee7b..dc81b0c2dbff 100644 --- a/drivers/gpu/drm/radeon/radeon_fbdev.c +++ b/drivers/gpu/drm/radeon/radeon_fbdev.c @@ -53,10 +53,10 @@ static void radeon_fbdev_destroy_pinned_object(struct drm_gem_object *gobj) } static int radeon_fbdev_create_pinned_object(struct drm_fb_helper *fb_helper, + const struct drm_format_info *info, struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object **gobj_p) { - const struct drm_format_info *info; struct radeon_device *rdev = fb_helper->dev->dev_private; struct drm_gem_object *gobj = NULL; struct radeon_bo *rbo = NULL; @@ -67,8 +67,6 @@ static int radeon_fbdev_create_pinned_object(struct drm_fb_helper *fb_helper, int height = mode_cmd->height; u32 cpp; - info = drm_get_format_info(rdev_to_drm(rdev), mode_cmd->pixel_format, - mode_cmd->modifier[0]); cpp = info->cpp[0]; /* need to align pitch with crtc limits */ @@ -206,6 +204,7 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper, struct drm_fb_helper_surface_size *sizes) { struct radeon_device *rdev = fb_helper->dev->dev_private; + const struct drm_format_info *format_info; struct drm_mode_fb_cmd2 mode_cmd = { }; struct fb_info *info; struct drm_gem_object *gobj; @@ -224,7 +223,9 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper, mode_cmd.pixel_format = drm_mode_legacy_fb_format(sizes->surface_bpp, sizes->surface_depth); - ret = radeon_fbdev_create_pinned_object(fb_helper, &mode_cmd, &gobj); + format_info = drm_get_format_info(rdev_to_drm(rdev), mode_cmd.pixel_format, + mode_cmd.modifier[0]); + ret = radeon_fbdev_create_pinned_object(fb_helper, format_info, &mode_cmd, &gobj); if (ret) { DRM_ERROR("failed to create fbcon object %d\n", ret); return ret; @@ -236,7 +237,7 @@ int radeon_fbdev_driver_fbdev_probe(struct drm_fb_helper *fb_helper, ret = -ENOMEM; goto err_radeon_fbdev_destroy_pinned_object; } - ret = radeon_framebuffer_init(rdev_to_drm(rdev), fb, &mode_cmd, gobj); + ret = radeon_framebuffer_init(rdev_to_drm(rdev), fb, format_info, &mode_cmd, gobj); if (ret) { DRM_ERROR("failed to initialize framebuffer %d\n", ret); goto err_kfree; diff --git a/drivers/gpu/drm/radeon/radeon_mode.h b/drivers/gpu/drm/radeon/radeon_mode.h index 3102f6c2d055..9e34da2cacef 100644 --- a/drivers/gpu/drm/radeon/radeon_mode.h +++ b/drivers/gpu/drm/radeon/radeon_mode.h @@ -40,6 +40,7 @@ struct drm_fb_helper; struct drm_fb_helper_surface_size; +struct drm_format_info; struct edid; struct drm_edid; @@ -890,6 +891,7 @@ extern void radeon_combios_encoder_dpms_scratch_regs(struct drm_encoder *encoder, bool on); int radeon_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *rfb, + const struct drm_format_info *info, const struct drm_mode_fb_cmd2 *mode_cmd, struct drm_gem_object *obj); From 0af1561b2d60bab2a2b00720a5c7b292ecc549ec Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 8 Aug 2025 12:20:17 -0300 Subject: [PATCH 018/246] smb: client: fix race with concurrent opens in unlink(2) According to some logs reported by customers, CIFS client might end up reporting unlinked files as existing in stat(2) due to concurrent opens racing with unlink(2). Besides sending the removal request to the server, the unlink process could involve closing any deferred close as well as marking all existing open handles as deleted to prevent them from deferring closes, which increases the race window for potential concurrent opens. Fix this by unhashing the dentry in cifs_unlink() to prevent any subsequent opens. Any open attempts, while we're still unlinking, will block on parent's i_rwsem. Reported-by: Jay Shin Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: Al Viro Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f..cf9060f0fc08 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1943,15 +1943,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *tcon; + __u32 dosattr = 0, origattr = 0; struct TCP_Server_Info *server; struct iattr *attrs = NULL; - __u32 dosattr = 0, origattr = 0; + bool rehash = false; cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + /* Unhash dentry in advance to prevent any concurrent opens */ + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + __d_drop(dentry); + rehash = true; + } + spin_unlock(&dentry->d_lock); + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -2003,7 +2012,8 @@ psx_del_no_retry: cifs_drop_nlink(inode); } } else if (rc == -ENOENT) { - d_drop(dentry); + if (simple_positive(dentry)) + d_delete(dentry); } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, @@ -2056,6 +2066,8 @@ unlink_out: kfree(attrs); free_xid(xid); cifs_put_tlink(tlink); + if (rehash) + d_rehash(dentry); return rc; } From d84291fc7453df7881a970716f8256273aca5747 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 8 Aug 2025 11:43:29 -0300 Subject: [PATCH 019/246] smb: client: fix race with concurrent opens in rename(2) Besides sending the rename request to the server, the rename process also involves closing any deferred close, waiting for outstanding I/O to complete as well as marking all existing open handles as deleted to prevent them from deferring closes, which increases the race window for potential concurrent opens on the target file. Fix this by unhashing the dentry in advance to prevent any concurrent opens on the target. Signed-off-by: Paulo Alcantara (Red Hat) Reviewed-by: David Howells Cc: Al Viro Cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cf9060f0fc08..fe453a4b3dc8 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2474,6 +2474,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; + bool rehash = false; unsigned int xid; int rc, tmprc; int retry_count = 0; @@ -2489,6 +2490,17 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + /* + * Prevent any concurrent opens on the target by unhashing the dentry. + * VFS already unhashes the target when renaming directories. + */ + if (d_is_positive(target_dentry) && !d_is_dir(target_dentry)) { + if (!d_unhashed(target_dentry)) { + d_drop(target_dentry); + rehash = true; + } + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -2530,6 +2542,8 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir, } } + if (!rc) + rehash = false; /* * No-replace is the natural behavior for CIFS, so skip unlink hacks. */ @@ -2588,12 +2602,16 @@ unlink_target: goto cifs_rename_exit; rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + if (!rc) + rehash = false; } /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; cifs_rename_exit: + if (rehash) + d_rehash(target_dentry); kfree(info_buf_source); free_dentry_path(page2); free_dentry_path(page1); From 0e270f32975fd21874185ba53653630dd40bf560 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 7 Aug 2025 10:03:18 +0800 Subject: [PATCH 020/246] ASoC: fsl_sai: replace regmap_write with regmap_update_bits Use the regmap_write() for software reset in fsl_sai_config_disable would cause the FSL_SAI_CSR_BCE bit to be cleared. Refer to commit 197c53c8ecb34 ("ASoC: fsl_sai: Don't disable bitclock for i.MX8MP") FSL_SAI_CSR_BCE should not be cleared. So need to use regmap_update_bits() instead of regmap_write() for these bit operations. Fixes: dc78f7e59169d ("ASoC: fsl_sai: Force a software reset when starting in consumer mode") Signed-off-by: Shengjiu Wang Link: https://patch.msgid.link/20250807020318.2143219-1-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index c313b654236c..d0367b21f775 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -809,9 +809,9 @@ static void fsl_sai_config_disable(struct fsl_sai *sai, int dir) * are running concurrently. */ /* Software Reset */ - regmap_write(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR); + regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR); /* Clear SR bit to finish the reset */ - regmap_write(sai->regmap, FSL_SAI_xCSR(tx, ofs), 0); + regmap_update_bits(sai->regmap, FSL_SAI_xCSR(tx, ofs), FSL_SAI_CSR_SR, 0); } static int fsl_sai_trigger(struct snd_pcm_substream *substream, int cmd, @@ -930,11 +930,11 @@ static int fsl_sai_dai_probe(struct snd_soc_dai *cpu_dai) unsigned int ofs = sai->soc_data->reg_offset; /* Software Reset for both Tx and Rx */ - regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR); - regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR); + regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR); + regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR); /* Clear SR bit to finish the reset */ - regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), 0); - regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), 0); + regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, 0); + regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, 0); regmap_update_bits(sai->regmap, FSL_SAI_TCR1(ofs), FSL_SAI_CR1_RFW_MASK(sai->soc_data->fifo_depth), @@ -1824,11 +1824,11 @@ static int fsl_sai_runtime_resume(struct device *dev) regcache_cache_only(sai->regmap, false); regcache_mark_dirty(sai->regmap); - regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR); - regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR); + regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR); + regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, FSL_SAI_CSR_SR); usleep_range(1000, 2000); - regmap_write(sai->regmap, FSL_SAI_TCSR(ofs), 0); - regmap_write(sai->regmap, FSL_SAI_RCSR(ofs), 0); + regmap_update_bits(sai->regmap, FSL_SAI_TCSR(ofs), FSL_SAI_CSR_SR, 0); + regmap_update_bits(sai->regmap, FSL_SAI_RCSR(ofs), FSL_SAI_CSR_SR, 0); ret = regcache_sync(sai->regmap); if (ret) From 43e0da37d5cfb23eec6aeee9422f84d86621ce2b Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Wed, 6 Aug 2025 15:00:30 +0100 Subject: [PATCH 021/246] ASoC: codecs: tx-macro: correct tx_macro_component_drv name We already have a component driver named "RX-MACRO", which is lpass-rx-macro.c. The tx macro component driver's name should be "TX-MACRO" accordingly. Fix it. Cc: Srinivas Kandagatla Signed-off-by: Alexey Klimov Reviewed-by: Neil Armstrong Link: https://patch.msgid.link/20250806140030.691477-1-alexey.klimov@linaro.org Signed-off-by: Mark Brown --- sound/soc/codecs/lpass-tx-macro.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/lpass-tx-macro.c b/sound/soc/codecs/lpass-tx-macro.c index 40d79bee4584..1da34cb3505f 100644 --- a/sound/soc/codecs/lpass-tx-macro.c +++ b/sound/soc/codecs/lpass-tx-macro.c @@ -2229,7 +2229,7 @@ static int tx_macro_register_mclk_output(struct tx_macro *tx) } static const struct snd_soc_component_driver tx_macro_component_drv = { - .name = "RX-MACRO", + .name = "TX-MACRO", .probe = tx_macro_component_probe, .controls = tx_macro_snd_controls, .num_controls = ARRAY_SIZE(tx_macro_snd_controls), From 7cdadac0d2b3614d04651be7104a89a1998efec0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Aug 2025 11:53:24 +0100 Subject: [PATCH 022/246] ASoC: codec: sma1307: replace spelling mistake with new error message There is a spelling mistake in a failure message, replace the message with something a little more meaningful. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250808105324.829883-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/sma1307.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/sma1307.c b/sound/soc/codecs/sma1307.c index b3d401ada176..6a601e7134ea 100644 --- a/sound/soc/codecs/sma1307.c +++ b/sound/soc/codecs/sma1307.c @@ -1749,7 +1749,7 @@ static void sma1307_setting_loaded(struct sma1307_priv *sma1307, const char *fil sma1307->set.header_size * sizeof(int)); if ((sma1307->set.checksum >> 8) != SMA1307_SETTING_CHECKSUM) { - dev_err(sma1307->dev, "%s: failed by dismatch \"%s\"\n", + dev_err(sma1307->dev, "%s: checksum failed \"%s\"\n", __func__, setting_file); sma1307->set.status = false; return; From f13ab498726bb6c636d6c5cd8c7df911444316dc Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 6 Aug 2025 23:34:52 +0000 Subject: [PATCH 023/246] ASoC: generic: tidyup standardized ASoC menu for generic commit acc84d15e45393fb ("ASoC: generic: Standardize ASoC menu") standardized ASoC generic menu. Then, it moved generic menu position under SoC group. It should be kept generic position. Tidyup it. Suggested-by: Geert Uytterhoeven Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87v7n0c9d0.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- sound/soc/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/Kconfig b/sound/soc/Kconfig index bf362bfca456..ce74818bd715 100644 --- a/sound/soc/Kconfig +++ b/sound/soc/Kconfig @@ -111,7 +111,6 @@ source "sound/soc/bcm/Kconfig" source "sound/soc/cirrus/Kconfig" source "sound/soc/dwc/Kconfig" source "sound/soc/fsl/Kconfig" -source "sound/soc/generic/Kconfig" source "sound/soc/google/Kconfig" source "sound/soc/hisilicon/Kconfig" source "sound/soc/jz4740/Kconfig" @@ -149,5 +148,8 @@ source "sound/soc/codecs/Kconfig" source "sound/soc/sdw_utils/Kconfig" +# generic frame-work +source "sound/soc/generic/Kconfig" + endif # SND_SOC From 633e391d45bda3fc848d26bee6bbe57ef2935713 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Fri, 8 Aug 2025 13:57:06 +0800 Subject: [PATCH 024/246] ASoC: rt721: fix FU33 Boost Volume control not working This patch fixed FU33 Boost Volume control not working. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20250808055706.1110766-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt721-sdca.c | 2 ++ sound/soc/codecs/rt721-sdca.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/sound/soc/codecs/rt721-sdca.c b/sound/soc/codecs/rt721-sdca.c index f6f7c2ffde1c..a4bd29d7220b 100644 --- a/sound/soc/codecs/rt721-sdca.c +++ b/sound/soc/codecs/rt721-sdca.c @@ -278,6 +278,8 @@ static void rt721_sdca_jack_preset(struct rt721_sdca_priv *rt721) RT721_ENT_FLOAT_CTL1, 0x4040); rt_sdca_index_write(rt721->mbq_regmap, RT721_HDA_SDCA_FLOAT, RT721_ENT_FLOAT_CTL4, 0x1201); + rt_sdca_index_write(rt721->mbq_regmap, RT721_BOOST_CTRL, + RT721_BST_4CH_TOP_GATING_CTRL1, 0x002a); regmap_write(rt721->regmap, 0x2f58, 0x07); } diff --git a/sound/soc/codecs/rt721-sdca.h b/sound/soc/codecs/rt721-sdca.h index 0a82c107b19a..71fac9cd8739 100644 --- a/sound/soc/codecs/rt721-sdca.h +++ b/sound/soc/codecs/rt721-sdca.h @@ -56,6 +56,7 @@ struct rt721_sdca_dmic_kctrl_priv { #define RT721_CBJ_CTRL 0x0a #define RT721_CAP_PORT_CTRL 0x0c #define RT721_CLASD_AMP_CTRL 0x0d +#define RT721_BOOST_CTRL 0x0f #define RT721_VENDOR_REG 0x20 #define RT721_RC_CALIB_CTRL 0x40 #define RT721_VENDOR_EQ_L 0x53 @@ -93,6 +94,9 @@ struct rt721_sdca_dmic_kctrl_priv { /* Index (NID:0dh) */ #define RT721_CLASD_AMP_2CH_CAL 0x14 +/* Index (NID:0fh) */ +#define RT721_BST_4CH_TOP_GATING_CTRL1 0x05 + /* Index (NID:20h) */ #define RT721_JD_PRODUCT_NUM 0x00 #define RT721_ANALOG_BIAS_CTL3 0x04 From f48d7a1b0bf11d16d8c9f77a5b9c80a82272f625 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Thu, 7 Aug 2025 17:24:32 +0800 Subject: [PATCH 025/246] ASoC: rt1320: fix random cycle mute issue This patch fixed the random cycle mute issue that occurs during long-time playback. Signed-off-by: Shuming Fan Link: https://patch.msgid.link/20250807092432.997989-1-shumingf@realtek.com Signed-off-by: Mark Brown --- sound/soc/codecs/rt1320-sdw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/rt1320-sdw.c b/sound/soc/codecs/rt1320-sdw.c index b13d7a99bf63..dcddc28e8856 100644 --- a/sound/soc/codecs/rt1320-sdw.c +++ b/sound/soc/codecs/rt1320-sdw.c @@ -109,6 +109,7 @@ static const struct reg_sequence rt1320_blind_write[] = { { 0x0000d540, 0x01 }, { 0xd172, 0x2a }, { 0xc5d6, 0x01 }, + { 0xd478, 0xff }, }; static const struct reg_sequence rt1320_vc_blind_write[] = { @@ -159,7 +160,7 @@ static const struct reg_sequence rt1320_vc_blind_write[] = { { 0xd471, 0x3a }, { 0xd474, 0x11 }, { 0xd475, 0x32 }, - { 0xd478, 0x64 }, + { 0xd478, 0xff }, { 0xd479, 0x20 }, { 0xd47a, 0x10 }, { 0xd47c, 0xff }, From b11f2a9745401d9ccc51c91b5482044d2ea936e8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 8 Aug 2025 11:49:43 +0100 Subject: [PATCH 026/246] ASoC: tas2781: Fix spelling mistake "dismatch" -> "mismatch" There is a spelling mistake (or neologism of dis and match) in a dev_err message. Fix it. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250808104943.829668-1-colin.i.king@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index 9f4d965a1335..8e7e45c046b8 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -1480,7 +1480,7 @@ static ssize_t acoustic_ctl_write(struct file *file, return PTR_ERR(src); if (src[0] > max_pkg_len && src[0] != count) { - dev_err(priv->dev, "pkg(%u), max(%u), count(%u) dismatch.\n", + dev_err(priv->dev, "pkg(%u), max(%u), count(%u) mismatch.\n", src[0], max_pkg_len, (unsigned int)count); ret = 0; goto exit; From 0db77eccd964b11ab2b757031d1354fcc5a025ea Mon Sep 17 00:00:00 2001 From: Christopher Eby Date: Sat, 9 Aug 2025 20:00:06 -0700 Subject: [PATCH 027/246] ALSA: hda/realtek: Add Framework Laptop 13 (AMD Ryzen AI 300) to quirks Framework Laptop 13 (AMD Ryzen AI 300) requires the same quirk for headset detection as other Framework 13 models. Signed-off-by: Christopher Eby Cc: Link: https://patch.msgid.link/20250810030006.9060-1-kreed@kreed.org Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index e27a36e4e92a..337e33a59de8 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7158,6 +7158,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0xf111, 0x0001, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0006, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x0009, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0xf111, 0x000b, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0xf111, 0x000c, "Framework Laptop", ALC295_FIXUP_FRAMEWORK_LAPTOP_MIC_NO_PRESENCE), #if 0 From b6bcbce3359619d05bf387d4f5cc3af63668dbaa Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Thu, 31 Jul 2025 13:18:32 +0100 Subject: [PATCH 028/246] soc/tegra: pmc: Ensure power-domains are in a known state After commit 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") was applied, the Tegra210 Jetson TX1 board failed to boot. Looking into this issue, before this commit was applied, if any of the Tegra power-domains were in 'on' state when the kernel booted, they were being turned off by the genpd core before any driver had chance to request them. This was purely by luck and a consequence of the power-domains being turned off earlier during boot. After this commit was applied, any power-domains in the 'on' state are kept on for longer during boot and therefore, may never transitioned to the off state before they are requested/used. The hang on the Tegra210 Jetson TX1 is caused because devices in some power-domains are accessed without the power-domain being turned off and on, indicating that the power-domain is not in a completely on state. >From reviewing the Tegra PMC driver code, if a power-domain is in the 'on' state there is no guarantee that all the necessary clocks associated with the power-domain are on and even if they are they would not have been requested via the clock framework and so could be turned off later. Some power-domains also have a 'clamping' register that needs to be configured as well. In short, if a power-domain is already 'on' it is difficult to know if it has been configured correctly. Given that the power-domains happened to be switched off during boot previously, to ensure that they are in a good known state on boot, fix this by switching off any power-domains that are on initially when registering the power-domains with the genpd framework. Note that commit 05cfb988a4d0 ("soc/tegra: pmc: Initialise resets associated with a power partition") updated the tegra_powergate_of_get_resets() function to pass the 'off' to ensure that the resets for the power-domain are in the correct state on boot. However, now that we may power off a domain on boot, if it is on, it is better to move this logic into the tegra_powergate_add() function so that there is a single place where we are handling the initial state of the power-domain. Fixes: a38045121bf4 ("soc/tegra: pmc: Add generic PM domain support") Signed-off-by: Jon Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250731121832.213671-1-jonathanh@nvidia.com Signed-off-by: Ulf Hansson --- drivers/soc/tegra/pmc.c | 51 +++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/drivers/soc/tegra/pmc.c b/drivers/soc/tegra/pmc.c index 2a5f24ee858c..034a2a535a1e 100644 --- a/drivers/soc/tegra/pmc.c +++ b/drivers/soc/tegra/pmc.c @@ -1232,7 +1232,7 @@ err: } static int tegra_powergate_of_get_resets(struct tegra_powergate *pg, - struct device_node *np, bool off) + struct device_node *np) { struct device *dev = pg->pmc->dev; int err; @@ -1247,22 +1247,6 @@ static int tegra_powergate_of_get_resets(struct tegra_powergate *pg, err = reset_control_acquire(pg->reset); if (err < 0) { pr_err("failed to acquire resets: %d\n", err); - goto out; - } - - if (off) { - err = reset_control_assert(pg->reset); - } else { - err = reset_control_deassert(pg->reset); - if (err < 0) - goto out; - - reset_control_release(pg->reset); - } - -out: - if (err) { - reset_control_release(pg->reset); reset_control_put(pg->reset); } @@ -1308,20 +1292,43 @@ static int tegra_powergate_add(struct tegra_pmc *pmc, struct device_node *np) goto set_available; } - err = tegra_powergate_of_get_resets(pg, np, off); + err = tegra_powergate_of_get_resets(pg, np); if (err < 0) { dev_err(dev, "failed to get resets for %pOFn: %d\n", np, err); goto remove_clks; } - if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) { - if (off) - WARN_ON(tegra_powergate_power_up(pg, true)); + /* + * If the power-domain is off, then ensure the resets are asserted. + * If the power-domain is on, then power down to ensure that when is + * it turned on the power-domain, clocks and resets are all in the + * expected state. + */ + if (off) { + err = reset_control_assert(pg->reset); + if (err) { + pr_err("failed to assert resets: %d\n", err); + goto remove_resets; + } + } else { + err = tegra_powergate_power_down(pg); + if (err) { + dev_err(dev, "failed to turn off PM domain %s: %d\n", + pg->genpd.name, err); + goto remove_resets; + } + } + /* + * If PM_GENERIC_DOMAINS is not enabled, power-on + * the domain and skip the genpd registration. + */ + if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) { + WARN_ON(tegra_powergate_power_up(pg, true)); goto remove_resets; } - err = pm_genpd_init(&pg->genpd, NULL, off); + err = pm_genpd_init(&pg->genpd, NULL, true); if (err < 0) { dev_err(dev, "failed to initialise PM domain %pOFn: %d\n", np, err); From 647b3d59c768d7638dd17c78c8044178364383ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 31 Jul 2025 07:19:41 -0700 Subject: [PATCH 029/246] xfs: fix frozen file system assert in xfs_trans_alloc Commit 83a80e95e797 ("xfs: decouple xfs_trans_alloc_empty from xfs_trans_alloc") move the place of the assert for a frozen file system after the sb_start_intwrite call that ensures it doesn't run on frozen file systems, and thus allows to incorrect trigger it. Fix that by moving it back to where it belongs. Fixes: 83a80e95e797 ("xfs: decouple xfs_trans_alloc_empty from xfs_trans_alloc") Reported-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trans.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ece374d622b3..575e7028f423 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -253,8 +253,8 @@ xfs_trans_alloc( * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ retry: - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); tp = __xfs_trans_alloc(mp, flags); + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { xfs_trans_cancel(tp); From d2845519b0723c5d5a0266cbf410495f9b8fd65c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jul 2025 14:19:44 +0200 Subject: [PATCH 030/246] xfs: fully decouple XFS_IBULK* flags from XFS_IWALK* flags Fix up xfs_inumbers to now pass in the XFS_IBULK* flags into the flags argument to xfs_inobt_walk, which expects the XFS_IWALK* flags. Currently passing the wrong flags works for non-debug builds because the only XFS_IWALK* flag has the same encoding as the corresponding XFS_IBULK* flag, but in debug builds it can trigger an assert that no incorrect flag is passed. Instead just extra the relevant flag. Fixes: 5b35d922c52798 ("xfs: Decouple XFS_IBULK flags from XFS_IWALK flags") Cc: # v5.19 Reported-by: cen zhang Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_itable.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c8c9b8d8309f..5116842420b2 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -447,17 +447,21 @@ xfs_inumbers( .breq = breq, }; struct xfs_trans *tp; + unsigned int iwalk_flags = 0; int error = 0; if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; + if (breq->flags & XFS_IBULK_SAME_AG) + iwalk_flags |= XFS_IWALK_SAME_AG; + /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); - error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, + error = xfs_inobt_walk(breq->mp, tp, breq->startino, iwalk_flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); From 82efde9cf2e4ce25eac96a20e36eae7c338df1e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jul 2025 14:19:45 +0200 Subject: [PATCH 031/246] xfs: remove XFS_IBULK_SAME_AG Add a new field to struct xfs_ibulk to directly pass XFS_IWALK* flags, and thus remove the need to indirect the SAME_AG flag through XFS_IBULK*. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_itable.c | 12 ++---------- fs/xfs/xfs_itable.h | 10 ++++------ 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index fe1f74a3b6a3..e1051a530a50 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -219,7 +219,7 @@ xfs_bulk_ireq_setup( else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno) return -EINVAL; - breq->flags |= XFS_IBULK_SAME_AG; + breq->iwalk_flags |= XFS_IWALK_SAME_AG; /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 5116842420b2..2aa37a4d2706 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -307,7 +307,6 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; - unsigned int iwalk_flags = 0; int error; if (breq->idmap != &nop_mnt_idmap) { @@ -328,10 +327,7 @@ xfs_bulkstat( * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); - if (breq->flags & XFS_IBULK_SAME_AG) - iwalk_flags |= XFS_IWALK_SAME_AG; - - error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, + error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); kfree(bc.buf); @@ -447,21 +443,17 @@ xfs_inumbers( .breq = breq, }; struct xfs_trans *tp; - unsigned int iwalk_flags = 0; int error = 0; if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; - if (breq->flags & XFS_IBULK_SAME_AG) - iwalk_flags |= XFS_IWALK_SAME_AG; - /* * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ tp = xfs_trans_alloc_empty(breq->mp); - error = xfs_inobt_walk(breq->mp, tp, breq->startino, iwalk_flags, + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index f10e8f8f2335..2d0612f14d6e 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -13,17 +13,15 @@ struct xfs_ibulk { xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ unsigned int ocount; /* number of records returned */ - unsigned int flags; /* see XFS_IBULK_FLAG_* */ + unsigned int flags; /* XFS_IBULK_FLAG_* */ + unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */ }; -/* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (1U << 0) - /* Fill out the bs_extents64 field if set. */ -#define XFS_IBULK_NREXT64 (1U << 1) +#define XFS_IBULK_NREXT64 (1U << 0) /* Signal that we can return metadata directories. */ -#define XFS_IBULK_METADIR (1U << 2) +#define XFS_IBULK_METADIR (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the From e7fb9b71326f43bab25fb8f18c6bfebd7a628696 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 24 Jul 2025 08:12:13 +0000 Subject: [PATCH 032/246] fs/dax: Reject IOCB_ATOMIC in dax_iomap_rw() The DAX write path does not support IOCB_ATOMIC, so reject it when set. Suggested-by: Darrick J. Wong Signed-off-by: John Garry Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/dax.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 4229513806be..20ecf652c129 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1743,6 +1743,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t done = 0; int ret; + if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) + return -EIO; + if (!iomi.len) return 0; From 68456d05eb57a5d16b4be2d3caf421bdcf2de72e Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 24 Jul 2025 08:12:14 +0000 Subject: [PATCH 033/246] xfs: disallow atomic writes on DAX Atomic writes are not currently supported for DAX, but two problems exist: - we may go down DAX write path for IOCB_ATOMIC, which does not handle IOCB_ATOMIC properly - we report non-zero atomic write limits in statx (for DAX inodes) We may want atomic writes support on DAX in future, but just disallow for now. For this, ensure when IOCB_ATOMIC is set that we check the write size versus the atomic write min and max before branching off to the DAX write path. This is not strictly required for DAX, as we should not get this far in the write path as FMODE_CAN_ATOMIC_WRITE should not be set. In addition, due to reflink being supported for DAX, we automatically get CoW-based atomic writes support being advertised. Remedy this by disallowing atomic writes for a DAX inode for both sw and hw modes. Reported-by: Darrick J. Wong Fixes: 9dffc58f2384 ("xfs: update atomic write limits") Reviewed-by: Darrick J. Wong Signed-off-by: John Garry Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_file.c | 6 +++--- fs/xfs/xfs_inode.h | 11 +++++++++++ fs/xfs/xfs_iops.c | 5 +++-- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 55a304cb3aef..f96fbf5c54c9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1101,9 +1101,6 @@ xfs_file_write_iter( if (xfs_is_shutdown(ip->i_mount)) return -EIO; - if (IS_DAX(inode)) - return xfs_file_dax_write(iocb, from); - if (iocb->ki_flags & IOCB_ATOMIC) { if (ocount < xfs_get_atomic_write_min(ip)) return -EINVAL; @@ -1116,6 +1113,9 @@ xfs_file_write_iter( return ret; } + if (IS_DAX(inode)) + return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 07fbdcc4cbf5..bd6d33557194 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -358,9 +358,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { + if (IS_DAX(VFS_IC(ip))) + return false; + return xfs_inode_buftarg(ip)->bt_awu_max > 0; } +static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip) +{ + if (IS_DAX(VFS_IC(ip))) + return false; + + return xfs_can_sw_atomic_write(ip->i_mount); +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 149b5460fbfd..603effabe1ee 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -616,7 +616,8 @@ xfs_get_atomic_write_min( * write of exactly one single fsblock if the bdev will make that * guarantee for us. */ - if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + if (xfs_inode_can_hw_atomic_write(ip) || + xfs_inode_can_sw_atomic_write(ip)) return mp->m_sb.sb_blocksize; return 0; @@ -633,7 +634,7 @@ xfs_get_atomic_write_max( * write of exactly one single fsblock if the bdev will make that * guarantee for us. */ - if (!xfs_can_sw_atomic_write(mp)) { + if (!xfs_inode_can_sw_atomic_write(ip)) { if (xfs_inode_can_hw_atomic_write(ip)) return mp->m_sb.sb_blocksize; return 0; From 8dc5e9b037138317c1d3151a7dabe41fa171cee1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 24 Jul 2025 08:12:15 +0000 Subject: [PATCH 034/246] xfs: reject max_atomic_write mount option for no reflink If the FS has no reflink, then atomic writes greater than 1x block are not supported. As such, for no reflink it is pointless to accept setting max_atomic_write when it cannot be supported, so reject max_atomic_write mount option in this case. It could be still possible to accept max_atomic_write option of size 1x block if HW atomics are supported, so check for this specifically. Fixes: 4528b9052731 ("xfs: allow sysadmins to specify a maximum atomic write limit at mount time") Signed-off-by: John Garry Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_mount.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2133fbaf1766..dc32c5e34d81 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -779,6 +779,25 @@ xfs_set_max_atomic_write_opt( return -EINVAL; } + if (xfs_has_reflink(mp)) + goto set_limit; + + if (new_max_fsbs == 1) { + if (mp->m_ddev_targp->bt_awu_max || + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) { + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink or HW support", + new_max_bytes >> 10); + return -EINVAL; + } + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink support", + new_max_bytes >> 10); + return -EINVAL; + } + set_limit: error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); if (error) { From 5d94b19f066480addfcdcb5efde66152ad5a7c0e Mon Sep 17 00:00:00 2001 From: Andrey Albershteyn Date: Thu, 31 Jul 2025 19:07:22 +0200 Subject: [PATCH 035/246] xfs: fix scrub trace with null pointer in quotacheck The quotacheck doesn't initialize sc->ip. Cc: stable@vger.kernel.org # v6.8 Fixes: 21d7500929c8a0 ("xfs: improve dquot iteration for scrub") Reviewed-by: Darrick J. Wong Signed-off-by: Andrey Albershteyn Signed-off-by: Carlos Maiolino --- fs/xfs/scrub/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1e6e9c10cea2..a8187281eb96 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class, __field(xfs_exntst_t, state) ), TP_fast_assign( - __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dev = cursor->sc->mp->m_super->s_dev; __entry->dqtype = cursor->dqtype; __entry->ino = cursor->quota_ip->i_ino; __entry->cur_id = cursor->id; From f76823e3b284aae30797fded988a807eab2da246 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Jul 2025 07:35:44 +0200 Subject: [PATCH 036/246] xfs: split xfs_zone_record_blocks xfs_zone_record_blocks not only records successfully written blocks that now back file data, but is also used for blocks speculatively written by garbage collection that were never linked to an inode and instantly become invalid. Split the latter functionality out to be easier to understand. This also make it clear that we don't need to attach the rmap inode to a transaction for the skipped blocks case as we never dirty any peristent data structure. Also make the argument order to xfs_zone_record_blocks a bit more natural. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_zone_alloc.c | 42 ++++++++++++++++++++++++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156..ac344e42846c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -455,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ xfs_extlen_t len), \ TP_ARGS(oz, rgbno, len)) DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); TRACE_EVENT(xfs_zone_gc_select_victim, diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 33f7eee521a8..f8bd6d741755 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -166,10 +166,9 @@ xfs_open_zone_mark_full( static void xfs_zone_record_blocks( struct xfs_trans *tp, - xfs_fsblock_t fsbno, - xfs_filblks_t len, struct xfs_open_zone *oz, - bool used) + xfs_fsblock_t fsbno, + xfs_filblks_t len) { struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = oz->oz_rtg; @@ -179,18 +178,37 @@ xfs_zone_record_blocks( xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); - if (used) { - rmapip->i_used_blocks += len; - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); - } else { - xfs_add_frextents(mp, len); - } + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); oz->oz_written += len; if (oz->oz_written == rtg_blocks(rtg)) xfs_open_zone_mark_full(oz); xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); } +/* + * Called for blocks that have been written to disk, but not actually linked to + * an inode, which can happen when garbage collection races with user data + * writes to a file. + */ +static void +xfs_zone_skip_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t len) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + + trace_xfs_zone_skip_blocks(oz, 0, len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + + xfs_add_frextents(rtg_mount(rtg), len); +} + static int xfs_zoned_map_extent( struct xfs_trans *tp, @@ -250,8 +268,7 @@ xfs_zoned_map_extent( } } - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - true); + xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); @@ -259,8 +276,7 @@ xfs_zoned_map_extent( skip: trace_xfs_reflink_cow_remap_skip(ip, new); - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - false); + xfs_zone_skip_blocks(oz, new->br_blockcount); return 0; } From d02d2c98d25793902f65803ab853b592c7a96b29 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Mon, 28 Jul 2025 18:07:15 +0800 Subject: [PATCH 037/246] fs: writeback: fix use-after-free in __mark_inode_dirty() An use-after-free issue occurred when __mark_inode_dirty() get the bdi_writeback that was in the progress of switching. CPU: 1 PID: 562 Comm: systemd-random- Not tainted 6.6.56-gb4403bd46a8e #1 ...... pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __mark_inode_dirty+0x124/0x418 lr : __mark_inode_dirty+0x118/0x418 sp : ffffffc08c9dbbc0 ........ Call trace: __mark_inode_dirty+0x124/0x418 generic_update_time+0x4c/0x60 file_modified+0xcc/0xd0 ext4_buffered_write_iter+0x58/0x124 ext4_file_write_iter+0x54/0x704 vfs_write+0x1c0/0x308 ksys_write+0x74/0x10c __arm64_sys_write+0x1c/0x28 invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0xc0/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x40/0xe4 el0t_64_sync_handler+0x120/0x12c el0t_64_sync+0x194/0x198 Root cause is: systemd-random-seed kworker ---------------------------------------------------------------------- ___mark_inode_dirty inode_switch_wbs_work_fn spin_lock(&inode->i_lock); inode_attach_wb locked_inode_to_wb_and_lock_list get inode->i_wb spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock) spin_lock(&inode->i_lock) inode_io_list_move_locked spin_unlock(&wb->list_lock) spin_unlock(&inode->i_lock) spin_lock(&old_wb->list_lock) inode_do_switch_wbs spin_lock(&inode->i_lock) inode->i_wb = new_wb spin_unlock(&inode->i_lock) spin_unlock(&old_wb->list_lock) wb_put_many(old_wb, nr_switched) cgwb_release old wb released wb_wakeup_delayed() accesses wb, then trigger the use-after-free issue Fix this race condition by holding inode spinlock until wb_wakeup_delayed() finished. Signed-off-by: Jiufei Xue Link: https://lore.kernel.org/20250728100715.3863241-1-jiufei.xue@samsung.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..a07b8cf73ae2 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2608,10 +2608,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); - spin_unlock(&wb->list_lock); - spin_unlock(&inode->i_lock); - trace_writeback_dirty_inode_enqueue(inode); - /* * If this is the first dirty inode for this bdi, * we have to wake-up the corresponding bdi thread @@ -2621,6 +2617,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); + + spin_unlock(&wb->list_lock); + spin_unlock(&inode->i_lock); + trace_writeback_dirty_inode_enqueue(inode); + return; } } From 9308366f062129d52e0ee3f7a019f7dd41db33df Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 8 Aug 2025 03:55:05 +1000 Subject: [PATCH 038/246] open_tree_attr: do not allow id-mapping changes without OPEN_TREE_CLONE As described in commit 7a54947e727b ('Merge patch series "fs: allow changing idmappings"'), open_tree_attr(2) was necessary in order to allow for a detached mount to be created and have its idmappings changed without the risk of any racing threads operating on it. For this reason, mount_setattr(2) still does not allow for id-mappings to be changed. However, there was a bug in commit 2462651ffa76 ("fs: allow changing idmappings") which allowed users to bypass this restriction by calling open_tree_attr(2) *without* OPEN_TREE_CLONE. can_idmap_mount() prevented this bug from allowing an attached mountpoint's id-mapping from being modified (thanks to an is_anon_ns() check), but this still allows for detached (but visible) mounts to have their be id-mapping changed. This risks the same UAF and locking issues as described in the merge commit, and was likely unintentional. Fixes: 2462651ffa76 ("fs: allow changing idmappings") Cc: stable@vger.kernel.org # v6.15+ Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250808-open_tree_attr-bugfix-idmap-v1-1-0ec7bc05646c@cyphar.com Signed-off-by: Christian Brauner --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..ceb6b57e6a57 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5176,7 +5176,8 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, int ret; struct mount_kattr kattr = {}; - kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & OPEN_TREE_CLONE) + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; From 81e4b9cf365df4cde30157a85cc9f3d673946118 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Fri, 8 Aug 2025 03:55:06 +1000 Subject: [PATCH 039/246] selftests/mount_setattr: add smoke tests for open_tree_attr(2) bug There appear to be no other open_tree_attr(2) tests at the moment, but as a minimal solution just add some additional checks in the existing MOUNT_ATTR_IDMAP tests to make sure that open_tree_attr(2) cannot be used to bypass the tested restrictions that apply to mount_setattr(2). Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250808-open_tree_attr-bugfix-idmap-v1-2-0ec7bc05646c@cyphar.com Signed-off-by: Christian Brauner --- .../mount_setattr/mount_setattr_test.c | 77 +++++++++++++++---- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index b1e4618399be..a688871a98eb 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -107,6 +107,26 @@ #endif #endif +#ifndef __NR_open_tree_attr + #if defined __alpha__ + #define __NR_open_tree_attr 577 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_open_tree_attr (467 + 4000) + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_open_tree_attr (467 + 6000) + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_open_tree_attr (467 + 5000) + #endif + #elif defined __ia64__ + #define __NR_open_tree_attr (467 + 1024) + #else + #define __NR_open_tree_attr 467 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -121,6 +141,12 @@ static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flag return syscall(__NR_mount_setattr, dfd, path, flags, attr, size); } +static inline int sys_open_tree_attr(int dfd, const char *path, unsigned int flags, + struct mount_attr *attr, size_t size) +{ + return syscall(__NR_open_tree_attr, dfd, path, flags, attr, size); +} + static ssize_t write_nointr(int fd, const void *buf, size_t count) { ssize_t ret; @@ -1222,6 +1248,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_inside_current_mount_namespace) attr.userns_fd = get_userns_fd(0, 10000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1255,6 +1287,12 @@ TEST_F(mount_setattr_idmapped, attached_mount_outside_current_mount_namespace) ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + ASSERT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } @@ -1321,6 +1359,19 @@ TEST_F(mount_setattr_idmapped, detached_mount_outside_current_mount_namespace) ASSERT_EQ(close(open_tree_fd), 0); } +static bool expected_uid_gid(int dfd, const char *path, int flags, + uid_t expected_uid, gid_t expected_gid) +{ + int ret; + struct stat st; + + ret = fstatat(dfd, path, &st, flags); + if (ret < 0) + return false; + + return st.st_uid == expected_uid && st.st_gid == expected_gid; +} + /** * Validate that currently changing the idmapping of an idmapped mount fails. */ @@ -1331,6 +1382,8 @@ TEST_F(mount_setattr_idmapped, change_idmapping) .attr_set = MOUNT_ATTR_IDMAP, }; + ASSERT_TRUE(expected_uid_gid(-EBADF, "/mnt/D", 0, 0, 0)); + if (!mount_setattr_supported()) SKIP(return, "mount_setattr syscall not supported"); @@ -1348,27 +1401,25 @@ TEST_F(mount_setattr_idmapped, change_idmapping) AT_EMPTY_PATH, &attr, sizeof(attr)), 0); ASSERT_EQ(close(attr.userns_fd), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 0, 0)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + /* Change idmapping on a detached mount that is already idmapped. */ attr.userns_fd = get_userns_fd(0, 20000, 10000); ASSERT_GE(attr.userns_fd, 0); ASSERT_NE(sys_mount_setattr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + /* + * Make sure that open_tree_attr() without OPEN_TREE_CLONE is not a way + * to bypass this mount_setattr() restriction. + */ + EXPECT_LT(sys_open_tree_attr(open_tree_fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)), 0); + EXPECT_FALSE(expected_uid_gid(open_tree_fd, ".", 0, 20000, 20000)); + EXPECT_TRUE(expected_uid_gid(open_tree_fd, ".", 0, 10000, 10000)); + ASSERT_EQ(close(attr.userns_fd), 0); ASSERT_EQ(close(open_tree_fd), 0); } -static bool expected_uid_gid(int dfd, const char *path, int flags, - uid_t expected_uid, gid_t expected_gid) -{ - int ret; - struct stat st; - - ret = fstatat(dfd, path, &st, flags); - if (ret < 0) - return false; - - return st.st_uid == expected_uid && st.st_gid == expected_gid; -} - TEST_F(mount_setattr_idmapped, idmap_mount_tree_invalid) { int open_tree_fd = -EBADF; From 6b65028e2b51c023a816eabffea88980fdd5564e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 30 Jul 2025 12:28:41 +0200 Subject: [PATCH 040/246] iomap: Fix broken data integrity guarantees for O_SYNC writes Commit d279c80e0bac ("iomap: inline iomap_dio_bio_opflags()") has broken the logic in iomap_dio_bio_iter() in a way that when the device does support FUA (or has no writeback cache) and the direct IO happens to freshly allocated or unwritten extents, we will *not* issue fsync after completing direct IO O_SYNC / O_DSYNC write because the IOMAP_DIO_WRITE_THROUGH flag stays mistakenly set. Fix the problem by clearing IOMAP_DIO_WRITE_THROUGH whenever we do not perform FUA write as it was originally intended. CC: John Garry CC: Ritesh Harjani (IBM) Fixes: d279c80e0bac ("iomap: inline iomap_dio_bio_opflags()") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/20250730102840.20470-2-jack@suse.cz Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: John Garry Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/direct-io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 6f25d4cfea9f..b84f6af2eb4c 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -363,14 +363,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW) need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { - if (iomap_dio_can_use_fua(iomap, dio)) - bio_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - } + else if (iomap->type == IOMAP_MAPPED && + iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + + if (!(bio_opf & REQ_FUA)) + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; /* * We can only do deferred completion for pure overwrites that From 542ede096e48436dbd70869640c0d88180565933 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 7 Aug 2025 10:50:15 -0700 Subject: [PATCH 041/246] fuse: keep inode->i_blkbits constant With fuse now using iomap for writeback handling, inode blkbits changes are problematic because iomap relies on inode->i_blkbits for its internal bitmap logic. Currently we change inode->i_blkbits in fuse to match the attr->blksize value passed in by the server. This commit keeps inode->i_blkbits constant in fuse. Any attr->blksize values passed in by the server will not update inode->i_blkbits. The client-side behavior for stat is unaffected, stat will still reflect the blocksize passed in by the server. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/20250807175015.515192-1-joannelkoong@gmail.com Fixes: ef7e7cbb32 ("fuse: use iomap for writeback") Signed-off-by: Christian Brauner --- fs/fuse/inode.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab..67c2318bfc42 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -289,11 +289,6 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } - if (attr->blksize != 0) - inode->i_blkbits = ilog2(attr->blksize); - else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; - /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the From 2319f9d0aa644eb9666c7be903078f50ecc2eb5b Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 09:49:57 +0200 Subject: [PATCH 042/246] selftests/coredump: Remove the read() that fails the test Resolve a conflict between commit 6a68d28066b6 ("selftests/coredump: Fix "socket_detect_userspace_client" test failure") and commit 994dc26302ed ("selftests/coredump: fix build") The first commit adds a read() to wait for write() from another thread to finish. But the second commit removes the write(). Now that the two commits are in the same tree, the read() now gets EOF and the test fails. Remove this read() so that the test passes. Signed-off-by: Nam Cao Link: https://lore.kernel.org/20250811074957.4079616-1-namcao@linutronix.de Signed-off-by: Christian Brauner --- tools/testing/selftests/coredump/stackdump_test.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/coredump/stackdump_test.c b/tools/testing/selftests/coredump/stackdump_test.c index 5a5a7a5f7e1d..a4ac80bb1003 100644 --- a/tools/testing/selftests/coredump/stackdump_test.c +++ b/tools/testing/selftests/coredump/stackdump_test.c @@ -446,9 +446,6 @@ TEST_F(coredump, socket_detect_userspace_client) if (info.coredump_mask & PIDFD_COREDUMPED) goto out; - if (read(fd_coredump, &c, 1) < 1) - goto out; - exit_code = EXIT_SUCCESS; out: if (fd_peer_pidfd >= 0) From d5dd409812eca084e68208926bb629c8f708651f Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 5 Jun 2025 12:38:52 +0200 Subject: [PATCH 043/246] drbd: Remove the open-coded page pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the network stack keeps a reference for too long, DRBD keeps references on a higher number of pages as a consequence. Fix all that by no longer relying on page reference counts dropping to an expected value. Instead, DRBD gives up its reference and lets the system handle everything else. While at it, remove the open-coded custom page pool mechanism and use the page_pool included in the kernel. Signed-off-by: Philipp Reisner Signed-off-by: Christoph Böhmwalder Tested-by: Eric Hagberg Link: https://lore.kernel.org/r/20250605103852.23029-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 39 +---- drivers/block/drbd/drbd_main.c | 59 ++----- drivers/block/drbd/drbd_receiver.c | 262 ++++------------------------- drivers/block/drbd/drbd_worker.c | 56 ++---- 4 files changed, 70 insertions(+), 346 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e21492981f7d..f6d6276974ee 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -380,6 +380,9 @@ enum { /* this is/was a write request */ __EE_WRITE, + /* hand back using mempool_free(e, drbd_buffer_page_pool) */ + __EE_RELEASE_TO_MEMPOOL, + /* this is/was a write same request */ __EE_WRITE_SAME, @@ -402,6 +405,7 @@ enum { #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) #define EE_SUBMITTED (1<<__EE_SUBMITTED) #define EE_WRITE (1<<__EE_WRITE) +#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL) #define EE_WRITE_SAME (1<<__EE_WRITE_SAME) #define EE_APPLICATION (1<<__EE_APPLICATION) #define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ) @@ -858,7 +862,6 @@ struct drbd_device { struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ struct list_head done_ee; /* need to send P_WRITE_ACK */ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ - struct list_head net_ee; /* zero-copy network send in progress */ struct list_head resync_reads; atomic_t pp_in_use; /* allocated from page pool */ @@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t drbd_request_mempool; extern mempool_t drbd_ee_mempool; -/* drbd's page pool, used to buffer data received from the peer, - * or data requested by the peer. - * - * This does not have an emergency reserve. - * - * When allocating from this pool, it first takes pages from the pool. - * Only if the pool is depleted will try to allocate from the system. - * - * The assumption is that pages taken from this pool will be processed, - * and given back, "quickly", and then can be recycled, so we can avoid - * frequent calls to alloc_page(), and still will be able to make progress even - * under memory pressure. - */ -extern struct page *drbd_pp_pool; -extern spinlock_t drbd_pp_lock; -extern int drbd_pp_vacant; -extern wait_queue_head_t drbd_pp_wait; - /* We also need a standard (emergency-reserve backed) page pool * for meta data IO (activity log, bitmap). * We can keep it global, as long as it is used as "N pages at a time". @@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait; */ #define DRBD_MIN_POOL_PAGES 128 extern mempool_t drbd_md_io_page_pool; +extern mempool_t drbd_buffer_page_pool; /* We also need to make sure we get a bio * when we need it for housekeeping purposes */ @@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, sector_t, unsigned int, unsigned int, gfp_t) __must_hold(local); -extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, - int); -#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) -#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req); extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); @@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page) for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) -{ - struct page *page = peer_req->pages; - page_chain_for_each(page) { - if (page_count(page) > 1) - return 1; - } - return 0; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 52724b79be30..c73376886e7a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t drbd_request_mempool; mempool_t drbd_ee_mempool; mempool_t drbd_md_io_page_pool; +mempool_t drbd_buffer_page_pool; struct bio_set drbd_md_io_bio_set; struct bio_set drbd_io_bio_set; -/* I do not use a standard mempool, because: - 1) I want to hand out the pre-allocated objects first. - 2) I want to be able to interrupt sleeping allocation with a signal. - Note: This is a single linked list, the next pointer is the private - member of struct page. - */ -struct page *drbd_pp_pool; -DEFINE_SPINLOCK(drbd_pp_lock); -int drbd_pp_vacant; -wait_queue_head_t drbd_pp_wait; - DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); static const struct block_device_operations drbd_ops = { @@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, struct drbd_peer_request *peer_req) { + bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL); struct page *page = peer_req->pages; unsigned len = peer_req->i.size; int err; @@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, page_chain_for_each(page) { unsigned l = min_t(unsigned, len, PAGE_SIZE); - err = _drbd_send_page(peer_device, page, 0, l, - page_chain_next(page) ? MSG_MORE : 0); + if (likely(use_sendpage)) + err = _drbd_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + else + err = _drbd_no_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) return err; len -= l; @@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device) INIT_LIST_HEAD(&device->sync_ee); INIT_LIST_HEAD(&device->done_ee); INIT_LIST_HEAD(&device->read_ee); - INIT_LIST_HEAD(&device->net_ee); INIT_LIST_HEAD(&device->resync_reads); INIT_LIST_HEAD(&device->resync_work.list); INIT_LIST_HEAD(&device->unplug_work.list); @@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device) D_ASSERT(device, list_empty(&device->sync_ee)); D_ASSERT(device, list_empty(&device->done_ee)); D_ASSERT(device, list_empty(&device->read_ee)); - D_ASSERT(device, list_empty(&device->net_ee)); D_ASSERT(device, list_empty(&device->resync_reads)); D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); D_ASSERT(device, list_empty(&device->resync_work.list)); @@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device) static void drbd_destroy_mempools(void) { - struct page *page; - - while (drbd_pp_pool) { - page = drbd_pp_pool; - drbd_pp_pool = (struct page *)page_private(page); - __free_page(page); - drbd_pp_vacant--; - } - /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ bioset_exit(&drbd_io_bio_set); bioset_exit(&drbd_md_io_bio_set); + mempool_exit(&drbd_buffer_page_pool); mempool_exit(&drbd_md_io_page_pool); mempool_exit(&drbd_ee_mempool); mempool_exit(&drbd_request_mempool); @@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void) static int drbd_create_mempools(void) { - struct page *page; const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count; - int i, ret; + int ret; /* caches */ drbd_request_cache = kmem_cache_create( @@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void) if (ret) goto Enomem; + ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0); + if (ret) + goto Enomem; + ret = mempool_init_slab_pool(&drbd_request_mempool, number, drbd_request_cache); if (ret) @@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void) if (ret) goto Enomem; - for (i = 0; i < number; i++) { - page = alloc_page(GFP_HIGHUSER); - if (!page) - goto Enomem; - set_page_private(page, (unsigned long)drbd_pp_pool); - drbd_pp_pool = page; - } - drbd_pp_vacant = number; - return 0; Enomem: @@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device) rr = drbd_free_peer_reqs(device, &device->done_ee); if (rr) drbd_err(device, "%d EEs in done list found!\n", rr); - - rr = drbd_free_peer_reqs(device, &device->net_ee); - if (rr) - drbd_err(device, "%d EEs in net list found!\n", rr); } /* caution. no locking. */ @@ -2863,11 +2839,6 @@ static int __init drbd_init(void) return err; } - /* - * allocate all necessary structs - */ - init_waitqueue_head(&drbd_pp_wait); - drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 975024cf03c5..caaf2781136d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" @@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int); #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) -/* - * some helper functions to deal with single linked page lists, - * page->private being our "next" pointer. - */ - -/* If at least n pages are linked at head, get n pages off. - * Otherwise, don't modify head, and return NULL. - * Locking is the responsibility of the caller. - */ -static struct page *page_chain_del(struct page **head, int n) -{ - struct page *page; - struct page *tmp; - - BUG_ON(!n); - BUG_ON(!head); - - page = *head; - - if (!page) - return NULL; - - while (page) { - tmp = page_chain_next(page); - if (--n == 0) - break; /* found sufficient pages */ - if (tmp == NULL) - /* insufficient pages, don't use any of them. */ - return NULL; - page = tmp; - } - - /* add end of list marker for the returned list */ - set_page_private(page, 0); - /* actual return value, and adjustment of head */ - page = *head; - *head = tmp; - return page; -} - -/* may be used outside of locks to find the tail of a (usually short) - * "private" page chain, before adding it back to a global chain head - * with page_chain_add() under a spinlock. */ -static struct page *page_chain_tail(struct page *page, int *len) -{ - struct page *tmp; - int i = 1; - while ((tmp = page_chain_next(page))) { - ++i; - page = tmp; - } - if (len) - *len = i; - return page; -} - -static int page_chain_free(struct page *page) -{ - struct page *tmp; - int i = 0; - page_chain_for_each_safe(page, tmp) { - put_page(page); - ++i; - } - return i; -} - -static void page_chain_add(struct page **head, - struct page *chain_first, struct page *chain_last) -{ -#if 1 - struct page *tmp; - tmp = page_chain_tail(chain_first, NULL); - BUG_ON(tmp != chain_last); -#endif - - /* add chain to head */ - set_page_private(chain_last, (unsigned long)*head); - *head = chain_first; -} - -static struct page *__drbd_alloc_pages(struct drbd_device *device, - unsigned int number) +static struct page *__drbd_alloc_pages(unsigned int number) { struct page *page = NULL; struct page *tmp = NULL; unsigned int i = 0; - /* Yes, testing drbd_pp_vacant outside the lock is racy. - * So what. It saves a spin_lock. */ - if (drbd_pp_vacant >= number) { - spin_lock(&drbd_pp_lock); - page = page_chain_del(&drbd_pp_pool, number); - if (page) - drbd_pp_vacant -= number; - spin_unlock(&drbd_pp_lock); - if (page) - return page; - } - /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ for (i = 0; i < number; i++) { - tmp = alloc_page(GFP_TRY); + tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY); if (!tmp) - break; + goto fail; set_page_private(tmp, (unsigned long)page); page = tmp; } - - if (i == number) - return page; - - /* Not enough pages immediately available this time. - * No need to jump around here, drbd_alloc_pages will retry this - * function "soon". */ - if (page) { - tmp = page_chain_tail(page, NULL); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); + return page; +fail: + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + mempool_free(page, &drbd_buffer_page_pool); } return NULL; } -static void reclaim_finished_net_peer_reqs(struct drbd_device *device, - struct list_head *to_be_freed) -{ - struct drbd_peer_request *peer_req, *tmp; - - /* The EEs are always appended to the end of the list. Since - they are sent in order over the wire, they have to finish - in order. As soon as we see the first not finished we can - stop to examine the list... */ - - list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { - if (drbd_peer_req_has_active_page(peer_req)) - break; - list_move(&peer_req->w.list, to_be_freed); - } -} - -static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) -{ - LIST_HEAD(reclaimed); - struct drbd_peer_request *peer_req, *t; - - spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); - spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); -} - -static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) -{ - struct drbd_peer_device *peer_device; - int vnr; - - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - if (!atomic_read(&device->pp_in_use_by_net)) - continue; - - kref_get(&device->kref); - rcu_read_unlock(); - drbd_reclaim_net_peer_reqs(device); - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - rcu_read_unlock(); -} - /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @peer_device: DRBD device. @@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int bool retry) { struct drbd_device *device = peer_device->device; - struct page *page = NULL; + struct page *page; struct net_conf *nc; - DEFINE_WAIT(wait); unsigned int mxb; rcu_read_lock(); @@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int mxb = nc ? nc->max_buffers : 1000000; rcu_read_unlock(); - if (atomic_read(&device->pp_in_use) < mxb) - page = __drbd_alloc_pages(device, number); - - /* Try to keep the fast path fast, but occasionally we need - * to reclaim the pages we lended to the network stack. */ - if (page && atomic_read(&device->pp_in_use_by_net) > 512) - drbd_reclaim_net_peer_reqs(device); - - while (page == NULL) { - prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - - drbd_reclaim_net_peer_reqs(device); - - if (atomic_read(&device->pp_in_use) < mxb) { - page = __drbd_alloc_pages(device, number); - if (page) - break; - } - - if (!retry) - break; - - if (signal_pending(current)) { - drbd_warn(device, "drbd_alloc_pages interrupted!\n"); - break; - } - - if (schedule_timeout(HZ/10) == 0) - mxb = UINT_MAX; - } - finish_wait(&drbd_pp_wait, &wait); + if (atomic_read(&device->pp_in_use) >= mxb) + schedule_timeout_interruptible(HZ / 10); + page = __drbd_alloc_pages(number); if (page) atomic_add(number, &device->pp_in_use); @@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int * Is also used from inside an other spin_lock_irq(&resource->req_lock); * Either links the page chain back to the global pool, * or returns all pages to the system. */ -static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +static void drbd_free_pages(struct drbd_device *device, struct page *page) { - atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; - int i; + struct page *tmp; + int i = 0; if (page == NULL) return; - if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count) - i = page_chain_free(page); - else { - struct page *tmp; - tmp = page_chain_tail(page, &i); - spin_lock(&drbd_pp_lock); - page_chain_add(&drbd_pp_pool, page, tmp); - drbd_pp_vacant += i; - spin_unlock(&drbd_pp_lock); + page_chain_for_each_safe(page, tmp) { + set_page_private(page, 0); + if (page_count(page) == 1) + mempool_free(page, &drbd_buffer_page_pool); + else + put_page(page); + i++; } - i = atomic_sub_return(i, a); + i = atomic_sub_return(i, &device->pp_in_use); if (i < 0) - drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", - is_net ? "pp_in_use_by_net" : "pp_in_use", i); - wake_up(&drbd_pp_wait); + drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i); } /* @@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; + if (!mempool_is_saturated(&drbd_buffer_page_pool)) + peer_req->flags |= EE_RELEASE_TO_MEMPOOL; } memset(peer_req, 0, sizeof(*peer_req)); @@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } -void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, - int is_net) +void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req) { might_sleep(); if (peer_req->flags & EE_HAS_DIGEST) kfree(peer_req->digest); - drbd_free_pages(device, peer_req->pages, is_net); + drbd_free_pages(device, peer_req->pages); D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); D_ASSERT(device, drbd_interval_empty(&peer_req->i)); if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { @@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) LIST_HEAD(work_list); struct drbd_peer_request *peer_req, *t; int count = 0; - int is_net = list == &device->net_ee; spin_lock_irq(&device->resource->req_lock); list_splice_init(list, &work_list); spin_unlock_irq(&device->resource->req_lock); list_for_each_entry_safe(peer_req, t, &work_list, w.list) { - __drbd_free_peer_req(device, peer_req, is_net); + drbd_free_peer_req(device, peer_req); count++; } return count; @@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) static int drbd_finish_peer_reqs(struct drbd_device *device) { LIST_HEAD(work_list); - LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; int err = 0; spin_lock_irq(&device->resource->req_lock); - reclaim_finished_net_peer_reqs(device, &reclaimed); list_splice_init(&device->done_ee, &work_list); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) - drbd_free_net_peer_req(device, peer_req); - /* possible callbacks here: * e_end_block, and e_end_resync_block, e_send_superseded. * all ignore the last argument. @@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) data_size -= len; } kunmap(page); - drbd_free_pages(peer_device->device, page, 0); + drbd_free_pages(peer_device->device, page); return err; } @@ -5224,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) put_ldev(device); } - /* tcp_close and release of sendpage pages can be deferred. I don't - * want to use SO_LINGER, because apparently it can be deferred for - * more than 20 seconds (longest time I checked). - * - * Actually we don't care for exactly when the network stack does its - * put_page(), but release our reference on these pages right here. - */ - i = drbd_free_peer_reqs(device, &device->net_ee); - if (i) - drbd_info(device, "net_ee not empty, killed %u entries\n", i); i = atomic_read(&device->pp_in_use_by_net); if (i) drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); @@ -5980,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi) while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - conn_reclaim_net_peer_reqs(connection); - if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a6ea737b3b71..dea3e79d044f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1030,22 +1030,6 @@ out: return 1; } -/* helper */ -static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req) -{ - if (drbd_peer_req_has_active_page(peer_req)) { - /* This might happen if sendpage() has not finished */ - int i = PFN_UP(peer_req->i.size); - atomic_add(i, &device->pp_in_use_by_net); - atomic_sub(i, &device->pp_in_use); - spin_lock_irq(&device->resource->req_lock); - list_add_tail(&peer_req->w.list, &device->net_ee); - spin_unlock_irq(&device->resource->req_lock); - wake_up(&drbd_pp_wait); - } else - drbd_free_peer_req(device, peer_req); -} - /** * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST * @w: work object. @@ -1059,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { @@ -1074,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req); } - dec_unacked(device); - - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } @@ -1120,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (get_ldev_if_state(device, D_FAILED)) { @@ -1155,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) /* update resync data with failure */ drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size); } - - dec_unacked(device); - - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } @@ -1176,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) int err, eq = 0; if (unlikely(cancel)) { - drbd_free_peer_req(device, peer_req); - dec_unacked(device); - return 0; + err = 0; + goto out; } if (get_ldev(device)) { @@ -1220,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) if (drbd_ratelimit()) drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); } - - dec_unacked(device); - move_to_net_ee_or_free(device, peer_req); - if (unlikely(err)) drbd_err(device, "drbd_send_block/ack() failed\n"); +out: + dec_unacked(device); + drbd_free_peer_req(device, peer_req); + return err; } From 212c928d01e9ea1d1c46a114650b551da8ca823e Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Fri, 8 Aug 2025 15:44:43 -0600 Subject: [PATCH 044/246] ublk: don't quiesce in ublk_ch_release ublk_ch_release currently quiesces the device's request_queue while setting force_abort/fail_io. This avoids data races by preventing concurrent reads from the I/O path, but is not strictly needed - at this point, canceling is already set and guaranteed to be observed by any concurrently executing I/Os, so they will be handled properly even if the changes to force_abort/fail_io propagate to the I/O path later. Remove the quiesce/unquiesce calls from ublk_ch_release. This makes the writes to force_abort/fail_io concurrent with the reads in the I/O path, so make the accesses atomic. Before this change, the call to blk_mq_quiesce_queue was responsible for most (90%) of the runtime of ublk_ch_release. With that call eliminated, ublk_ch_release runs much faster. Here is a comparison of the total time spent in calls to ublk_ch_release when a server handling 128 devices exits, before and after this change: before: 1.11s after: 0.09s Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250808-ublk_quiesce2-v1-1-f87ade33fa3d@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6561d2a561fa..6b95cf48ae77 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1389,7 +1389,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, { blk_status_t res; - if (unlikely(ubq->fail_io)) + if (unlikely(READ_ONCE(ubq->fail_io))) return BLK_STS_TARGET; /* With recovery feature enabled, force_abort is set in @@ -1401,7 +1401,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, * Note: force_abort is guaranteed to be seen because it is set * before request queue is unqiuesced. */ - if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort)) + if (ublk_nosrv_should_queue_io(ubq) && + unlikely(READ_ONCE(ubq->force_abort))) return BLK_STS_IOERR; if (check_cancel && unlikely(ubq->canceling)) @@ -1644,7 +1645,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * Transition the device to the nosrv state. What exactly this * means depends on the recovery flags */ - blk_mq_quiesce_queue(disk->queue); if (ublk_nosrv_should_stop_dev(ub)) { /* * Allow any pending/future I/O to pass through quickly @@ -1652,8 +1652,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) * waits for all pending I/O to complete */ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->force_abort = true; - blk_mq_unquiesce_queue(disk->queue); + WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true); ublk_stop_dev_unlocked(ub); } else { @@ -1663,9 +1662,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp) } else { ub->dev_info.state = UBLK_S_DEV_FAIL_IO; for (i = 0; i < ub->dev_info.nr_hw_queues; i++) - ublk_get_queue(ub, i)->fail_io = true; + WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true); } - blk_mq_unquiesce_queue(disk->queue); } unlock: mutex_unlock(&ub->mutex); From 5058a62875e1916e5133a1639f0207ea2148c0bc Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 8 Aug 2025 09:52:15 -0600 Subject: [PATCH 045/246] ublk: check for unprivileged daemon on each I/O fetch Commit ab03a61c6614 ("ublk: have a per-io daemon instead of a per-queue daemon") allowed each ublk I/O to have an independent daemon task. However, nr_privileged_daemon is only computed based on whether the last I/O fetched in each ublk queue has an unprivileged daemon task. Fix this by checking whether every fetched I/O's daemon is privileged. Change nr_privileged_daemon from a count of queues to a boolean indicating whether any I/Os have an unprivileged daemon. Signed-off-by: Caleb Sander Mateos Fixes: ab03a61c6614 ("ublk: have a per-io daemon instead of a per-queue daemon") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250808155216.296170-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 6b95cf48ae77..99abd67b708b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -235,7 +235,7 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - unsigned int nr_privileged_daemon; + bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; pid_t ublksrv_tgid; @@ -1551,7 +1551,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; - ub->nr_privileged_daemon = 0; + ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -1978,12 +1978,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) __must_hold(&ub->mutex) { ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) { + if (ublk_queue_ready(ubq)) ub->nr_queues_ready++; - - if (capable(CAP_SYS_ADMIN)) - ub->nr_privileged_daemon++; - } + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) + ub->unprivileged_daemons = true; if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { /* now we are ready for handling ublk io request */ @@ -2878,8 +2876,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, ublk_apply_params(ub); - /* don't probe partitions if any one ubq daemon is un-trusted */ - if (ub->nr_privileged_daemon != ub->nr_queues_ready) + /* don't probe partitions if any daemon task is un-trusted */ + if (ub->unprivileged_daemons) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); ublk_get_device(ub); From 8f3e4e87b0945aeea8b5a5aa43c419f4a1b4ca6a Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Mon, 11 Aug 2025 16:11:35 +0800 Subject: [PATCH 046/246] block, bfq: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. Reviewed-by: Yu Kuai Signed-off-by: Qianfeng Rong Link: https://lore.kernel.org/r/20250811081135.374315-1-rongqianfeng@vivo.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3bf76902f07f..50e51047e1fe 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5847,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, goto out; } - bfqq = kmem_cache_alloc_node(bfq_pool, - GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, bfqd->queue->node); if (bfqq) { From 196447c712dd486f4315356c572a1d13dd743f08 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sat, 9 Aug 2025 22:13:58 +0800 Subject: [PATCH 047/246] blk-cgroup: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. Signed-off-by: Qianfeng Rong Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250809141358.168781-1-rongqianfeng@vivo.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5936db7f8475..fe9ebd6a2e14 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -394,7 +394,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; @@ -1467,7 +1467,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -1630,7 +1630,7 @@ retry: pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); } if (!pd) { From 343dc5423bfe876c12bb80c56f5e44286e442a07 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Fri, 8 Aug 2025 13:36:09 +0800 Subject: [PATCH 048/246] block: fix kobject double initialization in add_disk Device-mapper can call add_disk() multiple times for the same gendisk due to its two-phase creation process (dm create + dm load). This leads to kobject double initialization errors when the underlying iSCSI devices become temporarily unavailable and then reappear. However, if the first add_disk() call fails and is retried, the queue_kobj gets initialized twice, causing: kobject: kobject (ffff88810c27bb90): tried to init an initialized object, something is seriously wrong. Call Trace: dump_stack_lvl+0x5b/0x80 kobject_init.cold+0x43/0x51 blk_register_queue+0x46/0x280 add_disk_fwnode+0xb5/0x280 dm_setup_md_queue+0x194/0x1c0 table_load+0x297/0x2d0 ctl_ioctl+0x2a2/0x480 dm_ctl_ioctl+0xe/0x20 __x64_sys_ioctl+0xc7/0x110 do_syscall_64+0x72/0x390 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix this by separating kobject initialization from sysfs registration: - Initialize queue_kobj early during gendisk allocation - add_disk() only adds the already-initialized kobject to sysfs - del_gendisk() removes from sysfs but doesn't destroy the kobject - Final cleanup happens when the disk is released Fixes: 2bd85221a625 ("block: untangle request_queue refcounting from sysfs") Reported-by: Li Lingfeng Closes: https://lore.kernel.org/all/83591d0b-2467-433c-bce0-5581298eb161@huawei.com/ Signed-off-by: Zheng Qixing Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250808053609.3237836-1-zhengqixing@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 12 +++++------- block/blk.h | 1 + block/genhd.c | 2 ++ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 396cded255ea..c5cf79a20842 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -847,7 +847,7 @@ static void blk_queue_release(struct kobject *kobj) /* nothing to do here, all data is associated with the parent gendisk */ } -static const struct kobj_type blk_queue_ktype = { +const struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, .release = blk_queue_release, @@ -875,15 +875,14 @@ int blk_register_queue(struct gendisk *disk) struct request_queue *q = disk->queue; int ret; - kobject_init(&disk->queue_kobj, &blk_queue_ktype); ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto out_put_queue_kobj; + return ret; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) - goto out_put_queue_kobj; + goto out_del_queue_kobj; } mutex_lock(&q->sysfs_lock); @@ -934,8 +933,8 @@ out_debugfs_remove: mutex_unlock(&q->sysfs_lock); if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); -out_put_queue_kobj: - kobject_put(&disk->queue_kobj); +out_del_queue_kobj: + kobject_del(&disk->queue_kobj); return ret; } @@ -986,5 +985,4 @@ void blk_unregister_queue(struct gendisk *disk) elevator_set_none(q); blk_debugfs_remove(disk); - kobject_put(&disk->queue_kobj); } diff --git a/block/blk.h b/block/blk.h index 0a2eccf28ca4..46f566f9b126 100644 --- a/block/blk.h +++ b/block/blk.h @@ -29,6 +29,7 @@ struct elevator_tags; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) +extern const struct kobj_type blk_queue_ktype; extern struct dentry *blk_debugfs_root; struct blk_flush_queue { diff --git a/block/genhd.c b/block/genhd.c index c26733f6324b..9bbc38d12792 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1303,6 +1303,7 @@ static void disk_release(struct device *dev) disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); + kobject_put(&disk->queue_kobj); disk->queue->disk = NULL; blk_put_queue(disk->queue); @@ -1486,6 +1487,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, INIT_LIST_HEAD(&disk->slave_bdevs); #endif mutex_init(&disk->rqos_state_mutex); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); return disk; out_erase_part0: From 593d9e4c3d634c370f226f55453c376bf43b3684 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 11 Aug 2025 13:24:26 +0800 Subject: [PATCH 049/246] fs: fix incorrect lflags value in the move_mount syscall The lflags value used to look up from_path was overwritten by the one used to look up to_path. In other words, from_path was looked up with the wrong lflags value. Fix it. Fixes: f9fde814de37 ("fs: support getname_maybe_null() in move_mount()") Signed-off-by: Yuntao Wang Link: https://lore.kernel.org/20250811052426.129188-1-yuntao.wang@linux.dev [Christian Brauner : massage patch] Signed-off-by: Christian Brauner --- fs/namespace.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ceb6b57e6a57..43f32ee9f95c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4551,20 +4551,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; - lflags = 0; - if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; - if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; uflags = 0; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; - from_name = getname_maybe_null(from_pathname, uflags); - if (IS_ERR(from_name)) - return PTR_ERR(from_name); + if (flags & MOVE_MOUNT_T_EMPTY_PATH) + uflags = AT_EMPTY_PATH; - lflags = 0; - if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; - if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - uflags = 0; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; to_name = getname_maybe_null(to_pathname, uflags); if (IS_ERR(to_name)) return PTR_ERR(to_name); @@ -4577,11 +4567,24 @@ SYSCALL_DEFINE5(move_mount, to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) + lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) + lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) + uflags = AT_EMPTY_PATH; + + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); + if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) @@ -4590,6 +4593,11 @@ SYSCALL_DEFINE5(move_mount, return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) + lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) + lflags |= LOOKUP_AUTOMOUNT; ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; From 6d3c3ca4c77e93660cce5819bf707f75df03e0c8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 8 Aug 2025 15:28:47 +0200 Subject: [PATCH 050/246] module: Rename EXPORT_SYMBOL_GPL_FOR_MODULES to EXPORT_SYMBOL_FOR_MODULES Christoph suggested that the explicit _GPL_ can be dropped from the module namespace export macro, as it's intended for in-tree modules only. It would be possible to restrict it technically, but it was pointed out [2] that some cases of using an out-of-tree build of an in-tree module with the same name are legitimate. But in that case those also have to be GPL anyway so it's unnecessary to spell it out in the macro name. Link: https://lore.kernel.org/all/aFleJN_fE-RbSoFD@infradead.org/ [1] Link: https://lore.kernel.org/all/CAK7LNATRkZHwJGpojCnvdiaoDnP%2BaeUXgdey5sb_8muzdWTMkA@mail.gmail.com/ [2] Suggested-by: Christoph Hellwig Reviewed-by: Shivank Garg Acked-by: David Hildenbrand Acked-by: Nicolas Schier Reviewed-by: Daniel Gomez Reviewed-by: Christian Brauner Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/20250808-export_modules-v4-1-426945bcc5e1@suse.cz Signed-off-by: Christian Brauner --- Documentation/core-api/symbol-namespaces.rst | 11 ++++++----- drivers/tty/serial/8250/8250_rsa.c | 8 ++++---- fs/anon_inodes.c | 2 +- include/linux/export.h | 2 +- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/Documentation/core-api/symbol-namespaces.rst b/Documentation/core-api/symbol-namespaces.rst index 32fc73dc5529..034898e81ba2 100644 --- a/Documentation/core-api/symbol-namespaces.rst +++ b/Documentation/core-api/symbol-namespaces.rst @@ -76,20 +76,21 @@ unit as preprocessor statement. The above example would then read:: within the corresponding compilation unit before the #include for . Typically it's placed before the first #include statement. -Using the EXPORT_SYMBOL_GPL_FOR_MODULES() macro ------------------------------------------------ +Using the EXPORT_SYMBOL_FOR_MODULES() macro +------------------------------------------- Symbols exported using this macro are put into a module namespace. This -namespace cannot be imported. +namespace cannot be imported. These exports are GPL-only as they are only +intended for in-tree modules. The macro takes a comma separated list of module names, allowing only those modules to access this symbol. Simple tail-globs are supported. For example:: - EXPORT_SYMBOL_GPL_FOR_MODULES(preempt_notifier_inc, "kvm,kvm-*") + EXPORT_SYMBOL_FOR_MODULES(preempt_notifier_inc, "kvm,kvm-*") -will limit usage of this symbol to modules whoes name matches the given +will limit usage of this symbol to modules whose name matches the given patterns. How to use Symbols exported in Namespaces diff --git a/drivers/tty/serial/8250/8250_rsa.c b/drivers/tty/serial/8250/8250_rsa.c index d34093cc03ad..12a65b79583c 100644 --- a/drivers/tty/serial/8250/8250_rsa.c +++ b/drivers/tty/serial/8250/8250_rsa.c @@ -147,7 +147,7 @@ void rsa_enable(struct uart_8250_port *up) if (up->port.uartclk == SERIAL_RSA_BAUD_BASE * 16) serial_out(up, UART_RSA_FRR, 0); } -EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_enable, "8250_base"); +EXPORT_SYMBOL_FOR_MODULES(rsa_enable, "8250_base"); /* * Attempts to turn off the RSA FIFO and resets the RSA board back to 115kbps compat mode. It is @@ -179,7 +179,7 @@ void rsa_disable(struct uart_8250_port *up) up->port.uartclk = SERIAL_RSA_BAUD_BASE_LO * 16; uart_port_unlock_irq(&up->port); } -EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_disable, "8250_base"); +EXPORT_SYMBOL_FOR_MODULES(rsa_disable, "8250_base"); void rsa_autoconfig(struct uart_8250_port *up) { @@ -192,7 +192,7 @@ void rsa_autoconfig(struct uart_8250_port *up) if (__rsa_enable(up)) up->port.type = PORT_RSA; } -EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_autoconfig, "8250_base"); +EXPORT_SYMBOL_FOR_MODULES(rsa_autoconfig, "8250_base"); void rsa_reset(struct uart_8250_port *up) { @@ -201,7 +201,7 @@ void rsa_reset(struct uart_8250_port *up) serial_out(up, UART_RSA_FRR, 0); } -EXPORT_SYMBOL_GPL_FOR_MODULES(rsa_reset, "8250_base"); +EXPORT_SYMBOL_FOR_MODULES(rsa_reset, "8250_base"); #ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS #ifndef MODULE diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 1d847a939f29..180a458fc4f7 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -129,7 +129,7 @@ struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *n } return inode; } -EXPORT_SYMBOL_GPL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); +EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm"); static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, diff --git a/include/linux/export.h b/include/linux/export.h index f35d03b4113b..a686fd0ba406 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -91,6 +91,6 @@ #define EXPORT_SYMBOL_NS(sym, ns) __EXPORT_SYMBOL(sym, "", ns) #define EXPORT_SYMBOL_NS_GPL(sym, ns) __EXPORT_SYMBOL(sym, "GPL", ns) -#define EXPORT_SYMBOL_GPL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods) +#define EXPORT_SYMBOL_FOR_MODULES(sym, mods) __EXPORT_SYMBOL(sym, "GPL", "module:" mods) #endif /* _LINUX_EXPORT_H */ From b26e2afb3834d4a61ce54c8484ff6014bef0b4b7 Mon Sep 17 00:00:00 2001 From: Vasiliy Kovalev Date: Mon, 11 Aug 2025 16:27:16 +0300 Subject: [PATCH 051/246] ALSA: hda/realtek: Fix headset mic on HONOR BRB-X Add a PCI quirk to enable microphone input on the headphone jack on the HONOR BRB-X M1010 laptop. Signed-off-by: Vasiliy Kovalev Cc: Link: https://patch.msgid.link/20250811132716.45076-1-kovalev@altlinux.org Signed-off-by: Takashi Iwai --- sound/hda/codecs/realtek/alc269.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/hda/codecs/realtek/alc269.c b/sound/hda/codecs/realtek/alc269.c index 337e33a59de8..e90c4047ea62 100644 --- a/sound/hda/codecs/realtek/alc269.c +++ b/sound/hda/codecs/realtek/alc269.c @@ -7140,6 +7140,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1d72, 0x1901, "RedmiBook 14", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1945, "Redmi G", ALC256_FIXUP_ASUS_HEADSET_MIC), SND_PCI_QUIRK(0x1d72, 0x1947, "RedmiBook Air", ALC255_FIXUP_XIAOMI_HEADSET_MIC), + SND_PCI_QUIRK(0x1ee7, 0x2078, "HONOR BRB-X M1010", ALC2XX_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1f66, 0x0105, "Ayaneo Portable Game Player", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x2014, 0x800a, "Positivo ARN50", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x2782, 0x0214, "VAIO VJFE-CL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), From 4fa7d880aeb8cdbdaa4fb72be3e53ac1d6bcc088 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 21 Jul 2025 11:03:10 -0500 Subject: [PATCH 052/246] x86/bugs: Select best SRSO mitigation The SRSO bug can theoretically be used to conduct user->user or guest->guest attacks and requires a mitigation (namely IBPB instead of SBPB on context switch) for these. So mark SRSO as being applicable to the user->user and guest->guest attack vectors. Additionally, SRSO supports multiple mitigations which mitigate different potential attack vectors. Some CPUs are also immune to SRSO from certain attack vectors (like user->kernel). Use the specific attack vectors requiring mitigation to select the best SRSO mitigation to avoid unnecessary performance hits. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250721160310.1804203-1-david.kaplan@amd.com --- .../admin-guide/hw-vuln/attack_vector_controls.rst | 2 +- arch/x86/kernel/cpu/bugs.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst index b4de16f5ec44..6dd0800146f6 100644 --- a/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst +++ b/Documentation/admin-guide/hw-vuln/attack_vector_controls.rst @@ -214,7 +214,7 @@ Spectre_v1 X Spectre_v2 X X Spectre_v2_user X X * (Note 1) SRBDS X X X X -SRSO X X +SRSO X X X X SSB (Note 4) TAA X X X X * (Note 2) TSA X X X X diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b74bf937cd9f..2186a771b9fc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -386,7 +386,6 @@ static bool __init should_mitigate_vuln(unsigned int bug) case X86_BUG_SPECTRE_V2: case X86_BUG_RETBLEED: - case X86_BUG_SRSO: case X86_BUG_L1TF: case X86_BUG_ITS: return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || @@ -3184,8 +3183,18 @@ static void __init srso_select_mitigation(void) } if (srso_mitigation == SRSO_MITIGATION_AUTO) { - if (should_mitigate_vuln(X86_BUG_SRSO)) { + /* + * Use safe-RET if user->kernel or guest->host protection is + * required. Otherwise the 'microcode' mitigation is sufficient + * to protect the user->user and guest->guest vectors. + */ + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) && + !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) { srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; } else { srso_mitigation = SRSO_MITIGATION_NONE; return; From dfb36e4a8db0cd56f92d4cb445f54e85a9b40897 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 11 Aug 2025 10:11:47 -0400 Subject: [PATCH 053/246] futex: Use user_write_access_begin/_end() in futex_put_value() Commit cec199c5e39b ("futex: Implement FUTEX2_NUMA") introduced the futex_put_value() helper to write a value to the given user address. However, it uses user_read_access_begin() before the write. For architectures that differentiate between read and write accesses, like PowerPC, futex_put_value() fails with -EFAULT. Fix that by using the user_write_access_begin/user_write_access_end() pair instead. Fixes: cec199c5e39b ("futex: Implement FUTEX2_NUMA") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250811141147.322261-1-longman@redhat.com --- kernel/futex/futex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c74eac572acd..2cd57096c38e 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to) { if (can_do_masked_user_access()) to = masked_user_access_begin(to); - else if (!user_read_access_begin(to, sizeof(*to))) + else if (!user_write_access_begin(to, sizeof(*to))) return -EFAULT; unsafe_put_user(val, to, Efault); - user_read_access_end(); + user_write_access_end(); return 0; Efault: - user_read_access_end(); + user_write_access_end(); return -EFAULT; } From d8b96a79622e03813c221450498ca9742704ebf2 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Mon, 28 Jul 2025 01:39:57 +0800 Subject: [PATCH 054/246] blk-wbt: Optimize wbt_done() for non-throttled writes In the current implementation, the sync_cookie and last_cookie members of struct rq_wb are used only by read requests and not by non-throttled write requests. Based on this, we can optimize wbt_done() by removing one if condition check for non-throttled write requests. Signed-off-by: Tang Yizhou Reviewed-by: Jan Kara Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250727173959.160835-2-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index a50d4cd55f41..30886d44f6cd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq) struct rq_wb *rwb = RQWB(rqos); if (!wbt_is_tracked(rq)) { - if (rwb->sync_cookie == rq) { - rwb->sync_issue = 0; - rwb->sync_cookie = NULL; - } + if (wbt_is_read(rq)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } - if (wbt_is_read(rq)) wb_timestamp(rwb, &rwb->last_comp); + } } else { WARN_ON_ONCE(rq == rwb->sync_cookie); __wbt_done(rqos, wbt_flags(rq)); From bccdfcd56d4b5b78d0d76f46d0e89a51330dfd75 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Mon, 28 Jul 2025 01:39:58 +0800 Subject: [PATCH 055/246] blk-wbt: Eliminate ambiguity in the comments of struct rq_wb In the current implementation, the last_issue and last_comp members of struct rq_wb are used only by read requests and not by non-throttled write requests. Therefore, eliminate the ambiguity here. Signed-off-by: Tang Yizhou Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20250727173959.160835-3-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 30886d44f6cd..eb8037bae0bd 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -85,8 +85,8 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ + unsigned long last_issue; /* issue time of last read rq */ + unsigned long last_comp; /* completion time of last read rq */ unsigned long min_lat_nsec; struct rq_qos rqos; struct rq_wait rq_wait[WBT_NUM_RWQ]; From 0452f08395f8e7d04fe3744443dad396b3330d0c Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Mon, 28 Jul 2025 01:39:59 +0800 Subject: [PATCH 056/246] blk-wbt: doc: Update the doc of the wbt_lat_usec interface The symbol wb_window_usec cannot be found. Update the doc to reflect the latest implementation, in other words, the debugfs interface 'curr_win_nsec'. Signed-off-by: Tang Yizhou Reviewed-by: Jan Kara Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250727173959.160835-4-yizhou.tang@shopee.com Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 803f578dc023..0ddffc9133d0 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -731,7 +731,7 @@ Contact: linux-block@vger.kernel.org Description: [RW] If the device is registered for writeback throttling, then this file shows the target minimum read latency. If this latency - is exceeded in a given window of time (see wb_window_usec), then + is exceeded in a given window of time (see curr_win_nsec), then the writeback throttling will start scaling back writes. Writing a value of '0' to this file disables the feature. Writing a value of '-1' to this file resets the value to the default From e91a158b694d7f4bd937763dde79ed0afa472d8a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 8 Aug 2025 15:37:14 -0400 Subject: [PATCH 057/246] intel_idle: Allow loading ACPI tables for any family There is no reason to limit intel_idle's loading of ACPI tables to family 6. Upcoming Intel processors are not in family 6. Below "Fixes" really means "applies cleanly until". That syntax commit didn't change the previous logic, but shows this patch applies back 5-years. Fixes: 4a9f45a0533f ("intel_idle: Convert to new X86 CPU match macros") Signed-off-by: Len Brown Link: https://patch.msgid.link/06101aa4fe784e5b0be1cb2c0bdd9afcf16bd9d4.1754681697.git.len.brown@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 73747d20df85..91a7b7e7c0c8 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1679,7 +1679,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { }; static const struct x86_cpu_id intel_mwait_ids[] __initconst = { - X86_MATCH_VENDOR_FAM_FEATURE(INTEL, 6, X86_FEATURE_MWAIT, NULL), + X86_MATCH_VENDOR_FAM_FEATURE(INTEL, X86_FAMILY_ANY, X86_FEATURE_MWAIT, NULL), {} }; From fa3fa55de0d6177fdcaf6fc254f13cc8f33c3eed Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 Aug 2025 17:03:11 +0200 Subject: [PATCH 058/246] cpuidle: governors: menu: Avoid using invalid recent intervals data Marc has reported that commit 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") caused the number of wakeup interrupts to increase on an idle system [1], which was not expected to happen after merely allowing shallower idle states to be selected by the governor in some cases. However, on the system in question, all of the idle states deeper than WFI are rejected by the driver due to a firmware issue [2]. This causes the governor to only consider the recent interval duriation data corresponding to attempts to enter WFI that are successful and the recent invervals table is filled with values lower than the scheduler tick period. Consequently, the governor predicts an idle duration below the scheduler tick period length and avoids stopping the tick more often which leads to the observed symptom. Address it by modifying the governor to update the recent intervals table also when entering the previously selected idle state fails, so it knows that the short idle intervals might have been the minority had the selected idle states been actually entered every time. Fixes: 85975daeaa4d ("cpuidle: menu: Avoid discarding useful information") Link: https://lore.kernel.org/linux-pm/86o6sv6n94.wl-maz@kernel.org/ [1] Link: https://lore.kernel.org/linux-pm/7ffcb716-9a1b-48c2-aaa4-469d0df7c792@arm.com/ [2] Signed-off-by: Rafael J. Wysocki Tested-by: Christian Loehle Tested-by: Marc Zyngier Reviewed-by: Christian Loehle Link: https://patch.msgid.link/2793874.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpuidle/governors/menu.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 52d5d26fc7c6..81306612a5c6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -97,6 +97,14 @@ static inline int which_bucket(u64 duration_ns) static DEFINE_PER_CPU(struct menu_device, menu_devices); +static void menu_update_intervals(struct menu_device *data, unsigned int interval_us) +{ + /* Update the repeating-pattern data. */ + data->intervals[data->interval_ptr++] = interval_us; + if (data->interval_ptr >= INTERVALS) + data->interval_ptr = 0; +} + static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); /* @@ -222,6 +230,14 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (data->needs_update) { menu_update(drv, dev); data->needs_update = 0; + } else if (!dev->last_residency_ns) { + /* + * This happens when the driver rejects the previously selected + * idle state and returns an error, so update the recent + * intervals table to prevent invalid information from being + * used going forward. + */ + menu_update_intervals(data, UINT_MAX); } /* Find the shortest expected idle interval. */ @@ -482,10 +498,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket] = new_factor; - /* update the repeating-pattern data */ - data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns); - if (data->interval_ptr >= INTERVALS) - data->interval_ptr = 0; + menu_update_intervals(data, ktime_to_us(measured_ns)); } /** From 3ead77989c20cb2d774a3b6045d7a928b6fb53ed Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 8 Aug 2025 07:51:22 -0700 Subject: [PATCH 059/246] cpufreq: intel_pstate: Support Clearwater Forest OOB mode Prevent intel_pstate from loading when OOB (Out Of Band) P-states mode is enabled. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20250808145122.4057208-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 06a1c7dd081f..f366d35c5840 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2793,6 +2793,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs), X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs), X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs), + X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs), {} }; #endif From 31cd31c9e17ece125aad27259501a2af69ccb020 Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Mon, 11 Aug 2025 11:50:44 -0700 Subject: [PATCH 060/246] x86/fpu: Fix NULL dereference in avx512_status() Problem ------- With CONFIG_X86_DEBUG_FPU enabled, reading /proc/[kthread]/arch_status causes a warning and a NULL pointer dereference. This is because the AVX-512 timestamp code uses x86_task_fpu() but doesn't check it for NULL. CONFIG_X86_DEBUG_FPU addles that function for kernel threads (PF_KTHREAD specifically), making it return NULL. The point of the warning was to ensure that kernel threads only access task->fpu after going through kernel_fpu_begin()/_end(). Note: all kernel tasks exposed in /proc have a valid task->fpu. Solution -------- One option is to silence the warning and check for NULL from x86_task_fpu(). However, that warning is fairly fresh and seems like a defense against misuse of the FPU state in kernel threads. Instead, stop outputting AVX-512_elapsed_ms for kernel threads altogether. The data was garbage anyway because avx512_timestamp is only updated for user threads, not kernel threads. If anyone ever wants to track kernel thread AVX-512 use, they can come back later and do it properly, separate from this bug fix. [ dhansen: mostly rewrite changelog ] Fixes: 22aafe3bcb67 ("x86/fpu: Remove init_task FPU state dependencies, add debugging warning for PF_KTHREAD tasks") Co-developed-by: Sohil Mehta Signed-off-by: Sohil Mehta Signed-off-by: Fushuai Wang Signed-off-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250811185044.2227268-1-sohil.mehta%40intel.com --- arch/x86/kernel/fpu/xstate.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 12ed75c1b567..28e4fd65c9da 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1881,19 +1881,20 @@ long fpu_xstate_prctl(int option, unsigned long arg2) #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 - * use in the task. + * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); - long delta; + unsigned long timestamp; + long delta = -1; - if (!timestamp) { - /* - * Report -1 if no AVX512 usage - */ - delta = -1; - } else { + /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */ + if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) + return; + + timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); + + if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX From b63335fb3d32579c5ff0b7038b9cc23688fff528 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 11 Aug 2025 08:34:04 +0100 Subject: [PATCH 061/246] cifs: Fix collect_sample() to handle any iterator type collect_sample() is used to gather samples of the data in a Write op for analysis to try and determine if the compression algorithm is likely to achieve anything more quickly than actually running the compression algorithm. However, collect_sample() assumes that the data it is going to be sampling is stored in an ITER_XARRAY-type iterator (which it now should never be) and doesn't actually check that it is before accessing the underlying xarray directly. Fix this by replacing the code with a loop that just uses the standard iterator functions to sample every other 2KiB block, skipping the intervening ones. It's not quite the same as the previous algorithm as it doesn't necessarily align to the pages within an ordinary write from the pagecache. Note that the btrfs code from which this was derived samples the inode's pagecache directly rather than the iterator - but that doesn't necessarily work for network filesystems if O_DIRECT is in operation. Fixes: 94ae8c3fee94 ("smb: client: compress: LZ77 code improvements cleanup") Signed-off-by: David Howells Acked-by: Paulo Alcantara (Red Hat) cc: Enzo Matsumiya cc: Shyam Prasad N cc: Tom Talpey cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/compress.c | 61 +++++++++++----------------------------- 1 file changed, 16 insertions(+), 45 deletions(-) diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c index 766b4de13da7..db709f5cd2e1 100644 --- a/fs/smb/client/compress.c +++ b/fs/smb/client/compress.c @@ -155,58 +155,29 @@ static int cmp_bkt(const void *_a, const void *_b) } /* - * TODO: - * Support other iter types, if required. - * Only ITER_XARRAY is supported for now. + * Collect some 2K samples with 2K gaps between. */ -static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) +static int collect_sample(const struct iov_iter *source, ssize_t max, u8 *sample) { - struct folio *folios[16], *folio; - unsigned int nr, i, j, npages; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t last, index = start / PAGE_SIZE; - size_t len, off, foff; - void *p; - int s = 0; + struct iov_iter iter = *source; + size_t s = 0; - last = (start + max - 1) / PAGE_SIZE; - do { - nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios), - XA_PRESENT); - if (nr == 0) - return -EIO; + while (iov_iter_count(&iter) >= SZ_2K) { + size_t part = umin(umin(iov_iter_count(&iter), SZ_2K), max); + size_t n; - for (i = 0; i < nr; i++) { - folio = folios[i]; - npages = folio_nr_pages(folio); - foff = start - folio_pos(folio); - off = foff % PAGE_SIZE; + n = copy_from_iter(sample + s, part, &iter); + if (n != part) + return -EFAULT; - for (j = foff / PAGE_SIZE; j < npages; j++) { - size_t len2; + s += n; + max -= n; - len = min_t(size_t, max, PAGE_SIZE - off); - len2 = min_t(size_t, len, SZ_2K); + if (iov_iter_count(&iter) < PAGE_SIZE - SZ_2K) + break; - p = kmap_local_page(folio_page(folio, j)); - memcpy(&sample[s], p, len2); - kunmap_local(p); - - s += len2; - - if (len2 < SZ_2K || s >= max - SZ_2K) - return s; - - max -= len; - if (max <= 0) - return s; - - start += len; - off = 0; - index++; - } - } - } while (nr == ARRAY_SIZE(folios)); + iov_iter_advance(&iter, SZ_2K); + } return s; } From d7f1affc556e07208453224a025bd67583671ae2 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 8 Aug 2025 17:52:21 +0300 Subject: [PATCH 062/246] cifs: avoid extra calls to strlen() in cifs_get_spnego_key() Since 'snprintf()' returns the number of characters emitted, an output position may be advanced with this return value rather than using an explicit calls to 'strlen()'. Compile tested only. Signed-off-by: Dmitry Antipov Signed-off-by: Steve French --- fs/smb/client/cifs_spnego.c | 47 ++++++++++++++----------------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index bc1c1e9b288a..43b86fa4d695 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -124,55 +124,44 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo, dp = description; /* start with version and hostname portion of UNC string */ spnego_key = ERR_PTR(-EINVAL); - sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, - hostname); - dp = description + strlen(description); + dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION, + hostname); /* add the server address */ if (server->dstaddr.ss_family == AF_INET) - sprintf(dp, "ip4=%pI4", &sa->sin_addr); + dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr); else if (server->dstaddr.ss_family == AF_INET6) - sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); + dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr); else goto out; - dp = description + strlen(description); - /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */ if (server->sec_kerberos) - sprintf(dp, ";sec=krb5"); + dp += sprintf(dp, ";sec=krb5"); else if (server->sec_mskerberos) - sprintf(dp, ";sec=mskrb5"); + dp += sprintf(dp, ";sec=mskrb5"); else if (server->sec_iakerb) - sprintf(dp, ";sec=iakerb"); + dp += sprintf(dp, ";sec=iakerb"); else { cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n"); - sprintf(dp, ";sec=krb5"); + dp += sprintf(dp, ";sec=krb5"); } - dp = description + strlen(description); - sprintf(dp, ";uid=0x%x", - from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); + dp += sprintf(dp, ";uid=0x%x", + from_kuid_munged(&init_user_ns, sesInfo->linux_uid)); - dp = description + strlen(description); - sprintf(dp, ";creduid=0x%x", + dp += sprintf(dp, ";creduid=0x%x", from_kuid_munged(&init_user_ns, sesInfo->cred_uid)); - if (sesInfo->user_name) { - dp = description + strlen(description); - sprintf(dp, ";user=%s", sesInfo->user_name); - } + if (sesInfo->user_name) + dp += sprintf(dp, ";user=%s", sesInfo->user_name); - dp = description + strlen(description); - sprintf(dp, ";pid=0x%x", current->pid); + dp += sprintf(dp, ";pid=0x%x", current->pid); - if (sesInfo->upcall_target == UPTARGET_MOUNT) { - dp = description + strlen(description); - sprintf(dp, ";upcall_target=mount"); - } else { - dp = description + strlen(description); - sprintf(dp, ";upcall_target=app"); - } + if (sesInfo->upcall_target == UPTARGET_MOUNT) + dp += sprintf(dp, ";upcall_target=mount"); + else + dp += sprintf(dp, ";upcall_target=app"); cifs_dbg(FYI, "key description = %s\n", description); saved_cred = override_creds(spnego_cred); From 54d4f445517fe8350d735624d7f4225e7511d9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Larumbe?= Date: Fri, 8 Aug 2025 02:02:34 +0100 Subject: [PATCH 063/246] drm/panfrost: Print RSS for tiler heap BO's in debugfs GEMS file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise it would display the virtual allocation size, which is often much bigger than the RSS. Signed-off-by: Adrián Larumbe Fixes: e48ade5e23ba ("drm/panfrost: show device-wide list of DRM GEM objects over DebugFS") Tested-by: Christopher Healy Reviewed-by: Daniel Stone Signed-off-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250808010235.2831853-1-adrian.larumbe@collabora.com --- drivers/gpu/drm/panfrost/panfrost_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index bb73f2a68a12..85d6289a6eda 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -432,7 +432,7 @@ static void panfrost_gem_debugfs_bo_print(struct panfrost_gem_object *bo, if (!refcount) return; - resident_size = bo->base.pages ? bo->base.base.size : 0; + resident_size = panfrost_gem_rss(&bo->base.base); snprintf(creator_info, sizeof(creator_info), "%s/%d", bo->debugfs.creator.process_name, bo->debugfs.creator.tgid); From fd56b9c9507f32b16159f9a922e1af5628254567 Mon Sep 17 00:00:00 2001 From: Vinod Govindapillai Date: Tue, 29 Jul 2025 15:46:48 +0300 Subject: [PATCH 064/246] drm/i915/fbc: fix the implementation of wa_18038517565 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per the wa_18038517565, we need to disable FBC compressor clock gating before enabling FBC and enable after disabling FBC. Placing the enabling of clock gating in the fbc deactivate function can make the above wa logic go wrong in case of frontbuffer rendering FBC mechanism. FBC deactivate can get called during fb invalidate and then the corresponding FBC activate can get called without properly disabling the clock gating and can result in compression stalled. So move the enable clock gating at the end of one FBC session after FBC is completely disabled for a pipe. Bspec: 74212, 72197, 69741, 65555 Fixes: 010363c46189 ("drm/i915/display: implement wa_18038517565") Signed-off-by: Vinod Govindapillai Reviewed-by: Jouni Högander Link: https://lore.kernel.org/r/20250729124648.288497-1-vinod.govindapillai@intel.com (cherry picked from commit 82dde0407ab126f8413fd6c51429e5057ced5ba2) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_fbc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c index 6e26cb4c5724..685ac98bd001 100644 --- a/drivers/gpu/drm/i915/display/intel_fbc.c +++ b/drivers/gpu/drm/i915/display/intel_fbc.c @@ -552,10 +552,6 @@ static void ilk_fbc_deactivate(struct intel_fbc *fbc) if (dpfc_ctl & DPFC_CTL_EN) { dpfc_ctl &= ~DPFC_CTL_EN; intel_de_write(display, ILK_DPFC_CONTROL(fbc->id), dpfc_ctl); - - /* wa_18038517565 Enable DPFC clock gating after FBC disable */ - if (display->platform.dg2 || DISPLAY_VER(display) >= 14) - fbc_compressor_clkgate_disable_wa(fbc, false); } } @@ -1710,6 +1706,10 @@ static void __intel_fbc_disable(struct intel_fbc *fbc) __intel_fbc_cleanup_cfb(fbc); + /* wa_18038517565 Enable DPFC clock gating after FBC disable */ + if (display->platform.dg2 || DISPLAY_VER(display) >= 14) + fbc_compressor_clkgate_disable_wa(fbc, false); + fbc->state.plane = NULL; fbc->flip_pending = false; fbc->busy_bits = 0; From 184889dfe0568528fd6d14bba864dd57ed45bbf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jouni=20H=C3=B6gander?= Date: Fri, 1 Aug 2025 09:29:05 +0300 Subject: [PATCH 065/246] drm/i915/psr: Do not trigger Frame Change events from frontbuffer flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to get rid of triggering "Frame Change" events from frontbuffer flush calls. We are about to move using TRANS_PUSH register for this on LunarLake and onwards. Touching TRANS_PUSH register from fronbuffer flush would be problematic as it's written by DSB as well. Fix this by using intel_psr_exit when flush or invalidate is done on LunarLake and onwards. This is not possible on AlderLake and MeteorLake due to HW bug in PSR2 disable. This patch is also fixing problems with cursor plane where cursor is disappearing or duplicate cursor is seen on the screen. v2: Commit message updated Bspec: 68927, 68934, 66624 Reported-by: Janna Martl Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5522 Fixes: 411ad63877bb ("drm/i915/psr: Use SFF_CTL on invalidate/flush for LunarLake onwards") Tested-by: Janna Martl Signed-off-by: Jouni Högander Reviewed-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250801062905.564453-1-jouni.hogander@intel.com (cherry picked from commit 46fb38cb20c0d185a6391ab524b23e0e0219c41f) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_psr.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c index ae9053919211..41988e193a41 100644 --- a/drivers/gpu/drm/i915/display/intel_psr.c +++ b/drivers/gpu/drm/i915/display/intel_psr.c @@ -3275,7 +3275,9 @@ static void intel_psr_configure_full_frame_update(struct intel_dp *intel_dp) static void _psr_invalidate_handle(struct intel_dp *intel_dp) { - if (intel_dp->psr.psr2_sel_fetch_enabled) { + struct intel_display *display = to_intel_display(intel_dp); + + if (DISPLAY_VER(display) < 20 && intel_dp->psr.psr2_sel_fetch_enabled) { if (!intel_dp->psr.psr2_sel_fetch_cff_enabled) { intel_dp->psr.psr2_sel_fetch_cff_enabled = true; intel_psr_configure_full_frame_update(intel_dp); @@ -3361,7 +3363,7 @@ static void _psr_flush_handle(struct intel_dp *intel_dp) { struct intel_display *display = to_intel_display(intel_dp); - if (intel_dp->psr.psr2_sel_fetch_enabled) { + if (DISPLAY_VER(display) < 20 && intel_dp->psr.psr2_sel_fetch_enabled) { if (intel_dp->psr.psr2_sel_fetch_cff_enabled) { /* can we turn CFF off? */ if (intel_dp->psr.busy_frontbuffer_bits == 0) @@ -3378,11 +3380,13 @@ static void _psr_flush_handle(struct intel_dp *intel_dp) * existing SU configuration */ intel_psr_configure_full_frame_update(intel_dp); + + intel_psr_force_update(intel_dp); + } else { + intel_psr_exit(intel_dp); } - intel_psr_force_update(intel_dp); - - if (!intel_dp->psr.psr2_sel_fetch_enabled && !intel_dp->psr.active && + if ((!intel_dp->psr.psr2_sel_fetch_enabled || DISPLAY_VER(display) >= 20) && !intel_dp->psr.busy_frontbuffer_bits) queue_work(display->wq.unordered, &intel_dp->psr.work); } From 5eb1bcdb6a8c088514019c3a9bda5d565beed1af Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Tue, 22 Jul 2025 13:18:53 +0530 Subject: [PATCH 066/246] x86/sev: Improve handling of writes to intercepted TSC MSRs Currently, when a Secure TSC enabled SNP guest attempts to write to the intercepted GUEST_TSC_FREQ MSR (a read-only MSR), the guest kernel response incorrectly implies a VMM configuration error, when in fact it is the usual VMM configuration to intercept writes to read-only MSRs, unless explicitly documented. Modify the intercepted TSC MSR #VC handling: * Write to GUEST_TSC_FREQ will generate a #GP instead of terminating the guest * Write to MSR_IA32_TSC will generate a #GP instead of silently ignoring it However, continue to terminate the guest when reading from intercepted GUEST_TSC_FREQ MSR with Secure TSC enabled, as intercepted reads indicate an improper VMM configuration for Secure TSC enabled SNP guests. [ bp: simplify comment. ] Fixes: 38cc6495cdec ("x86/sev: Prevent GUEST_TSC_FREQ MSR interception for Secure TSC enabled guests") Suggested-by: Sean Christopherson Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Link: https://lore.kernel.org/20250722074853.22253-1-nikunj@amd.com --- arch/x86/coco/sev/vc-handle.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/arch/x86/coco/sev/vc-handle.c b/arch/x86/coco/sev/vc-handle.c index faf1fce89ed4..c3b4acbde0d8 100644 --- a/arch/x86/coco/sev/vc-handle.c +++ b/arch/x86/coco/sev/vc-handle.c @@ -371,29 +371,30 @@ static enum es_result __vc_handle_msr_caa(struct pt_regs *regs, bool write) * executing with Secure TSC enabled, so special handling is required for * accesses of MSR_IA32_TSC and MSR_AMD64_GUEST_TSC_FREQ. */ -static enum es_result __vc_handle_secure_tsc_msrs(struct pt_regs *regs, bool write) +static enum es_result __vc_handle_secure_tsc_msrs(struct es_em_ctxt *ctxt, bool write) { + struct pt_regs *regs = ctxt->regs; u64 tsc; /* - * GUEST_TSC_FREQ should not be intercepted when Secure TSC is enabled. - * Terminate the SNP guest when the interception is enabled. + * Writing to MSR_IA32_TSC can cause subsequent reads of the TSC to + * return undefined values, and GUEST_TSC_FREQ is read-only. Generate + * a #GP on all writes. + */ + if (write) { + ctxt->fi.vector = X86_TRAP_GP; + ctxt->fi.error_code = 0; + return ES_EXCEPTION; + } + + /* + * GUEST_TSC_FREQ read should not be intercepted when Secure TSC is + * enabled. Terminate the guest if a read is attempted. */ if (regs->cx == MSR_AMD64_GUEST_TSC_FREQ) return ES_VMM_ERROR; - /* - * Writes: Writing to MSR_IA32_TSC can cause subsequent reads of the TSC - * to return undefined values, so ignore all writes. - * - * Reads: Reads of MSR_IA32_TSC should return the current TSC value, use - * the value returned by rdtsc_ordered(). - */ - if (write) { - WARN_ONCE(1, "TSC MSR writes are verboten!\n"); - return ES_OK; - } - + /* Reads of MSR_IA32_TSC should return the current TSC value. */ tsc = rdtsc_ordered(); regs->ax = lower_32_bits(tsc); regs->dx = upper_32_bits(tsc); @@ -416,7 +417,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) case MSR_IA32_TSC: case MSR_AMD64_GUEST_TSC_FREQ: if (sev_status & MSR_AMD64_SNP_SECURE_TSC) - return __vc_handle_secure_tsc_msrs(regs, write); + return __vc_handle_secure_tsc_msrs(ctxt, write); break; default: break; From dcb82900b12f5809e66835918d4043284ce1d39c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 10 Aug 2025 23:41:05 +0200 Subject: [PATCH 067/246] ASoC: codecs: Call strscpy() with correct size argument In aw8xxxx_profile_info(), strscpy() is called with the length of the source string "null" rather than the size of the destination buffer. This is fine as long as the destination buffer is larger than the source string, but we should still use the destination buffer size instead to call strscpy() as intended. And since 'name' points to the fixed-size buffer 'uinfo->value.enumerated.name', we can safely omit the size argument and let strscpy() infer it using sizeof() and remove 'name'. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250810214144.1985-2-thorsten.blum@linux.dev Signed-off-by: Mark Brown --- sound/soc/codecs/aw87390.c | 8 +++----- sound/soc/codecs/aw88081.c | 5 ++--- sound/soc/codecs/aw88166.c | 8 +++----- sound/soc/codecs/aw88261.c | 8 +++----- sound/soc/codecs/aw88395/aw88395.c | 8 +++----- sound/soc/codecs/aw88399.c | 8 +++----- 6 files changed, 17 insertions(+), 28 deletions(-) diff --git a/sound/soc/codecs/aw87390.c b/sound/soc/codecs/aw87390.c index 110009616966..ef6f64856988 100644 --- a/sound/soc/codecs/aw87390.c +++ b/sound/soc/codecs/aw87390.c @@ -177,7 +177,7 @@ static int aw87390_profile_info(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct aw87390 *aw87390 = snd_soc_component_get_drvdata(codec); - char *prof_name, *name; + char *prof_name; int count, ret; uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; @@ -194,17 +194,15 @@ static int aw87390_profile_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item >= count) uinfo->value.enumerated.item = count - 1; - name = uinfo->value.enumerated.name; count = uinfo->value.enumerated.item; ret = aw87390_dev_get_prof_name(aw87390->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - strlen("null") + 1); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } diff --git a/sound/soc/codecs/aw88081.c b/sound/soc/codecs/aw88081.c index 3dd8428f08cc..d61a7b8c5470 100644 --- a/sound/soc/codecs/aw88081.c +++ b/sound/soc/codecs/aw88081.c @@ -914,12 +914,11 @@ static int aw88081_profile_info(struct snd_kcontrol *kcontrol, ret = aw88081_dev_get_prof_name(aw88081->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(uinfo->value.enumerated.name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } diff --git a/sound/soc/codecs/aw88166.c b/sound/soc/codecs/aw88166.c index 4f76ebe11cc7..28f62b991ef2 100644 --- a/sound/soc/codecs/aw88166.c +++ b/sound/soc/codecs/aw88166.c @@ -1478,7 +1478,7 @@ static int aw88166_profile_info(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct aw88166 *aw88166 = snd_soc_component_get_drvdata(codec); - char *prof_name, *name; + char *prof_name; int count, ret; uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; @@ -1495,17 +1495,15 @@ static int aw88166_profile_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item >= count) uinfo->value.enumerated.item = count - 1; - name = uinfo->value.enumerated.name; count = uinfo->value.enumerated.item; ret = aw88166_dev_get_prof_name(aw88166->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - strlen("null") + 1); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } diff --git a/sound/soc/codecs/aw88261.c b/sound/soc/codecs/aw88261.c index fb99871578c5..de11ae8dd9d9 100644 --- a/sound/soc/codecs/aw88261.c +++ b/sound/soc/codecs/aw88261.c @@ -819,7 +819,7 @@ static int aw88261_profile_info(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct aw88261 *aw88261 = snd_soc_component_get_drvdata(codec); - char *prof_name, *name; + char *prof_name; int count, ret; uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; @@ -836,17 +836,15 @@ static int aw88261_profile_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item >= count) uinfo->value.enumerated.item = count - 1; - name = uinfo->value.enumerated.name; count = uinfo->value.enumerated.item; ret = aw88261_dev_get_prof_name(aw88261->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - strlen("null") + 1); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } diff --git a/sound/soc/codecs/aw88395/aw88395.c b/sound/soc/codecs/aw88395/aw88395.c index aea44a199b98..fb563b4c6971 100644 --- a/sound/soc/codecs/aw88395/aw88395.c +++ b/sound/soc/codecs/aw88395/aw88395.c @@ -175,7 +175,7 @@ static int aw88395_profile_info(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct aw88395 *aw88395 = snd_soc_component_get_drvdata(codec); - char *prof_name, *name; + char *prof_name; int count, ret; uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; @@ -192,17 +192,15 @@ static int aw88395_profile_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item >= count) uinfo->value.enumerated.item = count - 1; - name = uinfo->value.enumerated.name; count = uinfo->value.enumerated.item; ret = aw88395_dev_get_prof_name(aw88395->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - strlen("null") + 1); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } diff --git a/sound/soc/codecs/aw88399.c b/sound/soc/codecs/aw88399.c index c23e70d64d0c..58846feb013d 100644 --- a/sound/soc/codecs/aw88399.c +++ b/sound/soc/codecs/aw88399.c @@ -1831,7 +1831,7 @@ static int aw88399_profile_info(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct aw88399 *aw88399 = snd_soc_component_get_drvdata(codec); - char *prof_name, *name; + char *prof_name; int count, ret; uinfo->type = SNDRV_CTL_ELEM_TYPE_ENUMERATED; @@ -1848,17 +1848,15 @@ static int aw88399_profile_info(struct snd_kcontrol *kcontrol, if (uinfo->value.enumerated.item >= count) uinfo->value.enumerated.item = count - 1; - name = uinfo->value.enumerated.name; count = uinfo->value.enumerated.item; ret = aw88399_dev_get_prof_name(aw88399->aw_pa, count, &prof_name); if (ret) { - strscpy(uinfo->value.enumerated.name, "null", - strlen("null") + 1); + strscpy(uinfo->value.enumerated.name, "null"); return 0; } - strscpy(name, prof_name, sizeof(uinfo->value.enumerated.name)); + strscpy(uinfo->value.enumerated.name, prof_name); return 0; } From d405ec23df13e6df599f5bd965a55d13420366b8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 12 Aug 2025 14:57:06 +0200 Subject: [PATCH 068/246] ACPI: processor: perflib: Move problematic pr->performance check Commit d33bd88ac0eb ("ACPI: processor: perflib: Fix initial _PPC limit application") added a pr->performance check that prevents the frequency QoS request from being added when the given processor has no performance object. Unfortunately, this causes a WARN() in freq_qos_remove_request() to trigger on an attempt to take the given CPU offline later because the frequency QoS object has not been added for it due to the missing performance object. Address this by moving the pr->performance check before calling acpi_processor_get_platform_limit() so it only prevents a limit from being set for the CPU if the performance object is not present. This way, the frequency QoS request is added as it was before the above commit and it is present all the time along with the CPU's cpufreq policy regardless of whether or not the CPU is online. Fixes: d33bd88ac0eb ("ACPI: processor: perflib: Fix initial _PPC limit application") Tested-by: Rafael J. Wysocki Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2801421.mvXUDI8C0e@rafael.j.wysocki --- drivers/acpi/processor_perflib.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 755003bf3a45..8972446b7162 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -180,7 +180,7 @@ void acpi_processor_ppc_init(struct cpufreq_policy *policy) struct acpi_processor *pr = per_cpu(processors, cpu); int ret; - if (!pr || !pr->performance) + if (!pr) continue; /* @@ -197,6 +197,9 @@ void acpi_processor_ppc_init(struct cpufreq_policy *policy) pr_err("Failed to add freq constraint for CPU%d (%d)\n", cpu, ret); + if (!pr->performance) + continue; + ret = acpi_processor_get_platform_limit(pr); if (ret) pr_err("Failed to update freq constraint for CPU%d (%d)\n", From 56bdf7270ff4f870e2d4bfacdc00161e766dba2d Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 11 Aug 2025 13:50:44 -0400 Subject: [PATCH 069/246] Revert "gpio: mlxbf3: only get IRQ for device instance 0" This reverts commit 10af0273a35ab4513ca1546644b8c853044da134. While this change was merged, it is not the preferred solution. During review of a similar change to the gpio-mlxbf2 driver, the use of "platform_get_irq_optional" was identified as the preferred solution, so let's use it for gpio-mlxbf3 driver as well. Cc: stable@vger.kernel.org Fixes: 10af0273a35a ("gpio: mlxbf3: only get IRQ for device instance 0") Signed-off-by: David Thompson Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/8d2b630c71b3742f2c74242cf7d602706a6108e6.1754928650.git.davthompson@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 52 +++++++++++++------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index 9875e34bde72..10ea71273c89 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -190,9 +190,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) struct mlxbf3_gpio_context *gs; struct gpio_irq_chip *girq; struct gpio_chip *gc; - char *colon_ptr; int ret, irq; - long num; gs = devm_kzalloc(dev, sizeof(*gs), GFP_KERNEL); if (!gs) @@ -229,39 +227,25 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gc->owner = THIS_MODULE; gc->add_pin_ranges = mlxbf3_gpio_add_pin_ranges; - colon_ptr = strchr(dev_name(dev), ':'); - if (!colon_ptr) { - dev_err(dev, "invalid device name format\n"); - return -EINVAL; - } + irq = platform_get_irq(pdev, 0); + if (irq >= 0) { + girq = &gs->gc.irq; + gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); + girq->default_type = IRQ_TYPE_NONE; + /* This will let us handle the parent IRQ in the driver */ + girq->num_parents = 0; + girq->parents = NULL; + girq->parent_handler = NULL; + girq->handler = handle_bad_irq; - ret = kstrtol(++colon_ptr, 16, &num); - if (ret) { - dev_err(dev, "invalid device instance\n"); - return ret; - } - - if (!num) { - irq = platform_get_irq(pdev, 0); - if (irq >= 0) { - girq = &gs->gc.irq; - gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); - girq->default_type = IRQ_TYPE_NONE; - /* This will let us handle the parent IRQ in the driver */ - girq->num_parents = 0; - girq->parents = NULL; - girq->parent_handler = NULL; - girq->handler = handle_bad_irq; - - /* - * Directly request the irq here instead of passing - * a flow-handler because the irq is shared. - */ - ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler, - IRQF_SHARED, dev_name(dev), gs); - if (ret) - return dev_err_probe(dev, ret, "failed to request IRQ"); - } + /* + * Directly request the irq here instead of passing + * a flow-handler because the irq is shared. + */ + ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler, + IRQF_SHARED, dev_name(dev), gs); + if (ret) + return dev_err_probe(dev, ret, "failed to request IRQ"); } platform_set_drvdata(pdev, gs); From 810bd9066fb1871b8a9528f31f2fdbf2a8b73bf2 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 11 Aug 2025 13:50:45 -0400 Subject: [PATCH 070/246] gpio: mlxbf3: use platform_get_irq_optional() The gpio-mlxbf3 driver interfaces with two GPIO controllers, device instance 0 and 1. There is a single IRQ resource shared between the two controllers, and it is found in the ACPI table for device instance 0. The driver should not use platform_get_irq(), otherwise this error is logged when probing instance 1: mlxbf3_gpio MLNXBF33:01: error -ENXIO: IRQ index 0 not found Cc: stable@vger.kernel.org Fixes: cd33f216d241 ("gpio: mlxbf3: Add gpio driver support") Signed-off-by: David Thompson Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/ce70b98a201ce82b9df9aa80ac7a5eeaa2268e52.1754928650.git.davthompson@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index 10ea71273c89..ed29b07d16c1 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -227,7 +227,7 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gc->owner = THIS_MODULE; gc->add_pin_ranges = mlxbf3_gpio_add_pin_ranges; - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq >= 0) { girq = &gs->gc.irq; gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); From 3f69f2e78799bf76e5dfe74f2eda4d67812d4edc Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 07:41:44 +0200 Subject: [PATCH 071/246] PCI: xilinx: Fix NULL pointer dereference in xilinx_pcie_intr_handler() f29861aa301c5 ("PCI: xilinx: Switch to msi_create_parent_irq_domain()") changed xilinx_pcie::msi_domain from child devices' interrupt domain to Xilinx AXI bridge's interrupt domain. However, xilinx_pcie_intr_handler() wasn't changed and still reads Xilinx AXI bridge's interrupt domain from xilinx_pcie::msi_domain->parent. This pointer is NULL now. Update xilinx_pcie_intr_handler() to read the correct interrupt domain pointer. Fixes: f29861aa301c5 ("PCI: xilinx: Switch to msi_create_parent_irq_domain()") Signed-off-by: Nam Cao Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250811054144.4049448-1-namcao@linutronix.de --- drivers/pci/controller/pcie-xilinx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-xilinx.c b/drivers/pci/controller/pcie-xilinx.c index f121836c3cf4..937ea6ae1ac4 100644 --- a/drivers/pci/controller/pcie-xilinx.c +++ b/drivers/pci/controller/pcie-xilinx.c @@ -400,7 +400,7 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data) if (val & XILINX_PCIE_RPIFR1_MSI_INTR) { val = pcie_read(pcie, XILINX_PCIE_REG_RPIFR2) & XILINX_PCIE_RPIFR2_MSG_DATA; - domain = pcie->msi_domain->parent; + domain = pcie->msi_domain; } else { val = (val & XILINX_PCIE_RPIFR1_INTR_MASK) >> XILINX_PCIE_RPIFR1_INTR_SHIFT; From 9d7a1cbebbb691891671def57407ba2f8ee914e8 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 31 Jul 2025 10:38:09 +0100 Subject: [PATCH 072/246] drm/xe/migrate: prevent infinite recursion If the buf + offset is not aligned to XE_CAHELINE_BYTES we fallback to using a bounce buffer. However the bounce buffer here is allocated on the stack, and the only alignment requirement here is that it's naturally aligned to u8, and not XE_CACHELINE_BYTES. If the bounce buffer is also misaligned we then recurse back into the function again, however the new bounce buffer might also not be aligned, and might never be until we eventually blow through the stack, as we keep recursing. Instead of using the stack use kmalloc, which should respect the power-of-two alignment request here. Fixes a kernel panic when triggering this path through eudebug. v2 (Stuart): - Add build bug check for power-of-two restriction - s/EINVAL/ENOMEM/ Fixes: 270172f64b11 ("drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access") Signed-off-by: Matthew Auld Cc: Maciej Patelczyk Cc: Stuart Summers Cc: Matthew Brost Reviewed-by: Stuart Summers Link: https://lore.kernel.org/r/20250731093807.207572-6-matthew.auld@intel.com (cherry picked from commit 38b34e928a08ba594c4bbf7118aa3aadacd62fff) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_migrate.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index ba1cff2e4cda..6193e2ca3741 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1820,15 +1820,19 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) || !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) { int buf_offset = 0; + void *bounce; + int err; + + BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES)); + bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL); + if (!bounce) + return -ENOMEM; /* * Less than ideal for large unaligned access but this should be * fairly rare, can fixup if this becomes common. */ do { - u8 bounce[XE_CACHELINE_BYTES]; - void *ptr = (void *)bounce; - int err; int copy_bytes = min_t(int, bytes_left, XE_CACHELINE_BYTES - (offset & XE_CACHELINE_MASK)); @@ -1837,22 +1841,22 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, err = xe_migrate_access_memory(m, bo, offset & ~XE_CACHELINE_MASK, - (void *)ptr, - sizeof(bounce), 0); + bounce, + XE_CACHELINE_BYTES, 0); if (err) - return err; + break; if (write) { - memcpy(ptr + ptr_offset, buf + buf_offset, copy_bytes); + memcpy(bounce + ptr_offset, buf + buf_offset, copy_bytes); err = xe_migrate_access_memory(m, bo, offset & ~XE_CACHELINE_MASK, - (void *)ptr, - sizeof(bounce), write); + bounce, + XE_CACHELINE_BYTES, write); if (err) - return err; + break; } else { - memcpy(buf + buf_offset, ptr + ptr_offset, + memcpy(buf + buf_offset, bounce + ptr_offset, copy_bytes); } @@ -1861,7 +1865,8 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, offset += copy_bytes; } while (bytes_left); - return 0; + kfree(bounce); + return err; } dma_addr = xe_migrate_dma_map(xe, buf, len + page_offset, write); From 4126cb327a2e3273c81fcef1c594c5b7b645c44c Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 31 Jul 2025 10:38:10 +0100 Subject: [PATCH 073/246] drm/xe/migrate: don't overflow max copy size With non-page aligned copy, we need to use 4 byte aligned pitch, however the size itself might still be close to our maximum of ~8M, and so the dimensions of the copy can easily exceed the S16_MAX limit of the copy command leading to the following assert: xe 0000:03:00.0: [drm] Assertion `size / pitch <= ((s16)(((u16)~0U) >> 1))` failed! platform: BATTLEMAGE subplatform: 1 graphics: Xe2_HPG 20.01 step A0 media: Xe2_HPM 13.01 step A1 tile: 0 VRAM 10.0 GiB GT: 0 type 1 WARNING: CPU: 23 PID: 10605 at drivers/gpu/drm/xe/xe_migrate.c:673 emit_copy+0x4b5/0x4e0 [xe] To fix this account for the pitch when calculating the number of current bytes to copy. Fixes: 270172f64b11 ("drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access") Signed-off-by: Matthew Auld Cc: Maciej Patelczyk Cc: Matthew Brost Reviewed-by: Stuart Summers Link: https://lore.kernel.org/r/20250731093807.207572-7-matthew.auld@intel.com (cherry picked from commit 8c2d61e0e916e077fda7e7b8e67f25ffe0f361fc) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_migrate.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 6193e2ca3741..95bcaa427d26 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1887,6 +1887,12 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, else current_bytes = min_t(int, bytes_left, cursor.size); + if (current_bytes & ~PAGE_MASK) { + int pitch = 4; + + current_bytes = min_t(int, current_bytes, S16_MAX * pitch); + } + if (fence) dma_fence_put(fence); From 145832fbdd17b1d77ffd6cdd1642259e101d1b7e Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 31 Jul 2025 10:38:11 +0100 Subject: [PATCH 074/246] drm/xe/migrate: prevent potential UAF If we hit the error path, the previous fence (if there is one) has already been put() prior to this, so doing a fence_wait could lead to UAF. Tweak the flow to do to the put() until after we do the wait. Fixes: 270172f64b11 ("drm/xe: Update xe_ttm_access_memory to use GPU for non-visible access") Signed-off-by: Matthew Auld Cc: Maciej Patelczyk Cc: Matthew Brost Reviewed-by: Stuart Summers Link: https://lore.kernel.org/r/20250731093807.207572-8-matthew.auld@intel.com (cherry picked from commit 9b7ca35ed28fe5fad86e9d9c24ebd1271e4c9c3e) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_migrate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 95bcaa427d26..7d20ac4bb633 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -1893,9 +1893,6 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, current_bytes = min_t(int, current_bytes, S16_MAX * pitch); } - if (fence) - dma_fence_put(fence); - __fence = xe_migrate_vram(m, current_bytes, (unsigned long)buf & ~PAGE_MASK, dma_addr + current_page, @@ -1903,11 +1900,15 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, XE_MIGRATE_COPY_TO_VRAM : XE_MIGRATE_COPY_TO_SRAM); if (IS_ERR(__fence)) { - if (fence) + if (fence) { dma_fence_wait(fence, false); + dma_fence_put(fence); + } fence = __fence; goto out_err; } + + dma_fence_put(fence); fence = __fence; buf += current_bytes; From 2dd7a47669ae6c1da18c55f8e89c4a44418c7006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Tue, 5 Aug 2025 09:48:42 +0200 Subject: [PATCH 075/246] drm/xe: Defer buffer object shrinker write-backs and GPU waits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the xe buffer-object shrinker allows GPU waits and write-back, (typically from kswapd), perform multiple passes, skipping subsequent passes if the shrinker number of scanned objects target is reached. 1) Without GPU waits and write-back 2) Without write-back 3) With both GPU-waits and write-back This is to avoid stalls and costly write- and readbacks unless they are really necessary. v2: - Don't test for scan completion twice. (Stuart Summers) - Update tags. Reported-by: melvyn Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/5557 Cc: Summers Stuart Fixes: 00c8efc3180f ("drm/xe: Add a shrinker for xe bos") Cc: # v6.15+ Signed-off-by: Thomas Hellström Reviewed-by: Stuart Summers Link: https://lore.kernel.org/r/20250805074842.11359-1-thomas.hellstrom@linux.intel.com (cherry picked from commit 80944d334182ce5eb27d00e2bf20a88bfc32dea1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_shrinker.c | 51 +++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_shrinker.c b/drivers/gpu/drm/xe/xe_shrinker.c index 1c3c04d52f55..90244fe59b59 100644 --- a/drivers/gpu/drm/xe/xe_shrinker.c +++ b/drivers/gpu/drm/xe/xe_shrinker.c @@ -54,10 +54,10 @@ xe_shrinker_mod_pages(struct xe_shrinker *shrinker, long shrinkable, long purgea write_unlock(&shrinker->lock); } -static s64 xe_shrinker_walk(struct xe_device *xe, - struct ttm_operation_ctx *ctx, - const struct xe_bo_shrink_flags flags, - unsigned long to_scan, unsigned long *scanned) +static s64 __xe_shrinker_walk(struct xe_device *xe, + struct ttm_operation_ctx *ctx, + const struct xe_bo_shrink_flags flags, + unsigned long to_scan, unsigned long *scanned) { unsigned int mem_type; s64 freed = 0, lret; @@ -93,6 +93,48 @@ static s64 xe_shrinker_walk(struct xe_device *xe, return freed; } +/* + * Try shrinking idle objects without writeback first, then if not sufficient, + * try also non-idle objects and finally if that's not sufficient either, + * add writeback. This avoids stalls and explicit writebacks with light or + * moderate memory pressure. + */ +static s64 xe_shrinker_walk(struct xe_device *xe, + struct ttm_operation_ctx *ctx, + const struct xe_bo_shrink_flags flags, + unsigned long to_scan, unsigned long *scanned) +{ + bool no_wait_gpu = true; + struct xe_bo_shrink_flags save_flags = flags; + s64 lret, freed; + + swap(no_wait_gpu, ctx->no_wait_gpu); + save_flags.writeback = false; + lret = __xe_shrinker_walk(xe, ctx, save_flags, to_scan, scanned); + swap(no_wait_gpu, ctx->no_wait_gpu); + if (lret < 0 || *scanned >= to_scan) + return lret; + + freed = lret; + if (!ctx->no_wait_gpu) { + lret = __xe_shrinker_walk(xe, ctx, save_flags, to_scan, scanned); + if (lret < 0) + return lret; + freed += lret; + if (*scanned >= to_scan) + return freed; + } + + if (flags.writeback) { + lret = __xe_shrinker_walk(xe, ctx, flags, to_scan, scanned); + if (lret < 0) + return lret; + freed += lret; + } + + return freed; +} + static unsigned long xe_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -199,6 +241,7 @@ static unsigned long xe_shrinker_scan(struct shrinker *shrink, struct shrink_con runtime_pm = xe_shrinker_runtime_pm_get(shrinker, true, 0, can_backup); shrink_flags.purge = false; + lret = xe_shrinker_walk(shrinker->xe, &ctx, shrink_flags, nr_to_scan, &nr_scanned); if (lret >= 0) From 55d49f06162e45686399df4ae6292167f0deab7c Mon Sep 17 00:00:00 2001 From: Karthik Poosa Date: Sat, 9 Aug 2025 00:23:10 +0530 Subject: [PATCH 076/246] drm/xe/hwmon: Add SW clamp for power limits writes Clamp writes to power limits powerX_crit/currX_crit, powerX_cap, powerX_max, to the maximum supported by the pcode mailbox when sysfs-provided values exceed this limit. Although the pcode already performs clamping, values beyond the pcode mailbox's supported range get truncated, leading to incorrect critical power settings. This patch ensures proper clamping to prevent such truncation. v2: - Address below review comments. (Riana) - Split comments into multiple sentences. - Use local variables for readability. - Add a debug log. - Use u64 instead of unsigned long. v3: - Change drm_dbg logs to drm_info. (Badal) v4: - Rephrase the drm_info log. (Rodrigo, Riana) - Rename variable max_mbx_power_limit to max_supp_power_limit, as limit is same for platforms with and without mailbox power limit support. Signed-off-by: Karthik Poosa Fixes: 92d44a422d0d ("drm/xe/hwmon: Expose card reactive critical power") Fixes: fb1b70607f73 ("drm/xe/hwmon: Expose power attributes") Reviewed-by: Riana Tauro Link: https://lore.kernel.org/r/20250808185310.3466529-1-karthik.poosa@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit d301eb950da59f962bafe874cf5eb6d61a85b2c2) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_hwmon.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index f08fc4377d25..c17ed1ae8649 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -332,6 +332,7 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe int ret = 0; u32 reg_val, max; struct xe_reg rapl_limit; + u64 max_supp_power_limit = 0; mutex_lock(&hwmon->hwmon_lock); @@ -356,6 +357,20 @@ static int xe_hwmon_power_max_write(struct xe_hwmon *hwmon, u32 attr, int channe goto unlock; } + /* + * If the sysfs value exceeds the maximum pcode supported power limit value, clamp it to + * the supported maximum (U12.3 format). + * This is to avoid truncation during reg_val calculation below and ensure the valid + * power limit is sent for pcode which would clamp it to card-supported value. + */ + max_supp_power_limit = ((PWR_LIM_VAL) >> hwmon->scl_shift_power) * SF_POWER; + if (value > max_supp_power_limit) { + value = max_supp_power_limit; + drm_info(&hwmon->xe->drm, + "Power limit clamped as selected %s exceeds channel %d limit\n", + PWR_ATTR_TO_STR(attr), channel); + } + /* Computation in 64-bits to avoid overflow. Round to nearest. */ reg_val = DIV_ROUND_CLOSEST_ULL((u64)value << hwmon->scl_shift_power, SF_POWER); @@ -739,9 +754,23 @@ static int xe_hwmon_power_curr_crit_write(struct xe_hwmon *hwmon, int channel, { int ret; u32 uval; + u64 max_crit_power_curr = 0; mutex_lock(&hwmon->hwmon_lock); + /* + * If the sysfs value exceeds the pcode mailbox cmd POWER_SETUP_SUBCOMMAND_WRITE_I1 + * max supported value, clamp it to the command's max (U10.6 format). + * This is to avoid truncation during uval calculation below and ensure the valid power + * limit is sent for pcode which would clamp it to card-supported value. + */ + max_crit_power_curr = (POWER_SETUP_I1_DATA_MASK >> POWER_SETUP_I1_SHIFT) * scale_factor; + if (value > max_crit_power_curr) { + value = max_crit_power_curr; + drm_info(&hwmon->xe->drm, + "Power limit clamped as selected exceeds channel %d limit\n", + channel); + } uval = DIV_ROUND_CLOSEST_ULL(value << POWER_SETUP_I1_SHIFT, scale_factor); ret = xe_hwmon_pcode_write_i1(hwmon, uval); From 963e22c084c2b6097e1e635d29c6336881f67708 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 29 Jul 2025 08:20:38 +0200 Subject: [PATCH 077/246] ACPI: EC: Relax sanity check of the ECDT ID string It turns out that the ECDT table inside the ThinkBook 14 G7 IML contains a valid EC description but an invalid ID string ("_SB.PC00.LPCB.EC0"). Ignoring this ECDT based on the invalid ID string prevents the kernel from detecting the built-in touchpad, so relax the sanity check of the ID string and only reject ECDTs with empty ID strings. Reported-by: Ilya K Fixes: 7a0d59f6a913 ("ACPI: EC: Ignore ECDT tables with an invalid ID string") Signed-off-by: Armin Wolf Tested-by: Ilya K Link: https://patch.msgid.link/20250729062038.303734-1-W_Armin@gmx.de Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 75c7db8b156a..7855bbf752b1 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -2033,7 +2033,7 @@ void __init acpi_ec_ecdt_probe(void) goto out; } - if (!strstarts(ecdt_ptr->id, "\\")) { + if (!strlen(ecdt_ptr->id)) { /* * The ECDT table on some MSI notebooks contains invalid data, together * with an empty ID string (""). @@ -2042,9 +2042,13 @@ void __init acpi_ec_ecdt_probe(void) * a "fully qualified reference to the (...) embedded controller device", * so this string always has to start with a backslash. * - * By verifying this we can avoid such faulty ECDT tables in a safe way. + * However some ThinkBook machines have a ECDT table with a valid EC + * description but an invalid ID string ("_SB.PC00.LPCB.EC0"). + * + * Because of this we only check if the ID string is empty in order to + * avoid the obvious cases. */ - pr_err(FW_BUG "Ignoring ECDT due to invalid ID string \"%s\"\n", ecdt_ptr->id); + pr_err(FW_BUG "Ignoring ECDT due to empty ID string\n"); goto out; } From 5149bbb56bdcf5c5f72904025fbb502217580b63 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 11 Aug 2025 07:39:35 +0200 Subject: [PATCH 078/246] PCI: vmd: Remove MSI-X check on child devices d7d8ab87e3e7 ("PCI: vmd: Switch to msi_create_parent_irq_domain()") added a WARN_ON sanity check that child devices support MSI-X, because VMD document says [1]: Intel VMD only supports MSIx Interrupts from child devices and therefore the BIOS must enable PCIe Hot Plug and MSIx interrups [sic]. However, the VMD device can't even tell the difference between a child device using MSI and one using MSI-X. Per 185a383ada2e ("x86/PCI: Add driver for Intel Volume Management Device (VMD)"), VMD does not support INTx interrupts, but does support child devices using either MSI or MSI-X. Remove the sanity check to avoid the WARN_ON and allow child devices to use MSI, reported by Ammar. Fixes: d7d8ab87e3e7 ("PCI: vmd: Switch to msi_create_parent_irq_domain()") Link: https://cdrdv2-public.intel.com/776857/VMD_White_Paper.pdf [1] Reported-by: Ammar Faizi Closes: https://lore.kernel.org/linux-pci/aJXYhfc%2F6DfcqfqF@linux.gnuweeb.org/ Signed-off-by: Nam Cao [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Tested-by: Ammar Faizi Link: https://patch.msgid.link/20250811053935.4049211-1-namcao@linutronix.de --- drivers/pci/controller/vmd.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index b679c7f28f51..1bd5bf4a6097 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -306,9 +306,6 @@ static bool vmd_init_dev_msi_info(struct device *dev, struct irq_domain *domain, struct irq_domain *real_parent, struct msi_domain_info *info) { - if (WARN_ON_ONCE(info->bus_token != DOMAIN_BUS_PCI_DEVICE_MSIX)) - return false; - if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) return false; From abbf9a44944171ca99c150adad9361a2f517d3b6 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 27 Jul 2025 11:23:17 +0200 Subject: [PATCH 079/246] rust: workaround `rustdoc` target modifiers bug Starting with Rust 1.88.0 (released 2025-06-26), `rustdoc` complains about a target modifier mismatch in configurations where `-Zfixed-x18` is passed: error: mixing `-Zfixed-x18` will cause an ABI mismatch in crate `rust_out` | = help: the `-Zfixed-x18` flag modifies the ABI so Rust crates compiled with different values of this flag cannot be used together safely = note: unset `-Zfixed-x18` in this crate is incompatible with `-Zfixed-x18=` in dependency `core` = help: set `-Zfixed-x18=` in this crate or unset `-Zfixed-x18` in `core` = help: if you are sure this will not cause problems, you may use `-Cunsafe-allow-abi-mismatch=fixed-x18` to silence this error The reason is that `rustdoc` was not passing the target modifiers when configuring the session options, and thus it would report a mismatch that did not exist as soon as a target modifier is used in a dependency. We did not notice it in the kernel until now because `-Zfixed-x18` has been a target modifier only since 1.88.0 (and it is the only one we use so far). The issue has been reported upstream [1] and a fix has been submitted [2], including a test similar to the kernel case. [ This is now fixed upstream (thanks Guillaume for the quick review), so it will be fixed in Rust 1.90.0 (expected 2025-09-18). - Miguel ] Meanwhile, conditionally pass `-Cunsafe-allow-abi-mismatch=fixed-x18` to workaround the issue on our side. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). Reported-by: Konrad Dybcio Closes: https://lore.kernel.org/rust-for-linux/36cdc798-524f-4910-8b77-d7b9fac08d77@oss.qualcomm.com/ Link: https://github.com/rust-lang/rust/issues/144521 [1] Link: https://github.com/rust-lang/rust/pull/144523 [2] Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250727092317.2930617-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rust/Makefile b/rust/Makefile index 4263462b8470..d47f82588d78 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -65,6 +65,10 @@ core-cfgs = \ core-edition := $(if $(call rustc-min-version,108700),2024,2021) +# `rustdoc` did not save the target modifiers, thus workaround for +# the time being (https://github.com/rust-lang/rust/issues/144521). +rustdoc_modifiers_workaround := $(if $(call rustc-min-version,108800),-Cunsafe-allow-abi-mismatch=fixed-x18) + # `rustc` recognizes `--remap-path-prefix` since 1.26.0, but `rustdoc` only # since Rust 1.81.0. Moreover, `rustdoc` ICEs on out-of-tree builds since Rust # 1.82.0 (https://github.com/rust-lang/rust/issues/138520). Thus workaround both @@ -77,6 +81,7 @@ quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $< -Zunstable-options --generate-link-to-definition \ --output $(rustdoc_output) \ --crate-name $(subst rustdoc-,,$@) \ + $(rustdoc_modifiers_workaround) \ $(if $(rustdoc_host),,--sysroot=/dev/null) \ @$(objtree)/include/generated/rustc_cfg $< @@ -215,6 +220,7 @@ quiet_cmd_rustdoc_test_kernel = RUSTDOC TK $< --extern bindings --extern uapi \ --no-run --crate-name kernel -Zunstable-options \ --sysroot=/dev/null \ + $(rustdoc_modifiers_workaround) \ --test-builder $(objtree)/scripts/rustdoc_test_builder \ $< $(rustdoc_test_kernel_quiet); \ $(objtree)/scripts/rustdoc_test_gen From 252fea131e15aba2cd487119d1a8f546471199e2 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 26 Jul 2025 15:34:35 +0200 Subject: [PATCH 080/246] rust: kbuild: clean output before running `rustdoc` `rustdoc` can get confused when generating documentation into a folder that contains generated files from other `rustdoc` versions. For instance, running something like: rustup default 1.78.0 make LLVM=1 rustdoc rustup default 1.88.0 make LLVM=1 rustdoc may generate errors like: error: couldn't generate documentation: invalid template: last line expected to start with a comment | = note: failed to create or modify "./Documentation/output/rust/rustdoc/src-files.js" Thus just always clean the output folder before generating the documentation -- we are anyway regenerating it every time the `rustdoc` target gets called, at least for the time being. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). Reported-by: Daniel Almeida Closes: https://rust-for-linux.zulipchat.com/#narrow/channel/288089/topic/x/near/527201113 Reviewed-by: Tamir Duberstein Link: https://lore.kernel.org/r/20250726133435.2460085-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- rust/Makefile | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rust/Makefile b/rust/Makefile index d47f82588d78..bfa915b0e588 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -111,14 +111,14 @@ rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \ rustdoc-macros: private rustdoc_host = yes rustdoc-macros: private rustc_target_flags = --crate-type proc-macro \ --extern proc_macro -rustdoc-macros: $(src)/macros/lib.rs FORCE +rustdoc-macros: $(src)/macros/lib.rs rustdoc-clean FORCE +$(call if_changed,rustdoc) # Starting with Rust 1.82.0, skipping `-Wrustdoc::unescaped_backticks` should # not be needed -- see https://github.com/rust-lang/rust/pull/128307. rustdoc-core: private skip_flags = --edition=2021 -Wrustdoc::unescaped_backticks rustdoc-core: private rustc_target_flags = --edition=$(core-edition) $(core-cfgs) -rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs FORCE +rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs rustdoc-clean FORCE +$(call if_changed,rustdoc) rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE @@ -130,7 +130,8 @@ rustdoc-ffi: $(src)/ffi.rs rustdoc-core FORCE rustdoc-pin_init_internal: private rustdoc_host = yes rustdoc-pin_init_internal: private rustc_target_flags = --cfg kernel \ --extern proc_macro --crate-type proc-macro -rustdoc-pin_init_internal: $(src)/pin-init/internal/src/lib.rs FORCE +rustdoc-pin_init_internal: $(src)/pin-init/internal/src/lib.rs \ + rustdoc-clean FORCE +$(call if_changed,rustdoc) rustdoc-pin_init: private rustdoc_host = yes @@ -148,6 +149,9 @@ rustdoc-kernel: $(src)/kernel/lib.rs rustdoc-core rustdoc-ffi rustdoc-macros \ $(obj)/bindings.o FORCE +$(call if_changed,rustdoc) +rustdoc-clean: FORCE + $(Q)rm -rf $(rustdoc_output) + quiet_cmd_rustc_test_library = $(RUSTC_OR_CLIPPY_QUIET) TL $< cmd_rustc_test_library = \ OBJTREE=$(abspath $(objtree)) \ From 41b70df5b38bc80967d2e0ed55cc3c3896bba781 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Aug 2025 08:30:11 -0600 Subject: [PATCH 081/246] io_uring/net: commit partial buffers on retry Ring provided buffers are potentially only valid within the single execution context in which they were acquired. io_uring deals with this and invalidates them on retry. But on the networking side, if MSG_WAITALL is set, or if the socket is of the streaming type and too little was processed, then it will hang on to the buffer rather than recycle or commit it. This is problematic for two reasons: 1) If someone unregisters the provided buffer ring before a later retry, then the req->buf_list will no longer be valid. 2) If multiple sockers are using the same buffer group, then multiple receives can consume the same memory. This can cause data corruption in the application, as either receive could land in the same userspace buffer. Fix this by disallowing partial retries from pinning a provided buffer across multiple executions, if ring provided buffers are used. Cc: stable@vger.kernel.org Reported-by: pt x Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring") Signed-off-by: Jens Axboe --- io_uring/net.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index dd96e355982f..d69f2afa4f7a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -494,6 +494,15 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) return nbufs; } +static int io_net_kbuf_recyle(struct io_kiocb *req, + struct io_async_msghdr *kmsg, int len) +{ + req->flags |= REQ_F_BL_NO_RECYCLE; + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); + return IOU_RETRY; +} + static inline bool io_send_finish(struct io_kiocb *req, int *ret, struct io_async_msghdr *kmsg, unsigned issue_flags) @@ -562,8 +571,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -674,8 +682,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1071,8 +1078,7 @@ retry_multishot: } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return IOU_RETRY; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1218,8 +1224,7 @@ retry_multishot: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1500,8 +1505,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) zc->len -= ret; zc->buf += ret; zc->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1571,8 +1575,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; From e67b8afcb6d86f1bc6a69e4e52cf9dcfe636f995 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Tue, 5 Aug 2025 22:30:54 +0800 Subject: [PATCH 082/246] drm/amdgpu: Add PSP fw version check for fw reserve GFX command The fw reserved GFX command is only supported starting from PSP fw version 0x3a0e14 and 0x3b0e0d. Older versions do not support this command. Add a version guard to ensure the command is only used when the running PSP fw meets the minimum version requirement. This ensures backward compatibility and safe operation across fw revisions. Fixes: a3b7f9c306e1 ("drm/amdgpu: reclaim psp fw reservation memory region") Signed-off-by: Frank Min Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 065e23170a1e09bc9104b761183e59562a029619) --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 0bd51a04be79..23484317a5fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1039,15 +1039,28 @@ int psp_update_fw_reservation(struct psp_context *psp) { int ret; uint64_t reserv_addr, reserv_addr_ext; - uint32_t reserv_size, reserv_size_ext; + uint32_t reserv_size, reserv_size_ext, mp0_ip_ver; struct amdgpu_device *adev = psp->adev; + mp0_ip_ver = amdgpu_ip_version(adev, MP0_HWIP, 0); + if (amdgpu_sriov_vf(psp->adev)) return 0; - if ((amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(14, 0, 2)) && - (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(14, 0, 3))) + switch (mp0_ip_ver) { + case IP_VERSION(14, 0, 2): + if (adev->psp.sos.fw_version < 0x3b0e0d) + return 0; + break; + + case IP_VERSION(14, 0, 3): + if (adev->psp.sos.fw_version < 0x3a0e14) + return 0; + break; + + default: return 0; + } ret = psp_get_fw_reservation_info(psp, GFX_CMD_ID_FB_FW_RESERV_ADDR, &reserv_addr, &reserv_size); if (ret) From 10ef476aad1c848449934e7bec2ab2374333c7b6 Mon Sep 17 00:00:00 2001 From: YiPeng Chai Date: Tue, 12 Aug 2025 09:17:58 +0800 Subject: [PATCH 083/246] drm/amdgpu: fix vram reservation issue The vram block allocation flag must be cleared before making vram reservation, otherwise reserving addresses within the currently freed memory range will always fail. Fixes: c9cad937c0c5 ("drm/amdgpu: add drm buddy support to amdgpu") Signed-off-by: YiPeng Chai Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit d38eaf27de1b8584f42d6fb3f717b7ec44b3a7a1) --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 07c936e90d8e..78f9e86ccc09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -648,9 +648,8 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) vis_usage += amdgpu_vram_mgr_vis_size(adev, block); - amdgpu_vram_mgr_do_reserve(man); - drm_buddy_free_list(mm, &vres->blocks, vres->flags); + amdgpu_vram_mgr_do_reserve(man); mutex_unlock(&mgr->lock); atomic64_sub(vis_usage, &mgr->vis_usage); From 040bc6d0e0e9c814c9c663f6f1544ebaff6824a8 Mon Sep 17 00:00:00 2001 From: Jack Xiao Date: Mon, 11 Aug 2025 15:20:55 +0800 Subject: [PATCH 084/246] drm/amdgpu: fix incorrect vm flags to map bo It should use vm flags instead of pte flags to specify bo vm attributes. Fixes: 7946340fa389 ("drm/amdgpu: Move csa related code to separate file") Signed-off-by: Jack Xiao Reviewed-by: Likun Gao Signed-off-by: Alex Deucher (cherry picked from commit b08425fa77ad2f305fe57a33dceb456be03b653f) --- drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c index 02138aa55793..dfb6cfd83760 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_csa.c @@ -88,8 +88,8 @@ int amdgpu_map_static_csa(struct amdgpu_device *adev, struct amdgpu_vm *vm, } r = amdgpu_vm_bo_map(adev, *bo_va, csa_addr, 0, size, - AMDGPU_PTE_READABLE | AMDGPU_PTE_WRITEABLE | - AMDGPU_PTE_EXECUTABLE); + AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | + AMDGPU_VM_PAGE_EXECUTABLE); if (r) { DRM_ERROR("failed to do bo_map on static CSA, err=%d\n", r); From 0ebbab41fba1bae6ccd96c0eec17026700ac6534 Mon Sep 17 00:00:00 2001 From: Sergio Perez Gonzalez Date: Mon, 28 Jul 2025 20:00:49 -0600 Subject: [PATCH 085/246] ASoC: stm: stm32_i2s: Fix calc_clk_div() error handling in determine_rate() calc_clk_div() will only return a non-zero value (-EINVAL) in case of error. On the other hand, req->rate is an unsigned long. It seems quite odd that req->rate would be assigned a negative value, which is clearly not a rate, and success would be returned. Reinstate previous logic, which would just return error. Fixes: afd529d74002 ("ASoC: stm: stm32_i2s: convert from round_rate() to determine_rate()") Link: https://scan7.scan.coverity.com/#/project-view/53936/11354?selectedIssue=1647702 Signed-off-by: Sergio Perez Gonzalez Link: https://patch.msgid.link/20250729020052.404617-1-sperezglz@gmail.com Signed-off-by: Mark Brown --- sound/soc/stm/stm32_i2s.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sound/soc/stm/stm32_i2s.c b/sound/soc/stm/stm32_i2s.c index 0e489097d9c1..6ca21780f21d 100644 --- a/sound/soc/stm/stm32_i2s.c +++ b/sound/soc/stm/stm32_i2s.c @@ -469,11 +469,8 @@ static int stm32_i2smclk_determine_rate(struct clk_hw *hw, int ret; ret = stm32_i2s_calc_clk_div(i2s, req->best_parent_rate, req->rate); - if (ret) { - req->rate = ret; - - return 0; - } + if (ret) + return ret; mclk->freq = req->best_parent_rate / i2s->divider; From aa5fc4362fac9351557eb27c745579159a2e4520 Mon Sep 17 00:00:00 2001 From: Liu01 Tong Date: Mon, 11 Aug 2025 14:52:37 +0800 Subject: [PATCH 086/246] drm/amdgpu: fix task hang from failed job submission during process kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During process kill, drm_sched_entity_flush() will kill the vm entities. The following job submissions of this process will fail, and the resources of these jobs have not been released, nor have the fences been signalled, causing tasks to hang and timeout. Fix by check entity status in amdgpu_vm_ready() and avoid submit jobs to stopped entity. v2: add amdgpu_vm_ready() check before amdgpu_vm_clear_freed() in function amdgpu_cs_vm_handling(). Fixes: 1f02f2044bda ("drm/amdgpu: Avoid extra evict-restore process.") Signed-off-by: Liu01 Tong Signed-off-by: Lin.Cao Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit f101c13a8720c73e67f8f9d511fbbeda95bcedb1) --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index a2adaacf6adb..d3f220be2ef9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1139,6 +1139,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) } } + if (!amdgpu_vm_ready(vm)) + return -EINVAL; + r = amdgpu_vm_clear_freed(adev, vm, NULL); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5cacf5717016..0b87798daebd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, * Check if all VM PDs/PTs are ready for updates * * Returns: - * True if VM is not evicting. + * True if VM is not evicting and all VM entities are not stopped */ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { - bool empty; bool ret; amdgpu_vm_eviction_lock(vm); @@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_unlock(vm); spin_lock(&vm->status_lock); - empty = list_empty(&vm->evicted); + ret &= list_empty(&vm->evicted); spin_unlock(&vm->status_lock); - return ret && empty; + spin_lock(&vm->immediate.lock); + ret &= !vm->immediate.stopped; + spin_unlock(&vm->immediate.lock); + + spin_lock(&vm->delayed.lock); + ret &= !vm->delayed.stopped; + spin_unlock(&vm->delayed.lock); + + return ret; } /** From 6a912e8aa2b2fba2519e93a2eac197d16f137c9a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 3 Jul 2025 15:39:02 +0800 Subject: [PATCH 087/246] ext4: show the default enabled i_version option Display `i_version` in `/proc/fs/ext4/sdx/options`, even though it's default enabled. This aids users managing multi-version scenarios and simplifies debugging. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250703073903.6952-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733..9203518786e4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2975,6 +2975,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); + if (nodefs && sb->s_flags & SB_I_VERSION) + SEQ_OPTS_PUTS("i_version"); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & From f2326fd14a224e4cccbab89e14c52279ff79b7ec Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 3 Jul 2025 15:39:03 +0800 Subject: [PATCH 088/246] ext4: preserve SB_I_VERSION on remount IMA testing revealed that after an ext4 remount, file accesses triggered full measurements even without modifications, instead of skipping as expected when i_version is unchanged. Debugging showed `SB_I_VERSION` was cleared in reconfigure_super() during remount due to commit 1ff20307393e ("ext4: unconditionally enable the i_version counter") removing the fix from commit 960e0ab63b2e ("ext4: fix i_version handling on remount"). To rectify this, `SB_I_VERSION` is always set for `fc->sb_flags` in ext4_init_fs_context(), instead of `sb->s_flags` in __ext4_fill_super(), ensuring it persists across all mounts. Cc: stable@kernel.org Fixes: 1ff20307393e ("ext4: unconditionally enable the i_version counter") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250703073903.6952-2-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9203518786e4..ed1b36bd51c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1998,6 +1998,9 @@ int ext4_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &ext4_context_ops; + /* i_version is always enabled now */ + fc->sb_flags |= SB_I_VERSION; + return 0; } @@ -5316,9 +5319,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); - /* i_version is always enabled now */ - sb->s_flags |= SB_I_VERSION; - /* HSM events are allowed by default. */ sb->s_iflags |= SB_I_ALLOW_HSM; From b4cc4a4077268522e3d0d34de4b2dc144e2330fa Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 16 Jul 2025 19:36:42 -0600 Subject: [PATCH 089/246] ext4: check fast symlink for ea_inode correctly The check for a fast symlink in the presence of only an external xattr inode is incorrect. If a fast symlink does not have an xattr block (i_file_acl == 0), but does have an external xattr inode that increases inode i_blocks, then the check for a fast symlink will incorrectly fail and __ext4_iget()->ext4_ind_check_inode() will report the inode is corrupt when it "validates" i_data[] on the next read: # ln -s foo /mnt/tmp/bar # setfattr -h -n trusted.test \ -v "$(yes | head -n 4000)" /mnt/tmp/bar # umount /mnt/tmp # mount /mnt/tmp # ls -l /mnt/tmp ls: cannot access '/mnt/tmp/bar': Structure needs cleaning total 4 ? l?????????? ? ? ? ? ? bar # dmesg | tail -1 EXT4-fs error (device dm-8): __ext4_iget:5098: inode #24578: block 7303014: comm ls: invalid block (note that "block 7303014" = 0x6f6f66 = "foo" in LE order). ext4_inode_is_fast_symlink() should check the superblock EXT4_FEATURE_INCOMPAT_EA_INODE feature flag, not the inode EXT4_EA_INODE_FL, since the latter is only set on the xattr inode itself, and not on the inode that uses this xattr. Cc: stable@vger.kernel.org Fixes: fc82228a5e38 ("ext4: support fast symlinks from ext3 file systems") Signed-off-by: Andreas Dilger Reviewed-by: Li Dongyang Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin Reviewed-on: https://review.whamcloud.com/59879 Lustre-bug-id: https://jira.whamcloud.com/browse/LU-19121 Link: https://patch.msgid.link/20250717063709.757077-1-adilger@dilger.ca Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index eeccb6f588f9..731a800d9c14 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -146,7 +146,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, */ int ext4_inode_is_fast_symlink(struct inode *inode) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + if (!ext4_has_feature_ea_inode(inode->i_sb)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; From f3fbaa74d999c16b5caeca779e6d7e6e6e7feed8 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 21 Jul 2025 22:09:02 +0200 Subject: [PATCH 090/246] ext4: remove useless if check This if branch is only jumping to 'out' which is defined just after the branch itself. Hence this is if-check is a no-op and can be removed. Address-Coverity-ID: 1647981 ("Incorrect expression (IDENTICAL_BRANCHES)") Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250721200902.1071-1-antonio@mandelbit.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index d83f91b62317..01f379f5fd04 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2984,8 +2984,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, return PTR_ERR(dir_block); de = (struct ext4_dir_entry_2 *)dir_block->b_data; err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); - if (err) - goto out; out: brelse(dir_block); return err; From 59d8731c887bf2f5bb8406ace26cbb6f6b36d6cc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 31 Jul 2025 16:00:07 -0400 Subject: [PATCH 091/246] ext4: fix unused variable warning in ext4_init_new_dir Cc: stable@kernel.org Fixes: 90f097b1403f ("ext4: refactor the inline directory conversion and...") Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 01f379f5fd04..2cd36f59c9e3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2965,7 +2965,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode) { struct buffer_head *dir_block = NULL; - struct ext4_dir_entry_2 *de; ext4_lblk_t block = 0; int err; @@ -2982,7 +2981,6 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - de = (struct ext4_dir_entry_2 *)dir_block->b_data; err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); out: brelse(dir_block); From 4ba97589ed19210ff808929052696f5636139823 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 3 Aug 2025 18:22:41 +0800 Subject: [PATCH 092/246] ext4: remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Link: https://patch.msgid.link/20250803102243.623705-4-rongqianfeng@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 +- fs/ext4/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3d8b0f6d2dea..39abfeec5f36 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -547,7 +547,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * first page of the bio. Otherwise it can deadlock. */ if (io->io_bio) - gfp_flags = GFP_NOWAIT | __GFP_NOWARN; + gfp_flags = GFP_NOWAIT; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ed1b36bd51c8..b16ffa507b84 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -268,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, - sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); + sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) From bae76c035bf0852844151e68098c9b7cd63ef238 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 5 Aug 2025 14:00:30 +0530 Subject: [PATCH 093/246] ext4: fix fsmap end of range reporting with bigalloc With bigalloc enabled, the logic to report last extent has a bug since we try to use cluster units instead of block units. This can cause an issue where extra incorrect entries might be returned back to the user. This was flagged by generic/365 with 64k bs and -O bigalloc. ** Details of issue ** The issue was noticed on 5G 64k blocksize FS with -O bigalloc which has only 1 bg. $ xfs_io -c "fsmap -d" /mnt/scratch 0: 253:48 [0..127]: static fs metadata 128 /* sb */ 1: 253:48 [128..255]: special 102:1 128 /* gdt */ 3: 253:48 [256..383]: special 102:3 128 /* block bitmap */ 4: 253:48 [384..2303]: unknown 1920 /* flex bg empty space */ 5: 253:48 [2304..2431]: special 102:4 128 /* inode bitmap */ 6: 253:48 [2432..4351]: unknown 1920 /* flex bg empty space */ 7: 253:48 [4352..6911]: inodes 2560 8: 253:48 [6912..538623]: unknown 531712 9: 253:48 [538624..10485759]: free space 9947136 The issue can be seen with: $ xfs_io -c "fsmap -d 0 3" /mnt/scratch 0: 253:48 [0..127]: static fs metadata 128 1: 253:48 [384..2047]: unknown 1664 Only the first entry was expected to be returned but we get 2. This is because: ext4_getfsmap_datadev() first_cluster, last_cluster = 0 ... info->gfi_last = true; ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, 0, info); fsb = C2B(1) = 16 fslen = 0 ... /* Merge in any relevant extents from the meta_list */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { ... // since fsb = 16, considers all metadata which starts before 16 blockno iter 1: error = ext4_getfsmap_helper(sb, info, p); // p = sb (0,1), nop info->gfi_next_fsblk = 1 iter 2: error = ext4_getfsmap_helper(sb, info, p); // p = gdt (1,2), nop info->gfi_next_fsblk = 2 iter 3: error = ext4_getfsmap_helper(sb, info, p); // p = blk bitmap (2,3), nop info->gfi_next_fsblk = 3 iter 4: error = ext4_getfsmap_helper(sb, info, p); // p = ino bitmap (18,19) if (rec_blk > info->gfi_next_fsblk) { // (18 > 3) // emits an extra entry ** BUG ** } } Fix this by directly calling ext4_getfsmap_datadev() with a dummy record that has fmr_physical set to (end_fsb + 1) instead of last_cluster + 1. By using the block instead of cluster we get the correct behavior. Replacing ext4_getfsmap_datadev_helper() with ext4_getfsmap_helper() is okay since the gfi_lastfree and metadata checks in ext4_getfsmap_datadev_helper() are anyways redundant when we only want to emit the last allocated block of the range, as we have already taken care of emitting metadata and any last free blocks. Cc: stable@kernel.org Reported-by: Disha Goel Fixes: 4a622e4d477b ("ext4: fix FS_IOC_GETFSMAP handling") Signed-off-by: Ojaswin Mujoo Reviewed-by: Darrick J. Wong Link: https://patch.msgid.link/e7472c8535c9c5ec10f425f495366864ea12c9da.1754377641.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 383c6edea6dd..9d63c39f6077 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -526,6 +526,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, ext4_group_t end_ag; ext4_grpblk_t first_cluster; ext4_grpblk_t last_cluster; + struct ext4_fsmap irec; int error = 0; bofs = le32_to_cpu(sbi->s_es->s_first_data_block); @@ -609,10 +610,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb, goto err; } - /* Report any gaps at the end of the bg */ + /* + * The dummy record below will cause ext4_getfsmap_helper() to report + * any allocated blocks at the end of the range. + */ + irec.fmr_device = 0; + irec.fmr_physical = end_fsb + 1; + irec.fmr_length = 0; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, - 0, info); + error = ext4_getfsmap_helper(sb, info, &irec); if (error) goto err; From 3ffbdd1f1165f1b2d6a94d1b1aabef57120deaf7 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 5 Aug 2025 14:00:31 +0530 Subject: [PATCH 094/246] ext4: fix reserved gdt blocks handling in fsmap In some cases like small FSes with no meta_bg and where the resize doesn't need extra gdt blocks as it can fit in the current one, s_reserved_gdt_blocks is set as 0, which causes fsmap to emit a 0 length entry, which is incorrect. $ mkfs.ext4 -b 65536 -O bigalloc /dev/sda 5G $ mount /dev/sda /mnt/scratch $ xfs_io -c "fsmap -d" /mnt/scartch 0: 253:48 [0..127]: static fs metadata 128 1: 253:48 [128..255]: special 102:1 128 2: 253:48 [256..255]: special 102:2 0 <---- 0 len entry 3: 253:48 [256..383]: special 102:3 128 Fix this by adding a check for this case. Cc: stable@kernel.org Fixes: 0c9ec4beecac ("ext4: support GETFSMAP ioctls") Signed-off-by: Ojaswin Mujoo Reviewed-by: Darrick J. Wong Link: https://patch.msgid.link/08781b796453a5770112aa96ad14c864fbf31935.1754377641.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/fsmap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 9d63c39f6077..91185c40f755 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, /* Reserved GDT blocks */ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + + /* + * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases, + * check for that. + */ + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, EXT4_FMR_OWN_RESV_GDT); if (error) From c5e104a91e7b6fa12c1dc2d8bf84abb7ef9b89ad Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 Aug 2025 09:35:20 -0400 Subject: [PATCH 095/246] ext4: don't try to clear the orphan_present feature block device is r/o When the file system is frozen in preparation for taking an LVM snapshot, the journal is checkpointed and if the orphan_file feature is enabled, and the orphan file is empty, we clear the orphan_present feature flag. But if there are pending inodes that need to be removed the orphan_present feature flag can't be cleared. The problem comes if the block device is read-only. In that case, we can't process the orphan inode list, so it is skipped in ext4_orphan_cleanup(). But then in ext4_mark_recovery_complete(), this results in the ext4 error "Orphan file not empty on read-only fs" firing and the file system mount is aborted. Fix this by clearing the needs_recovery flag in the block device is read-only. We do this after the call to ext4_load_and_init-journal() since there are some error checks need to be done in case the journal needs to be replayed and the block device is read-only, or if the block device containing the externa journal is read-only, etc. Cc: stable@kernel.org Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1108271 Cc: stable@vger.kernel.org Fixes: 02f310fcf47f ("ext4: Speedup ext4 orphan inode handling") Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b16ffa507b84..699c15db28a8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5416,6 +5416,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; + if (bdev_read_only(sb->s_bdev)) + needs_recovery = 0; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " From 02c7f7219ac0e2277b3379a3a0e9841ef464b6d4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 11 Aug 2025 14:45:32 +0800 Subject: [PATCH 096/246] ext4: fix hole length calculation overflow in non-extent inodes In a filesystem with a block size larger than 4KB, the hole length calculation for a non-extent inode in ext4_ind_map_blocks() can easily exceed INT_MAX. Then it could return a zero length hole and trigger the following waring and infinite in the iomap infrastructure. ------------[ cut here ]------------ WARNING: CPU: 3 PID: 434101 at fs/iomap/iter.c:34 iomap_iter_done+0x148/0x190 CPU: 3 UID: 0 PID: 434101 Comm: fsstress Not tainted 6.16.0-rc7+ #128 PREEMPT(voluntary) Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : iomap_iter_done+0x148/0x190 lr : iomap_iter+0x174/0x230 sp : ffff8000880af740 x29: ffff8000880af740 x28: ffff0000db8e6840 x27: 0000000000000000 x26: 0000000000000000 x25: ffff8000880af830 x24: 0000004000000000 x23: 0000000000000002 x22: 000001bfdbfa8000 x21: ffffa6a41c002e48 x20: 0000000000000001 x19: ffff8000880af808 x18: 0000000000000000 x17: 0000000000000000 x16: ffffa6a495ee6cd0 x15: 0000000000000000 x14: 00000000000003d4 x13: 00000000fa83b2da x12: 0000b236fc95f18c x11: ffffa6a4978b9c08 x10: 0000000000001da0 x9 : ffffa6a41c1a2a44 x8 : ffff8000880af5c8 x7 : 0000000001000000 x6 : 0000000000000000 x5 : 0000000000000004 x4 : 000001bfdbfa8000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000004004030000 x0 : 0000000000000000 Call trace: iomap_iter_done+0x148/0x190 (P) iomap_iter+0x174/0x230 iomap_fiemap+0x154/0x1d8 ext4_fiemap+0x110/0x140 [ext4] do_vfs_ioctl+0x4b8/0xbc0 __arm64_sys_ioctl+0x8c/0x120 invoke_syscall+0x6c/0x100 el0_svc_common.constprop.0+0x48/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x38/0x120 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x198/0x1a0 ---[ end trace 0000000000000000 ]--- Cc: stable@kernel.org Fixes: facab4d9711e ("ext4: return hole from ext4_map_blocks()") Reported-by: Qu Wenruo Closes: https://lore.kernel.org/linux-ext4/9b650a52-9672-4604-a765-bb6be55d1e4a@gmx.com/ Tested-by: Qu Wenruo Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250811064532.1788289-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7de327fa7b1c..d45124318200 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - int count = 0; + u64 count = 0; ext4_fsblk_t first_block = 0; trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); @@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, count++; /* Fill in size of a hole we found */ map->m_pblk = 0; - map->m_len = min_t(unsigned int, map->m_len, count); + map->m_len = umin(map->m_len, count); goto cleanup; } From 76dba1fe277f6befd6ef650e1946f626c547387a Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 11 Aug 2025 20:58:16 +0800 Subject: [PATCH 097/246] ext4: use kmalloc_array() for array space allocation Replace kmalloc(size * sizeof) with kmalloc_array() for safer memory allocation and overflow prevention. Cc: stable@kernel.org Signed-off-by: Liao Yuanhong Link: https://patch.msgid.link/20250811125816.570142-1-liaoyuanhong@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/orphan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 7c7f792ad6ab..524d4658fa40 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -589,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb) } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), - GFP_KERNEL); + oi->of_binfo = kmalloc_array(oi->of_blocks, + sizeof(struct ext4_orphan_block), + GFP_KERNEL); if (!oi->of_binfo) { ret = -ENOMEM; goto out_put; From 21924af67d69d7c9fdaf845be69043cfe75196a1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 5 Aug 2025 00:10:02 +0000 Subject: [PATCH 098/246] locking: Fix __clear_task_blocked_on() warning from __ww_mutex_wound() path The __clear_task_blocked_on() helper added a number of sanity checks ensuring we hold the mutex wait lock and that the task we are clearing blocked_on pointer (if set) matches the mutex. However, there is an edge case in the _ww_mutex_wound() logic where we need to clear the blocked_on pointer for the task that owns the mutex, not the task that is waiting on the mutex. For this case the sanity checks aren't valid, so handle this by allowing a NULL lock to skip the additional checks. K Prateek Nayak and Maarten Lankhorst also pointed out that in this case where we don't hold the owner's mutex wait_lock, we need to be a bit more careful using READ_ONCE/WRITE_ONCE in both the __clear_task_blocked_on() and __set_task_blocked_on() implementations to avoid accidentally tripping WARN_ONs if two instances race. So do that here as well. This issue was easier to miss, I realized, as the test-ww_mutex driver only exercises the wait-die class of ww_mutexes. I've sent a patch[1] to address this so the logic will be easier to test. [1]: https://lore.kernel.org/lkml/20250801023358.562525-2-jstultz@google.com/ Fixes: a4f0b6fef4b0 ("locking/mutex: Add p->blocked_on wrappers for correctness checks") Closes: https://lore.kernel.org/lkml/68894443.a00a0220.26d0e1.0015.GAE@google.com/ Reported-by: syzbot+602c4720aed62576cd79@syzkaller.appspotmail.com Reported-by: Maarten Lankhorst Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Acked-by: Maarten Lankhorst Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250805001026.2247040-1-jstultz@google.com --- include/linux/sched.h | 29 +++++++++++++++++------------ kernel/locking/ww_mutex.h | 6 +++++- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40d2fa90df42..62103dd6a48e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2144,6 +2144,8 @@ static inline struct mutex *__get_task_blocked_on(struct task_struct *p) static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); @@ -2154,8 +2156,8 @@ static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ - WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); - p->blocked_on = m; + WARN_ON_ONCE(blocked_on && blocked_on != m); + WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2166,16 +2168,19 @@ static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - WARN_ON_ONCE(!m); - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); - /* - * There may be cases where we re-clear already cleared - * blocked_on relationships, but make sure we are not - * clearing the relationship with a different lock. - */ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); - p->blocked_on = NULL; + if (m) { + struct mutex *blocked_on = READ_ONCE(p->blocked_on); + + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. + */ + WARN_ON_ONCE(blocked_on && blocked_on != m); + } + WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 086fd5487ca7..31a785afee6c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * When waking up the task to wound, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. */ - __clear_task_blocked_on(owner, lock); + __clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; From f0ba0e7172a222ea6043b61ecd86723c46d7bcf2 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 23 Jul 2025 15:38:10 +0200 Subject: [PATCH 099/246] btrfs: zoned: skip ZONE FINISH of conventional zones Don't call ZONE FINISH for conventional zones as this will result in I/O errors. Instead check if the zone that needs finishing is a conventional zone and if yes skip it. Also factor out the actual handling of finishing a single zone into a helper function, as do_zone_finish() is growing ever bigger and the indentations levels are getting higher. Reviewed-by: Naohiro Aota Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 55 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index db11b5b5f0e6..36de6d0d595f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2245,6 +2245,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) rcu_read_unlock(); } +static int call_zone_finish(struct btrfs_block_group *block_group, + struct btrfs_io_stripe *stripe) +{ + struct btrfs_device *device = stripe->dev; + const u64 physical = stripe->physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; + int ret; + + if (!device->bdev) + return 0; + + if (zinfo->max_active_zones == 0) + return 0; + + if (btrfs_dev_is_sequential(device, physical)) { + unsigned int nofs_flags; + + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); + + if (ret) + return ret; + } + + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; + btrfs_dev_clear_active_zone(device, physical); + + return 0; +} + static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; @@ -2329,31 +2363,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_device *device = map->stripes[i].dev; - const u64 physical = map->stripes[i].physical; - struct btrfs_zoned_device_info *zinfo = device->zone_info; - unsigned int nofs_flags; - - if (!device->bdev) - continue; - - if (zinfo->max_active_zones == 0) - continue; - - nofs_flags = memalloc_nofs_save(); - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, - physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT); - memalloc_nofs_restore(nofs_flags); + ret = call_zone_finish(block_group, &map->stripes[i]); if (ret) { up_read(&dev_replace->rwsem); return ret; } - - if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) - zinfo->reserved_active_zones++; - btrfs_dev_clear_active_zone(device, physical); } up_read(&dev_replace->rwsem); From daa0fde322350b467bc62bc1b141bf62df6123f8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 16 Jul 2025 16:59:53 +0900 Subject: [PATCH 100/246] btrfs: zoned: fix data relocation block group reservation btrfs_zoned_reserve_data_reloc_bg() is called on mount and at that point, all data block groups belong to the primary data space_info. So, we don't find anything in the data relocation space_info. Also, the condition "bg->used > 0" can select a block group with full of zone_unusable bytes for the candidate. As we cannot allocate from the block group, it is useless to reserve it as the data relocation block group. Furthermore, because of the space_info separation, we need to migrate the selected block group to the data relocation space_info. If not, the extent allocator cannot use the block group to do the allocation. This commit fixes these three issues. Fixes: e606ff985ec7 ("btrfs: zoned: reserve data_reloc block group on mount") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 55 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 36de6d0d595f..7a3351b1b0c6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -17,6 +17,7 @@ #include "accessors.h" #include "bio.h" #include "transaction.h" +#include "sysfs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -2519,12 +2520,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; + struct btrfs_space_info *space_info = data_sinfo; struct btrfs_trans_handle *trans; struct btrfs_block_group *bg; struct list_head *bg_list; u64 alloc_flags; - bool initial = false; + bool first = true; bool did_chunk_alloc = false; int index; int ret; @@ -2538,21 +2539,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) if (sb_rdonly(fs_info->sb)) return; - ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); index = btrfs_bg_flags_to_raid_index(alloc_flags); - bg_list = &data_sinfo->block_groups[index]; + /* Scan the data space_info to find empty block groups. Take the second one. */ again: + bg_list = &space_info->block_groups[index]; list_for_each_entry(bg, bg_list, list) { - if (bg->used > 0) + if (bg->alloc_offset != 0) continue; - if (!initial) { - initial = true; + if (first) { + first = false; continue; } + if (space_info == data_sinfo) { + /* Migrate the block group to the data relocation space_info. */ + struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; + int factor; + + ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + factor = btrfs_bg_type_to_factor(bg->flags); + + down_write(&space_info->groups_sem); + list_del_init(&bg->list); + /* We can assume this as we choose the second empty one. */ + ASSERT(!list_empty(&space_info->block_groups[index])); + up_write(&space_info->groups_sem); + + spin_lock(&space_info->lock); + space_info->total_bytes -= bg->length; + space_info->disk_total -= bg->length * factor; + /* There is no allocation ever happened. */ + ASSERT(bg->used == 0); + ASSERT(bg->zone_unusable == 0); + /* No super block in a block group on the zoned setup. */ + ASSERT(bg->bytes_super == 0); + spin_unlock(&space_info->lock); + + bg->space_info = reloc_sinfo; + if (reloc_sinfo->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(bg); + + btrfs_add_bg_to_space_info(fs_info, bg); + } + fs_info->data_reloc_bg = bg->start; set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags); btrfs_zone_activate(bg); @@ -2567,11 +2599,18 @@ again: if (IS_ERR(trans)) return; + /* Allocate new BG in the data relocation space_info. */ + space_info = data_sinfo->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret == 1) { + /* + * We allocated a new block group in the data relocation space_info. We + * can take that one. + */ + first = false; did_chunk_alloc = true; - bg_list = &space_info->block_groups[index]; goto again; } } From 5c4b93f4c8e5c53574c1a48d66a27a2c68b414af Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 16 Jul 2025 16:59:54 +0900 Subject: [PATCH 101/246] btrfs: zoned: fix write time activation failure for metadata block group Since commit 13bb483d32ab ("btrfs: zoned: activate metadata block group on write time"), we activate a metadata block group at the write time. If the zone capacity is small enough, we can allocate the entire region before the first write. Then, we hit the btrfs_zoned_bg_is_full() in btrfs_zone_activate() and the activation fails. For a data block group, we activate it at the allocation time and we should check the fullness condition in the caller side. Add, a WARN to check the fullness condition. For a metadata block group, we don't need the fullness check because we activate it at the write time. Instead, activating it once it is written should be invalid. Catch that with a WARN too. Fixes: 13bb483d32ab ("btrfs: zoned: activate metadata block group on write time") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7a3351b1b0c6..ab6844dce8bc 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2169,10 +2169,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } - /* No space left */ - if (btrfs_zoned_bg_is_full(block_group)) { - ret = false; - goto out_unlock; + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { + /* The caller should check if the block group is full. */ + if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) { + ret = false; + goto out_unlock; + } + } else { + /* Since it is already written, it should have been active. */ + WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start); } for (i = 0; i < map->num_stripes; i++) { From 04147d8394e80acaaebf0365f112339e8b606c05 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 16 Jul 2025 16:59:55 +0900 Subject: [PATCH 102/246] btrfs: zoned: limit active zones to max_open_zones When there is no active zone limit, we can technically write into any number of zones at the same time. However, exceeding the max open zones can degrade performance. To prevent this, set the max_active_zones to bdev_max_open_zones() if there is no active zone limit. Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index ab6844dce8bc..e0ee3aeabd2c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -43,6 +43,9 @@ /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* Default number of max active zones when the device has no limits. */ +#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128 + /* * Minimum of active zones we need: * @@ -417,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - max_active_zones = bdev_max_active_zones(bdev); + max_active_zones = min_not_zero(bdev_max_active_zones(bdev), + bdev_max_open_zones(bdev)); + if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES) + max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES; if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { btrfs_err(fs_info, "zoned: %s: max active zones %u is too small, need at least %u active zones", From f7a2e1c08727384cde1c686dd57172f99b5f2e6e Mon Sep 17 00:00:00 2001 From: Erick Karanja Date: Wed, 13 Aug 2025 10:18:36 +0300 Subject: [PATCH 103/246] Docs: admin-guide: Correct spelling mistake Fix spelling mistake directoy to directory Reported-by: codespell Signed-off-by: Erick Karanja Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20250813071837.668613-1-karanja99erick@gmail.com Signed-off-by: Jens Axboe --- Documentation/admin-guide/blockdev/zoned_loop.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst index 9c7aa3b482f3..64dcfde7450a 100644 --- a/Documentation/admin-guide/blockdev/zoned_loop.rst +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst @@ -79,7 +79,7 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower than the zone size. Default: zone size. conv_zones Total number of conventioanl zones starting from sector 0. Default: 8. -base_dir Path to the base directoy where to create the directory +base_dir Path to the base directory where to create the directory containing the zone files of the device. Default=/var/local/zloop. The device directory containing the zone files is always From 8f5845e0743bf3512b71b3cb8afe06c192d6acc4 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 12 Aug 2025 23:42:57 +0800 Subject: [PATCH 104/246] block: restore default wbt enablement The commit 245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock") protected wbt_enable_default() with q->elevator_lock; however, it also placed wbt_enable_default() before blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);, resulting in wbt failing to be enabled. Moreover, the protection of wbt_enable_default() by q->elevator_lock was removed in commit 78c271344b6f ("block: move wbt_enable_default() out of queue freezing from sched ->exit()"), so we can directly fix this issue by placing wbt_enable_default() after blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);. Additionally, this issue also causes the inability to read the wbt_lat_usec file, and the scenario is as follows: root@q:/sys/block/sda/queue# cat wbt_lat_usec cat: wbt_lat_usec: Invalid argument root@q:/data00/sjc/linux# ls /sys/kernel/debug/block/sda/rqos cannot access '/sys/kernel/debug/block/sda/rqos': No such file or directory root@q:/data00/sjc/linux# find /sys -name wbt /sys/kernel/debug/tracing/events/wbt After testing with this patch, wbt can be enabled normally. Signed-off-by: Julian Sun Cc: stable@vger.kernel.org Fixes: 245618f8e45f ("block: protect wbt_lat_usec using q->elevator_lock") Reviewed-by: Nilay Shroff Reviewed-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250812154257.57540-1-sunjunchao@bytedance.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c5cf79a20842..4a7f1a349998 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -902,9 +902,9 @@ int blk_register_queue(struct gendisk *disk) if (queue_is_mq(q)) elevator_set_default(q); - wbt_enable_default(disk); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); From 4bcd3061e8154606af7f721cb75ca04ffe191a12 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 29 Jul 2025 19:01:45 +0930 Subject: [PATCH 105/246] btrfs: clear block dirty if submit_one_sector() failed [BUG] If submit_one_sector() failed, the block will be kept dirty, but with their corresponding range finished in the ordered extent. This means if a writeback happens later again, we can hit the following problems: - ASSERT(block_start != EXTENT_MAP_HOLE) in submit_one_sector() If the original extent map is a hole, then we can hit this case, as the new ordered extent failed, we will drop the new extent map and re-read one from the disk. - DEBUG_WARN() in btrfs_writepage_cow_fixup() This is because we no longer have an ordered extent for those dirty blocks. The original for them is already finished with error. [CAUSE] The function submit_one_sector() is not following the regular error handling of writeback. The common practice is to clear the folio dirty, start and finish the writeback for the block. This is normally done by extent_clear_unlock_delalloc() with PAGE_START_WRITEBACK | PAGE_END_WRITEBACK flags during run_delalloc_range(). So if we keep those failed blocks dirty, they will stay in the page cache and wait for the next writeback. And since the original ordered extent is already finished and removed, depending on the original extent map, we either hit the ASSERT() inside submit_one_sector(), or hit the DEBUG_WARN() in btrfs_writepage_cow_fixup(). [FIX] Follow the regular error handling to clear the dirty flag for the block, start and finish writeback for that block instead. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f23d75986947..741c20480099 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1512,7 +1512,7 @@ out: /* * Return 0 if we have submitted or queued the sector for submission. - * Return <0 for critical errors. + * Return <0 for critical errors, and the sector will have its dirty flag cleared. * * Caller should make sure filepos < i_size and handle filepos >= i_size case. */ @@ -1535,8 +1535,17 @@ static int submit_one_sector(struct btrfs_inode *inode, ASSERT(filepos < i_size); em = btrfs_get_extent(inode, NULL, filepos, sectorsize); - if (IS_ERR(em)) + if (IS_ERR(em)) { + /* + * When submission failed, we should still clear the folio dirty. + * Or the folio will be written back again but without any + * ordered extent. + */ + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); + btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); return PTR_ERR(em); + } extent_offset = filepos - em->start; em_end = btrfs_extent_map_end(em); @@ -1666,8 +1675,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * Here we set writeback and clear for the range. If the full folio * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. * - * If we hit any error, the corresponding sector will still be dirty - * thus no need to clear PAGECACHE_TAG_DIRTY. + * If we hit any error, the corresponding sector will have its dirty + * flag cleared and writeback finished, thus no need to handle the error case. */ if (!submitted_io && !error) { btrfs_folio_set_writeback(fs_info, folio, start, len); From 05b372862600e551bbf86e7f24a1caeed5e06150 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 29 Jul 2025 19:01:46 +0930 Subject: [PATCH 106/246] btrfs: clear block dirty if btrfs_writepage_cow_fixup() failed [BUG] If btrfs_writepage_cow_fixup() failed (returning value -EUCLEAN), the block will be kept dirty, but with its corresponding range finished in the ordered extent. Currently that error pattern is only possible for experimental builds, which places extra check to ensure we shouldn't hit a dirty block without a corresponding ordered extent. This means if later a writeback happens again, we can hit the following problems: - ASSERT(block_start != EXTENT_MAP_HOLE) in submit_one_sector() If the original extent map is a hole, then we can hit this case, as the new ordered extent failed, we will drop the new extent map and re-read one from the disk. - DEBUG_WARN() in btrfs_writepage_cow_fixup() This is because we no longer have an ordered extent for those dirty blocks. The original for them is already finished with error. [CAUSE] The function btrfs_writepage_cow_fixup() is not following the regular error handling of writeback. The common practice is to clear the folio dirty, start and finish the writeback for the block. This is normally done by extent_clear_unlock_delalloc() with PAGE_START_WRITEBACK | PAGE_END_WRITEBACK flags during run_delalloc_range(). So if we keep those failed blocks dirty, they will stay in the page cache and wait for the next writeback. And since the original ordered extent is already finished and removed, depending on the original extent map, we either hit the ASSERT() inside submit_one_sector(), or hit the DEBUG_WARN() in btrfs_writepage_cow_fixup() again (and very ironic). [FIX] Follow the regular error handling to clear the dirty flag for the block range, start and finish writeback for that block range instead. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 741c20480099..be9c9c804952 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1618,8 +1618,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, folio_unlock(folio); return 1; } - if (ret < 0) + if (ret < 0) { + btrfs_folio_clear_dirty(fs_info, folio, start, len); + btrfs_folio_set_writeback(fs_info, folio, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); return ret; + } for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); From f022499f24e520706b9a8238746e1cacc37eb4e0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Aug 2025 16:39:49 +0100 Subject: [PATCH 107/246] btrfs: do not set mtime/ctime to current time when unlinking for log replay If we are doing an unlink for log replay, we are updating the directory's mtime and ctime to the current time, and this is incorrect since it should stay with the mtime and ctime that were set when the directory was logged. This is the same as when adding a link to an inode during log replay (with btrfs_add_link()), where we want the mtime and ctime to be the values that were in place when the inode was logged. This was found with generic/547 using LOAD_FACTOR=20 and TIME_FACTOR=20, where due to large log trees we have longer log replay times and fssum could detect a mismatch of the mtime and ctime of a directory. Fix this by skipping the mtime and ctime update at __btrfs_unlink_inode() if we are in log replay context (just like btrfs_add_link()). Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d740910e071a..9e4aec7330cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4189,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, return ret; } +static void update_time_after_link_or_unlink(struct btrfs_inode *dir) +{ + struct timespec64 now; + + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) + return; + + now = inode_set_ctime_current(&dir->vfs_inode); + inode_set_mtime_to_ts(&dir->vfs_inode, now); +} + /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and @@ -4289,7 +4306,7 @@ skip_backref: inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + update_time_after_link_or_unlink(dir); return btrfs_update_inode(trans, dir); } @@ -6683,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - /* - * If we are replaying a log tree, we do not want to update the mtime - * and ctime of the parent directory with the current time, since the - * log replay procedure is responsible for setting them to their correct - * values (the ones it had when the fsync was done). - */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) - inode_set_mtime_to_ts(&parent_inode->vfs_inode, - inode_set_ctime_current(&parent_inode->vfs_inode)); + update_time_after_link_or_unlink(parent_inode); ret = btrfs_update_inode(trans, parent_inode); if (ret) From 1f3d56db694cce6dfbffba0f398a06a222204487 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 31 Jul 2025 08:20:01 +0930 Subject: [PATCH 108/246] btrfs: clear TAG_TOWRITE from buffer tree when submitting a tree block [POSSIBLE BUG] After commit 5e121ae687b8 ("btrfs: use buffer xarray for extent buffer writeback operations"), we have a dedicated xarray for extent buffers, and a lot of tags are migrated to that buffer tree, like PAGECACHE_TAG_TOWRITE/DIRTY/WRITEBACK. This frees us from the limits of page flags, but there is a new asymmetric behavior, we call buffer_tree_tag_for_writeback() to set PAGECACHE_TAG_TOWRITE for the involved ranges, but there is no one to clear that tag. Before that rework, we relied on the page cache tag which was cleared when folio_start_writeback() was called. Although this has its own problems (e.g. the first one calling folio_start_writeback() will clear the tag for the whole page), it at least cleared the tag. But now our real tags are stored in the buffer tree, no one is really clearing the PAGECACHE_TAG_TOWRITE tag now. [FIX] Thankfully this is not going to cause any real bug, but just some inefficiency iterating the extent buffers. As if we hit an extent buffer which is not dirty but still has the PAGECACHE_TAG_TOWRITE tag, lock_extent_buffer_for_io() will skip it so we won't writeback the extent buffer again. To properly fix the inefficiency, just clear the PAGECACHE_TAG_TOWRITE inside lock_extent_buffer_for_io(). There is no error path between lock_extent_buffer_for_io() and write_one_eb(), so we're safe to clear the tag there. Reviewed-by: Naohiro Aota Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be9c9c804952..c953297aa89a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1826,6 +1826,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e xas_load(&xas); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irqrestore(&xas, flags); btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); From b1511360c8ac882b0c52caa263620538e8d73220 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 31 Jul 2025 12:46:56 +0900 Subject: [PATCH 109/246] btrfs: subpage: keep TOWRITE tag until folio is cleaned btrfs_subpage_set_writeback() calls folio_start_writeback() the first time a folio is written back, and it also clears the PAGECACHE_TAG_TOWRITE tag even if there are still dirty blocks in the folio. This can break ordering guarantees, such as those required by btrfs_wait_ordered_extents(). That ordering breakage leads to a real failure. For example, running generic/464 on a zoned setup will hit the following ASSERT. This happens because the broken ordering fails to flush existing dirty pages before the file size is truncated. assertion failed: !list_empty(&ordered->list) :: 0, in fs/btrfs/zoned.c:1899 ------------[ cut here ]------------ kernel BUG at fs/btrfs/zoned.c:1899! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 2 UID: 0 PID: 1906169 Comm: kworker/u130:2 Kdump: loaded Not tainted 6.16.0-rc6-BTRFS-ZNS+ #554 PREEMPT(voluntary) Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021 Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] RIP: 0010:btrfs_finish_ordered_zoned.cold+0x50/0x52 [btrfs] RSP: 0018:ffffc9002efdbd60 EFLAGS: 00010246 RAX: 000000000000004c RBX: ffff88811923c4e0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff827e38b1 RDI: 00000000ffffffff RBP: ffff88810005d000 R08: 00000000ffffdfff R09: ffffffff831051c8 R10: ffffffff83055220 R11: 0000000000000000 R12: ffff8881c2458c00 R13: ffff88811923c540 R14: ffff88811923c5e8 R15: ffff8881c1bd9680 FS: 0000000000000000(0000) GS:ffff88a04acd0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f907c7a918c CR3: 0000000004024000 CR4: 0000000000350ef0 Call Trace: ? srso_return_thunk+0x5/0x5f btrfs_finish_ordered_io+0x4a/0x60 [btrfs] btrfs_work_helper+0xf9/0x490 [btrfs] process_one_work+0x204/0x590 ? srso_return_thunk+0x5/0x5f worker_thread+0x1d6/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0x118/0x230 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x205/0x260 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Consider process A calling writepages() with WB_SYNC_NONE. In zoned mode or for compressed writes, it locks several folios for delalloc and starts writing them out. Let's call the last locked folio folio X. Suppose the write range only partially covers folio X, leaving some pages dirty. Process A calls btrfs_subpage_set_writeback() when building a bio. This function call clears the TOWRITE tag of folio X, whose size = 8K and the block size = 4K. It is following state. 0 4K 8K |/////|/////| (flag: DIRTY, tag: DIRTY) <-----> Process A will write this range. Now suppose process B concurrently calls writepages() with WB_SYNC_ALL. It calls tag_pages_for_writeback() to tag dirty folios with PAGECACHE_TAG_TOWRITE. Since folio X is still dirty, it gets tagged. Then, B collects tagged folios using filemap_get_folios_tag() and must wait for folio X to be written before returning from writepages(). 0 4K 8K |/////|/////| (flag: DIRTY, tag: DIRTY|TOWRITE) However, between tagging and collecting, process A may call btrfs_subpage_set_writeback() and clear folio X's TOWRITE tag. 0 4K 8K | |/////| (flag: DIRTY|WRITEBACK, tag: DIRTY) As a result, process B won't see folio X in its batch, and returns without waiting for it. This breaks the WB_SYNC_ALL ordering requirement. Fix this by using btrfs_subpage_set_writeback_keepwrite(), which retains the TOWRITE tag. We now manually clear the tag only after the folio becomes clean, via the xas operation. Fixes: 3470da3b7d87 ("btrfs: subpage: introduce helpers for writeback status") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index c9b3821957f7..cb4f97833dc3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + + /* + * Don't clear the TOWRITE tag when starting writeback on a still-dirty + * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it, + * assume writeback is complete, and exit too early — violating sync + * ordering guarantees. + */ if (!folio_test_writeback(folio)) - folio_start_writeback(folio); + __folio_start_writeback(folio, true); + if (!folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); + XA_STATE(xas, &mapping->i_pages, folio->index); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); + } spin_unlock_irqrestore(&bfs->lock, flags); } From dc61d97b0ba064fb21b01fbfa7436873948277bd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 12 Aug 2025 01:32:58 +0900 Subject: [PATCH 110/246] btrfs: fix buffer index in wait_eb_writebacks() The commit f2cb97ee964a ("btrfs: index buffer_tree using node size") changed the index of buffer_tree from "start >> sectorsize_bits" to "start >> nodesize_bits". However, the change is not applied for wait_eb_writebacks() and caused IO failures by writing in a full zone. Use the index properly. Fixes: f2cb97ee964a ("btrfs: index buffer_tree using node size") Reviewed-by: Qu Wenruo Reviewed-by: Boris Burkov Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e0ee3aeabd2c..ea662036f441 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2242,7 +2242,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; struct extent_buffer *eb; - unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); + unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); rcu_read_lock(); xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { From edf842abe4368ce3c423343cf4b23b210fcf1622 Mon Sep 17 00:00:00 2001 From: Kyoji Ogasawara Date: Wed, 23 Jul 2025 00:38:37 +0900 Subject: [PATCH 111/246] btrfs: fix incorrect log message for nobarrier mount option Fix a wrong log message that appears when the "nobarrier" mount option is unset. When "nobarrier" is unset, barrier is actually enabled. However, the log incorrectly stated "turning off barriers". Fixes: eddb1a433f26 ("btrfs: add reconfigure callback for fs_context") CC: stable@vger.kernel.org # 6.12+ Reviewed-by: Qu Wenruo Signed-off-by: Kyoji Ogasawara Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 466d0450269c..768a2532fa4a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1462,7 +1462,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); - btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); From b435ab556bea875c088485f271ef2709ca1d75f5 Mon Sep 17 00:00:00 2001 From: Kyoji Ogasawara Date: Wed, 13 Aug 2025 03:00:06 +0900 Subject: [PATCH 112/246] btrfs: restore mount option info messages during mount After the fsconfig migration in 6.8, mount option info messages are no longer displayed during mount operations because btrfs_emit_options() is only called during remount, not during initial mount. Fix this by calling btrfs_emit_options() in btrfs_fill_super() after open_ctree() succeeds. Additionally, prevent log duplication by ensuring btrfs_check_options() handles validation with warn-level and err-level messages, while btrfs_emit_options() provides info-level messages. Fixes: eddb1a433f26 ("btrfs: add reconfigure callback for fs_context") CC: stable@vger.kernel.org # 6.8+ Reviewed-by: Qu Wenruo Signed-off-by: Kyoji Ogasawara Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 768a2532fa4a..8469f36ef011 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -88,6 +88,9 @@ struct btrfs_fs_context { refcount_t refs; }; +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old); + enum { Opt_acl, Opt_clear_cache, @@ -698,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info, if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { - btrfs_info(info, "disk space caching is enabled"); btrfs_warn(info, "space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); } - if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) - btrfs_info(info, "using free-space-tree"); } return ret; @@ -980,6 +980,8 @@ static int btrfs_fill_super(struct super_block *sb, return ret; } + btrfs_emit_options(fs_info, NULL); + inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); From 74857fdc5dd2cdcdeb6e99bdf26976fd9299d2bb Mon Sep 17 00:00:00 2001 From: Kyoji Ogasawara Date: Wed, 13 Aug 2025 03:00:07 +0900 Subject: [PATCH 113/246] btrfs: fix printing of mount info messages for NODATACOW/NODATASUM The NODATASUM message was printed twice by mistake and the NODATACOW was missing from the 'unset' part. Fix the duplication and make the output look the same. Fixes: eddb1a433f26 ("btrfs: add reconfigure callback for fs_context") CC: stable@vger.kernel.org # 6.8+ Reviewed-by: Qu Wenruo Signed-off-by: Kyoji Ogasawara Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8469f36ef011..7f31f8bd63ba 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1439,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, { btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); - btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); @@ -1461,6 +1461,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info, btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); + btrfs_info_if_unset(info, old, NODATASUM, "setting datasum"); btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); From 9d83e1f05c98bab5de350bef89177e2be8b34db0 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Wed, 13 Aug 2025 20:02:14 +0800 Subject: [PATCH 114/246] io_uring/io-wq: add check free worker before create new worker After commit 0b2b066f8a85 ("io_uring/io-wq: only create a new worker if it can make progress"), in our produce environment, we still observe that part of io_worker threads keeps creating and destroying. After analysis, it was confirmed that this was due to a more complex scenario involving a large number of fsync operations, which can be abstracted as frequent write + fsync operations on multiple files in a single uring instance. Since write is a hash operation while fsync is not, and fsync is likely to be suspended during execution, the action of checking the hash value in io_wqe_dec_running cannot handle such scenarios. Similarly, if hash-based work and non-hash-based work are sent at the same time, similar issues are likely to occur. Returning to the starting point of the issue, when a new work arrives, io_wq_enqueue may wake up free worker A, while io_wq_dec_running may create worker B. Ultimately, only one of A and B can obtain and process the task, leaving the other in an idle state. In the end, the issue is caused by inconsistent logic in the checks performed by io_wq_enqueue and io_wq_dec_running. Therefore, the problem can be resolved by checking for available workers in io_wq_dec_running. Signed-off-by: Fengnan Chang Reviewed-by: Diangang Li Link: https://lore.kernel.org/r/20250813120214.18729-1-changfengnan@bytedance.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index be91edf34f01..17dfaa0395c4 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -357,6 +357,13 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; + + rcu_read_lock(); + do_create = !io_acct_activate_free_worker(acct); + rcu_read_unlock(); + if (!do_create) + goto no_need_create; + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { @@ -367,6 +374,7 @@ static void create_worker_cb(struct callback_head *cb) if (do_create) { create_io_worker(wq, acct); } else { +no_need_create: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } From 47ed64db8c17eb16541098add865178fb7e68744 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Wed, 13 Aug 2025 18:07:08 +0800 Subject: [PATCH 115/246] ASoC: tas2781: Normalize the volume kcontrol name Change the name of the kcontrol from "Gain" to "Volume". Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20250813100708.12197-1-baojun.xu@ti.com Signed-off-by: Mark Brown --- sound/soc/codecs/tas2781-i2c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c index 8e7e45c046b8..676130f4cf3e 100644 --- a/sound/soc/codecs/tas2781-i2c.c +++ b/sound/soc/codecs/tas2781-i2c.c @@ -908,10 +908,10 @@ static const struct snd_kcontrol_new tasdevice_cali_controls[] = { }; static const struct snd_kcontrol_new tas2781_snd_controls[] = { - SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Gain", TAS2781_AMP_LEVEL, + SOC_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2781_AMP_LEVEL, 1, 0, 20, 0, tas2781_amp_getvol, tas2781_amp_putvol, amp_vol_tlv), - SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Gain", TAS2781_DVC_LVL, + SOC_SINGLE_RANGE_EXT_TLV("Speaker Digital Volume", TAS2781_DVC_LVL, 0, 0, 200, 1, tas2781_digital_getvol, tas2781_digital_putvol, dvc_tlv), }; From 23cbfd6fed78715459a4395c034c4e76b8c85320 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 13 Aug 2025 17:36:27 +0200 Subject: [PATCH 116/246] ALSA: azt3328: Put __maybe_unused for inline functions for gameport Some inline functions are unused depending on kconfig, and the recent change for clang builds made those handled as errors with W=1. For avoiding pitfalls, mark those with __maybe_unused attributes. Link: https://patch.msgid.link/20250813153628.12303-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/azt3328.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/pci/azt3328.c b/sound/pci/azt3328.c index 4418b9ae33e6..b33344f65b8c 100644 --- a/sound/pci/azt3328.c +++ b/sound/pci/azt3328.c @@ -412,25 +412,25 @@ snd_azf3328_ctrl_outl(const struct snd_azf3328 *chip, unsigned reg, u32 value) outl(value, chip->ctrl_io + reg); } -static inline void +static inline void __maybe_unused snd_azf3328_game_outb(const struct snd_azf3328 *chip, unsigned reg, u8 value) { outb(value, chip->game_io + reg); } -static inline void +static inline void __maybe_unused snd_azf3328_game_outw(const struct snd_azf3328 *chip, unsigned reg, u16 value) { outw(value, chip->game_io + reg); } -static inline u8 +static inline u8 __maybe_unused snd_azf3328_game_inb(const struct snd_azf3328 *chip, unsigned reg) { return inb(chip->game_io + reg); } -static inline u16 +static inline u16 __maybe_unused snd_azf3328_game_inw(const struct snd_azf3328 *chip, unsigned reg) { return inw(chip->game_io + reg); From ee8f1613596ad44c7cff4805d65a8a705998db11 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 13 Aug 2025 09:03:08 -0500 Subject: [PATCH 117/246] Revert "ALSA: hda: Add ASRock X670E Taichi to denylist" On a motherboard with an AMD Granite Ridge CPU there is a report that 3.5mm microphone and headphones aren't working. In the log it's observed: snd_hda_intel 0000:02:00.6: Skipping the device on the denylist This was because of commit df42ee7e22f03 ("ALSA: hda: Add ASRock X670E Taichi to denylist"). Reverting this commit allows the microphone and headphones to work again. As at least some combinations of this motherboard do have applicable devices, revert so that they can be probed. Cc: Richard Gong Cc: Juan Martinez Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20250813140427.1577172-1-superm1@kernel.org Signed-off-by: Takashi Iwai --- sound/hda/controllers/intel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/hda/controllers/intel.c b/sound/hda/controllers/intel.c index fcf67e97a546..1bb3ff55b115 100644 --- a/sound/hda/controllers/intel.c +++ b/sound/hda/controllers/intel.c @@ -2077,7 +2077,6 @@ static const struct pci_device_id driver_denylist[] = { { PCI_DEVICE_SUB(0x1022, 0x1487, 0x1043, 0x874f) }, /* ASUS ROG Zenith II / Strix */ { PCI_DEVICE_SUB(0x1022, 0x1487, 0x1462, 0xcb59) }, /* MSI TRX40 Creator */ { PCI_DEVICE_SUB(0x1022, 0x1487, 0x1462, 0xcb60) }, /* MSI TRX40 */ - { PCI_DEVICE_SUB(0x1022, 0x15e3, 0x1022, 0xd601) }, /* ASRock X670E Taichi */ {} }; From 7d34ec36abb84fdfb6632a0f2cbda90379ae21fc Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 Aug 2025 23:14:55 -0500 Subject: [PATCH 118/246] smb3: fix for slab out of bounds on mount to ksmbd With KASAN enabled, it is possible to get a slab out of bounds during mount to ksmbd due to missing check in parse_server_interfaces() (see below): BUG: KASAN: slab-out-of-bounds in parse_server_interfaces+0x14ee/0x1880 [cifs] Read of size 4 at addr ffff8881433dba98 by task mount/9827 CPU: 5 UID: 0 PID: 9827 Comm: mount Tainted: G OE 6.16.0-rc2-kasan #2 PREEMPT(voluntary) Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: Dell Inc. Precision Tower 3620/0MWYPT, BIOS 2.13.1 06/14/2019 Call Trace: dump_stack_lvl+0x9f/0xf0 print_report+0xd1/0x670 __virt_addr_valid+0x22c/0x430 ? parse_server_interfaces+0x14ee/0x1880 [cifs] ? kasan_complete_mode_report_info+0x2a/0x1f0 ? parse_server_interfaces+0x14ee/0x1880 [cifs] kasan_report+0xd6/0x110 parse_server_interfaces+0x14ee/0x1880 [cifs] __asan_report_load_n_noabort+0x13/0x20 parse_server_interfaces+0x14ee/0x1880 [cifs] ? __pfx_parse_server_interfaces+0x10/0x10 [cifs] ? trace_hardirqs_on+0x51/0x60 SMB3_request_interfaces+0x1ad/0x3f0 [cifs] ? __pfx_SMB3_request_interfaces+0x10/0x10 [cifs] ? SMB2_tcon+0x23c/0x15d0 [cifs] smb3_qfs_tcon+0x173/0x2b0 [cifs] ? __pfx_smb3_qfs_tcon+0x10/0x10 [cifs] ? cifs_get_tcon+0x105d/0x2120 [cifs] ? do_raw_spin_unlock+0x5d/0x200 ? cifs_get_tcon+0x105d/0x2120 [cifs] ? __pfx_smb3_qfs_tcon+0x10/0x10 [cifs] cifs_mount_get_tcon+0x369/0xb90 [cifs] ? dfs_cache_find+0xe7/0x150 [cifs] dfs_mount_share+0x985/0x2970 [cifs] ? check_path.constprop.0+0x28/0x50 ? save_trace+0x54/0x370 ? __pfx_dfs_mount_share+0x10/0x10 [cifs] ? __lock_acquire+0xb82/0x2ba0 ? __kasan_check_write+0x18/0x20 cifs_mount+0xbc/0x9e0 [cifs] ? __pfx_cifs_mount+0x10/0x10 [cifs] ? do_raw_spin_unlock+0x5d/0x200 ? cifs_setup_cifs_sb+0x29d/0x810 [cifs] cifs_smb3_do_mount+0x263/0x1990 [cifs] Reported-by: Namjae Jeon Tested-by: Namjae Jeon Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ad8947434b71..218b6ce7ff3a 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -772,6 +772,13 @@ next_iface: bytes_left -= sizeof(*p); break; } + /* Validate that Next doesn't point beyond the buffer */ + if (next > bytes_left) { + cifs_dbg(VFS, "%s: invalid Next pointer %zu > %zd\n", + __func__, next, bytes_left); + rc = -EINVAL; + goto out; + } p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); bytes_left -= next; } @@ -783,7 +790,9 @@ next_iface: } /* Azure rounds the buffer size up 8, to a 16 byte boundary */ - if ((bytes_left > 8) || p->Next) + if ((bytes_left > 8) || + (bytes_left >= offsetof(struct network_interface_info_ioctl_rsp, Next) + + sizeof(p->Next) && p->Next)) cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); ses->iface_last_update = jiffies; From e3835731e169a48a2c73018d135b5c08c39ea61d Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Mon, 11 Aug 2025 22:07:37 +0800 Subject: [PATCH 119/246] smb: client: fix mid_q_entry memleak leak with per-mid locking This is step 4/4 of a patch series to fix mid_q_entry memory leaks caused by race conditions in callback execution. In compound_send_recv(), when wait_for_response() is interrupted by signals, the code attempts to cancel pending requests by changing their callbacks to cifs_cancelled_callback. However, there's a race condition between signal interruption and network response processing that causes both mid_q_entry and server buffer leaks: ``` User foreground process cifsd cifs_readdir open_cached_dir cifs_send_recv compound_send_recv smb2_setup_request smb2_mid_entry_alloc smb2_get_mid_entry smb2_mid_entry_alloc mempool_alloc // alloc mid kref_init(&temp->refcount); // refcount = 1 mid[0]->callback = cifs_compound_callback; mid[1]->callback = cifs_compound_last_callback; smb_send_rqst rc = wait_for_response wait_event_state TASK_KILLABLE cifs_demultiplex_thread allocate_buffers server->bigbuf = cifs_buf_get() standard_receive3 ->find_mid() smb2_find_mid __smb2_find_mid kref_get(&mid->refcount) // +1 cifs_handle_standard handle_mid /* bigbuf will also leak */ mid->resp_buf = server->bigbuf server->bigbuf = NULL; dequeue_mid /* in for loop */ mids[0]->callback cifs_compound_callback /* Signal interrupts wait: rc = -ERESTARTSYS */ /* if (... || midQ[i]->mid_state == MID_RESPONSE_RECEIVED) *? midQ[0]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; /* The change comes too late */ mid->mid_state = MID_RESPONSE_READY release_mid // -1 /* cancelled_mid[i] == true causes mid won't be released in compound_send_recv cleanup */ /* cifs_cancelled_callback won't executed to release mid */ ``` The root cause is that there's a race between callback assignment and execution. Fix this by introducing per-mid locking: - Add spinlock_t mid_lock to struct mid_q_entry - Add mid_execute_callback() for atomic callback execution - Use mid_lock in cancellation paths to ensure atomicity This ensures that either the original callback or the cancellation callback executes atomically, preventing reference count leaks when requests are interrupted by signals. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220404 Fixes: ee258d79159a ("CIFS: Move credit processing to mid callbacks for SMB3") Signed-off-by: Wang Zhaolong Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 21 +++++++++++++++++++++ fs/smb/client/cifstransport.c | 19 +++++++++---------- fs/smb/client/connect.c | 8 ++++---- fs/smb/client/smb2ops.c | 4 ++-- fs/smb/client/smb2transport.c | 1 + fs/smb/client/transport.c | 7 +++---- 6 files changed, 40 insertions(+), 20 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index e6830ab3a546..1e64a4fb6af0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1732,6 +1732,7 @@ struct mid_q_entry { int mid_rc; /* rc for MID_RC */ __le16 command; /* smb command code */ unsigned int optype; /* operation type */ + spinlock_t mid_lock; bool wait_cancelled:1; /* Cancelled while waiting for response */ bool deleted_from_q:1; /* Whether Mid has been dequeued frem pending_mid_q */ bool large_buf:1; /* if valid response, is pointer to large buf */ @@ -2036,6 +2037,9 @@ require use of the stronger protocol */ * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled + * mid_q_entry->mid_lock mid_q_entry->callback alloc_mid + * smb2_mid_entry_alloc + * (Any fields of mid_q_entry that will need protection) ****************************************************************************/ #ifdef DECLARE_GLOBALS_HERE @@ -2375,6 +2379,23 @@ static inline bool cifs_netbios_name(const char *name, size_t namelen) return ret; } +/* + * Execute mid callback atomically - ensures callback runs exactly once + * and prevents sleeping in atomic context. + */ +static inline void mid_execute_callback(struct mid_q_entry *mid) +{ + void (*callback)(struct mid_q_entry *mid); + + spin_lock(&mid->mid_lock); + callback = mid->callback; + mid->callback = NULL; /* Mark as executed, */ + spin_unlock(&mid->mid_lock); + + if (callback) + callback(mid); +} + #define CIFS_REPARSE_SUPPORT(tcon) \ ((tcon)->posix_extensions || \ (le32_to_cpu((tcon)->fsAttrInfo.Attributes) & \ diff --git a/fs/smb/client/cifstransport.c b/fs/smb/client/cifstransport.c index 352dafb888dd..e98b95eff8c9 100644 --- a/fs/smb/client/cifstransport.c +++ b/fs/smb/client/cifstransport.c @@ -46,6 +46,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); + spin_lock_init(&temp->mid_lock); temp->mid = get_mid(smb_buffer); temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); @@ -345,16 +346,15 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, rc = wait_for_response(server, midQ); if (rc != 0) { send_cancel(server, &rqst, midQ); - spin_lock(&server->mid_queue_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED || - midQ->mid_state == MID_RESPONSE_RECEIVED) { + spin_lock(&midQ->mid_lock); + if (midQ->callback) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); add_credits(server, &credits, 0); return rc; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); } rc = cifs_sync_mid_result(midQ, server); @@ -527,15 +527,14 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, rc = wait_for_response(server, midQ); if (rc) { send_cancel(server, &rqst, midQ); - spin_lock(&server->mid_queue_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED || - midQ->mid_state == MID_RESPONSE_RECEIVED) { + spin_lock(&midQ->mid_lock); + if (midQ->callback) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); return rc; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ->mid_lock); } /* We got the response - restart system call. */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 587845a2452d..281ccbeea719 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -335,7 +335,7 @@ cifs_abort_connection(struct TCP_Server_Info *server) cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_entry_safe(mid, nmid, &retry_list, qhead) { list_del_init(&mid->qhead); - mid->callback(mid); + mid_execute_callback(mid); release_mid(mid); } @@ -919,7 +919,7 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) list_del_init(&mid->qhead); mid->mid_rc = mid_rc; mid->mid_state = MID_RC; - mid->callback(mid); + mid_execute_callback(mid); release_mid(mid); } @@ -1117,7 +1117,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid); list_del_init(&mid_entry->qhead); - mid_entry->callback(mid_entry); + mid_execute_callback(mid_entry); release_mid(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ @@ -1394,7 +1394,7 @@ next_pdu: } if (!mids[i]->multiRsp || mids[i]->multiEnd) - mids[i]->callback(mids[i]); + mid_execute_callback(mids[i]); release_mid(mids[i]); } else if (server->ops->is_oplock_break && diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 218b6ce7ff3a..3b251de874ec 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4814,7 +4814,7 @@ static void smb2_decrypt_offload(struct work_struct *work) dw->server->ops->is_network_name_deleted(dw->buf, dw->server); - mid->callback(mid); + mid_execute_callback(mid); } else { spin_lock(&dw->server->srv_lock); if (dw->server->tcpStatus == CifsNeedReconnect) { @@ -4822,7 +4822,7 @@ static void smb2_decrypt_offload(struct work_struct *work) mid->mid_state = MID_RETRY_NEEDED; spin_unlock(&dw->server->mid_queue_lock); spin_unlock(&dw->server->srv_lock); - mid->callback(mid); + mid_execute_callback(mid); } else { spin_lock(&dw->server->mid_queue_lock); mid->mid_state = MID_REQUEST_SUBMITTED; diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index ff9ef7fcd010..bc0e92eb2b64 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -771,6 +771,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr, temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); + spin_lock_init(&temp->mid_lock); temp->mid = le64_to_cpu(shdr->MessageId); temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 32d528b4dd83..a61ba7f3fb86 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1005,15 +1005,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n", midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(server, &rqst[i], midQ[i]); - spin_lock(&server->mid_queue_lock); + spin_lock(&midQ[i]->mid_lock); midQ[i]->wait_cancelled = true; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || - midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { + if (midQ[i]->callback) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; } - spin_unlock(&server->mid_queue_lock); + spin_unlock(&midQ[i]->mid_lock); } } From 8c48e1c7520321cc87ff651e96093e2f412785fb Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 18:45:06 +0200 Subject: [PATCH 120/246] smb: client: don't wait for info->send_pending == 0 on error We already called ib_drain_qp() before and that makes sure send_done() was called with IB_WC_WR_FLUSH_ERR, but didn't called atomic_dec_and_test(&sc->send_io.pending.count) So we may never reach the info->send_pending == 0 condition. Cc: Steve French Cc: Tom Talpey Cc: Long Li Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 5349ae5e05fa ("smb: client: let send_done() cleanup before calling smbd_disconnect_rdma_connection()") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index c628e91c328b..02d6db431fd4 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1337,10 +1337,6 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "cancelling idle timer\n"); cancel_delayed_work_sync(&info->idle_timer_work); - log_rdma_event(INFO, "wait for all send posted to IB to finish\n"); - wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); - /* It's not possible for upper layer to get to reassembly */ log_rdma_event(INFO, "drain the reassembly queue\n"); do { @@ -1986,7 +1982,11 @@ int smbd_send(struct TCP_Server_Info *server, */ wait_event(info->wait_send_pending, - atomic_read(&info->send_pending) == 0); + atomic_read(&info->send_pending) == 0 || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) + rc = -EAGAIN; return rc; } From e3f776d30a56286aaf882b96c92e797b7587a6ab Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 9 Aug 2025 09:17:46 -0500 Subject: [PATCH 121/246] cifs: update internal version number to 2.56 Signed-off-by: Steve French --- fs/smb/client/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 487f39cff77e..3ce7c614ccc0 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 55 -#define CIFS_VERSION "2.55" +#define SMB3_PRODUCT_BUILD 56 +#define CIFS_VERSION "2.56" #endif /* _CIFSFS_H */ From e19d8dd694d261ac26adb2a26121a37c107c81ad Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Fri, 1 Aug 2025 17:07:24 +0800 Subject: [PATCH 122/246] smb: client: remove redundant lstrp update in negotiate protocol Commit 34331d7beed7 ("smb: client: fix first command failure during re-negotiation") addressed a race condition by updating lstrp before entering negotiate state. However, this approach may have some unintended side effects. The lstrp field is documented as "when we got last response from this server", and updating it before actually receiving a server response could potentially affect other mechanisms that rely on this timestamp. For example, the SMB echo detection logic also uses lstrp as a reference point. In scenarios with frequent user operations during reconnect states, the repeated calls to cifs_negotiate_protocol() might continuously update lstrp, which could interfere with the echo detection timing. Additionally, commit 266b5d02e14f ("smb: client: fix race condition in negotiate timeout by using more precise timing") introduced a dedicated neg_start field specifically for tracking negotiate start time. This provides a more precise solution for the original race condition while preserving the intended semantics of lstrp. Since the race condition is now properly handled by the neg_start mechanism, the lstrp update in cifs_negotiate_protocol() is no longer necessary and can be safely removed. Fixes: 266b5d02e14f ("smb: client: fix race condition in negotiate timeout by using more precise timing") Cc: stable@vger.kernel.org Acked-by: Paulo Alcantara (Red Hat) Signed-off-by: Wang Zhaolong Signed-off-by: Steve French --- fs/smb/client/connect.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 281ccbeea719..dd12f3eb61dc 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4205,7 +4205,6 @@ retry: return 0; } - server->lstrp = jiffies; server->tcpStatus = CifsInNegotiate; server->neg_start = jiffies; spin_unlock(&server->srv_lock); From 757fc66da91b54d4fbc414bee5c440b52560d3b7 Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Tue, 12 Aug 2025 10:17:09 +0800 Subject: [PATCH 123/246] ext4: fix incorrect function name in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 6b730a405037 “ext4: hoist ext4_block_write_begin and replace the __block_write_begin”, the comment should be updated accordingly from "__block_write_begin" to "ext4_block_write_begin". Fixes: 6b730a405037 (“ext4: hoist ext4_block_write_begin and replace...") Signed-off-by: Baolin Liu Reviewed-by: Darrick J. Wong Link: https://patch.msgid.link/20250812021709.1120716-1-liubaolin12138@163.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 731a800d9c14..238a0f12a5c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3159,7 +3159,7 @@ retry: folio_unlock(folio); folio_put(folio); /* - * block_write_begin may have instantiated a few blocks + * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ From 9d98cf4632258720f18265a058e62fde120c0151 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 12 Aug 2025 14:37:52 +0800 Subject: [PATCH 124/246] jbd2: prevent softlockup in jbd2_log_do_checkpoint() Both jbd2_log_do_checkpoint() and jbd2_journal_shrink_checkpoint_list() periodically release j_list_lock after processing a batch of buffers to avoid long hold times on the j_list_lock. However, since both functions contend for j_list_lock, the combined time spent waiting and processing can be significant. jbd2_journal_shrink_checkpoint_list() explicitly calls cond_resched() when need_resched() is true to avoid softlockups during prolonged operations. But jbd2_log_do_checkpoint() only exits its loop when need_resched() is true, relying on potentially sleeping functions like __flush_batch() or wait_on_buffer() to trigger rescheduling. If those functions do not sleep, the kernel may hit a softlockup. watchdog: BUG: soft lockup - CPU#3 stuck for 156s! [kworker/u129:2:373] CPU: 3 PID: 373 Comm: kworker/u129:2 Kdump: loaded Not tainted 6.6.0+ #10 Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.27 06/13/2017 Workqueue: writeback wb_workfn (flush-7:2) pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : native_queued_spin_lock_slowpath+0x358/0x418 lr : jbd2_log_do_checkpoint+0x31c/0x438 [jbd2] Call trace: native_queued_spin_lock_slowpath+0x358/0x418 jbd2_log_do_checkpoint+0x31c/0x438 [jbd2] __jbd2_log_wait_for_space+0xfc/0x2f8 [jbd2] add_transaction_credits+0x3bc/0x418 [jbd2] start_this_handle+0xf8/0x560 [jbd2] jbd2__journal_start+0x118/0x228 [jbd2] __ext4_journal_start_sb+0x110/0x188 [ext4] ext4_do_writepages+0x3dc/0x740 [ext4] ext4_writepages+0xa4/0x190 [ext4] do_writepages+0x94/0x228 __writeback_single_inode+0x48/0x318 writeback_sb_inodes+0x204/0x590 __writeback_inodes_wb+0x54/0xf8 wb_writeback+0x2cc/0x3d8 wb_do_writeback+0x2e0/0x2f8 wb_workfn+0x80/0x2a8 process_one_work+0x178/0x3e8 worker_thread+0x234/0x3b8 kthread+0xf0/0x108 ret_from_fork+0x10/0x20 So explicitly call cond_resched() in jbd2_log_do_checkpoint() to avoid softlockup. Cc: stable@kernel.org Signed-off-by: Baokun Li Link: https://patch.msgid.link/20250812063752.912130-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index b3971e91e8eb..38861ca04899 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -285,6 +285,7 @@ restart: retry: if (batch_count) __flush_batch(journal, &batch_count); + cond_resched(); spin_lock(&journal->j_list_lock); goto restart; } From d832ccbc301fbd9e5a1d691bdcf461cdb514595f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 14 Aug 2025 10:12:42 +0200 Subject: [PATCH 125/246] ALSA: usb-audio: Validate UAC3 power domain descriptors, too UAC3 power domain descriptors need to be verified with its variable bLength for avoiding the unexpected OOB accesses by malicious firmware, too. Fixes: 9a2fe9b801f5 ("ALSA: usb: initial USB Audio Device Class 3.0 support") Reported-and-tested-by: Youngjun Lee Cc: Link: https://patch.msgid.link/20250814081245.8902-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/validate.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/usb/validate.c b/sound/usb/validate.c index 6fe206f6e911..4f4e8e87a14c 100644 --- a/sound/usb/validate.c +++ b/sound/usb/validate.c @@ -221,6 +221,17 @@ static bool validate_uac3_feature_unit(const void *p, return d->bLength >= sizeof(*d) + 4 + 2; } +static bool validate_uac3_power_domain_unit(const void *p, + const struct usb_desc_validator *v) +{ + const struct uac3_power_domain_descriptor *d = p; + + if (d->bLength < sizeof(*d)) + return false; + /* baEntities[] + wPDomainDescrStr */ + return d->bLength >= sizeof(*d) + d->bNrEntities + 2; +} + static bool validate_midi_out_jack(const void *p, const struct usb_desc_validator *v) { @@ -285,6 +296,7 @@ static const struct usb_desc_validator audio_validators[] = { struct uac3_clock_multiplier_descriptor), /* UAC_VERSION_3, UAC3_SAMPLE_RATE_CONVERTER: not implemented yet */ /* UAC_VERSION_3, UAC3_CONNECTORS: not implemented yet */ + FUNC(UAC_VERSION_3, UAC3_POWER_DOMAIN, validate_uac3_power_domain_unit), { } /* terminator */ }; From ecfd41166b72b67d3bdeb88d224ff445f6163869 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 14 Aug 2025 10:12:43 +0200 Subject: [PATCH 126/246] ALSA: usb-audio: Validate UAC3 cluster segment descriptors UAC3 class segment descriptors need to be verified whether their sizes match with the declared lengths and whether they fit with the allocated buffer sizes, too. Otherwise malicious firmware may lead to the unexpected OOB accesses. Fixes: 11785ef53228 ("ALSA: usb-audio: Initial Power Domain support") Reported-and-tested-by: Youngjun Lee Cc: Link: https://patch.msgid.link/20250814081245.8902-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/stream.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/sound/usb/stream.c b/sound/usb/stream.c index ad6ced780634..acf3dc2d79e0 100644 --- a/sound/usb/stream.c +++ b/sound/usb/stream.c @@ -341,20 +341,28 @@ snd_pcm_chmap_elem *convert_chmap_v3(struct uac3_cluster_header_descriptor len = le16_to_cpu(cluster->wLength); c = 0; - p += sizeof(struct uac3_cluster_header_descriptor); + p += sizeof(*cluster); + len -= sizeof(*cluster); - while (((p - (void *)cluster) < len) && (c < channels)) { + while (len > 0 && (c < channels)) { struct uac3_cluster_segment_descriptor *cs_desc = p; u16 cs_len; u8 cs_type; + if (len < sizeof(*p)) + break; cs_len = le16_to_cpu(cs_desc->wLength); + if (len < cs_len) + break; cs_type = cs_desc->bSegmentType; if (cs_type == UAC3_CHANNEL_INFORMATION) { struct uac3_cluster_information_segment_descriptor *is = p; unsigned char map; + if (cs_len < sizeof(*is)) + break; + /* * TODO: this conversion is not complete, update it * after adding UAC3 values to asound.h @@ -456,6 +464,7 @@ snd_pcm_chmap_elem *convert_chmap_v3(struct uac3_cluster_header_descriptor chmap->map[c++] = map; } p += cs_len; + len -= cs_len; } if (channels < c) @@ -881,7 +890,7 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip, u64 badd_formats = 0; unsigned int num_channels; struct audioformat *fp; - u16 cluster_id, wLength; + u16 cluster_id, wLength, cluster_wLength; int clock = 0; int err; @@ -1011,6 +1020,16 @@ snd_usb_get_audioformat_uac3(struct snd_usb_audio *chip, return ERR_PTR(-EIO); } + cluster_wLength = le16_to_cpu(cluster->wLength); + if (cluster_wLength < sizeof(*cluster) || + cluster_wLength > wLength) { + dev_err(&dev->dev, + "%u:%d : invalid Cluster Descriptor size\n", + iface_no, altno); + kfree(cluster); + return ERR_PTR(-EIO); + } + num_channels = cluster->bNrChannels; chmap = convert_chmap_v3(cluster); kfree(cluster); From c345102d1feed3de8aa9b9ec7d18b3fbba62deb7 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Wed, 13 Aug 2025 18:08:42 +0800 Subject: [PATCH 127/246] ALSA: hda/tas2781: Normalize the volume kcontrol name Change the name of the kcontrol from "Gain" to "Volume". Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20250813100842.12224-1-baojun.xu@ti.com Signed-off-by: Takashi Iwai --- sound/hda/codecs/side-codecs/tas2781_hda_i2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c index 45ac5e41bd4f..06c7bc2b9e9d 100644 --- a/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c +++ b/sound/hda/codecs/side-codecs/tas2781_hda_i2c.c @@ -265,7 +265,7 @@ static const struct snd_kcontrol_new tas2770_snd_controls[] = { }; static const struct snd_kcontrol_new tas2781_snd_controls[] = { - ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Gain", TAS2781_AMP_LEVEL, + ACARD_SINGLE_RANGE_EXT_TLV("Speaker Analog Volume", TAS2781_AMP_LEVEL, 1, 0, 20, 0, tas2781_amp_getvol, tas2781_amp_putvol, amp_vol_tlv), ACARD_SINGLE_BOOL_EXT("Speaker Force Firmware Load", 0, From 35f6bedccf4c4280f02d48e4f7d194e64e9a62d8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 13 Aug 2025 18:08:53 +0900 Subject: [PATCH 128/246] ata: libata-eh: Fix link state check for IDE/PATA ports Commit 4371fe1ba400 ("ata: libata-eh: Avoid unnecessary resets when revalidating devices") replaced the call to ata_phys_link_offline() in ata_eh_revalidate_and_attach() with the new function ata_eh_link_established() which relaxes the checks on a device link state to account for low power mode transitions. However, this change assumed that the device port has a valid scr_read method to obtain the SStatus register for the port. This is not always the case, especially with older IDE/PATA adapters (e.g. PATA/IDE devices emulated with QEMU). For such adapter, ata_eh_link_established() will always return false, causing ata_eh_revalidate_and_attach() to go into its error path and ultimately to the device being disabled. Avoid this by restoring the previous behavior, which is to assume that the link is online if reading the port SStatus register fails. While at it, also fix the spelling of SStatus in the comment describing the function ata_eh_link_established(). Reported-by: Shin'ichiro Kawasaki Fixes: 4371fe1ba400 ("ata: libata-eh: Avoid unnecessary resets when revalidating devices") Signed-off-by: Damien Le Moal Tested-by: Shin'ichiro Kawasaki Reviewed-by: Niklas Cassel --- drivers/ata/libata-eh.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 2946ae6d4b2c..2586e77ebf45 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2075,7 +2075,7 @@ out: * Check if a link is established. This is a relaxed version of * ata_phys_link_online() which accounts for the fact that this is potentially * called after changing the link power management policy, which may not be - * reflected immediately in the SSTAUS register (e.g., we may still be seeing + * reflected immediately in the SStatus register (e.g., we may still be seeing * the PHY in partial, slumber or devsleep Partial power management state. * So check that: * - A device is still present, that is, DET is 1h (Device presence detected @@ -2089,8 +2089,13 @@ static bool ata_eh_link_established(struct ata_link *link) u32 sstatus; u8 det, ipm; + /* + * For old IDE/PATA adapters that do not have a valid scr_read method, + * or if reading the SStatus register fails, assume that the device is + * present. Device probe will determine if that is really the case. + */ if (sata_scr_read(link, SCR_STATUS, &sstatus)) - return false; + return true; det = sstatus & 0x0f; ipm = (sstatus >> 8) & 0x0f; From 58768b0563916ddcb73d8ed26ede664915f8df31 Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Wed, 13 Aug 2025 19:22:56 -0700 Subject: [PATCH 129/246] ata: libata-scsi: Fix CDL control Delete extra checks for the ATA_DFLAG_CDL_ENABLED flag that prevent SET FEATURES command from being issued to a drive when NCQ commands are active. ata_mselect_control_ata_feature() sets / clears the ATA_DFLAG_CDL_ENABLED flag during the translation of MODE SELECT to SET FEATURES. If SET FEATURES gets deferred due to outstanding NCQ commands, the original MODE SELECT command will be re-queued. When the re-queued MODE SELECT goes through the ata_mselect_control_ata_feature() translation again, SET FEATURES will not be issued because ATA_DFLAG_CDL_ENABLED has been already set or cleared by the initial translation of MODE SELECT. The ATA_DFLAG_CDL_ENABLED checks in ata_mselect_control_ata_feature() are safe to remove because scsi_cdl_enable() implements a similar logic that avoids enabling CDL if it has been enabled already. Fixes: 17e897a45675 ("ata: libata-scsi: Improve CDL control") Cc: stable@vger.kernel.org Signed-off-by: Igor Pylypiv Reviewed-by: Niklas Cassel Signed-off-by: Damien Le Moal --- drivers/ata/libata-scsi.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57f674f51b0c..2ded5e476d6e 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3904,21 +3904,16 @@ static int ata_mselect_control_ata_feature(struct ata_queued_cmd *qc, /* Check cdl_ctrl */ switch (buf[0] & 0x03) { case 0: - /* Disable CDL if it is enabled */ - if (!(dev->flags & ATA_DFLAG_CDL_ENABLED)) - return 0; + /* Disable CDL */ ata_dev_dbg(dev, "Disabling CDL\n"); cdl_action = 0; dev->flags &= ~ATA_DFLAG_CDL_ENABLED; break; case 0x02: /* - * Enable CDL if not already enabled. Since this is mutually - * exclusive with NCQ priority, allow this only if NCQ priority - * is disabled. + * Enable CDL. Since CDL is mutually exclusive with NCQ + * priority, allow this only if NCQ priority is disabled. */ - if (dev->flags & ATA_DFLAG_CDL_ENABLED) - return 0; if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED) { ata_dev_err(dev, "NCQ priority must be disabled to enable CDL\n"); From 94eae6ee4c2df2031bca586405e9ec36e0b9ccf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Tue, 27 May 2025 14:06:37 +0200 Subject: [PATCH 130/246] drm/xe/pf: Set VF LMEM BAR size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LMEM is partitioned between multiple VFs and we expect that the more VFs we have, the less LMEM is assigned to each VF. This means that we can achieve full LMEM BAR access without the need to attempt full VF LMEM BAR resize via pci_resize_resource(). Always try to set the largest possible BAR size that allows to fit the number of enabled VFs and inform the user in case the resize attempt is not successful. Signed-off-by: Michał Winiarski Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20250527120637.665506-7-michal.winiarski@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 32a4d1b98e6663101fd0abfaf151c48feea7abb1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/regs/xe_bars.h | 1 + drivers/gpu/drm/xe/xe_pci_sriov.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/gpu/drm/xe/regs/xe_bars.h b/drivers/gpu/drm/xe/regs/xe_bars.h index ce05b6ae832f..880140d6ccdc 100644 --- a/drivers/gpu/drm/xe/regs/xe_bars.h +++ b/drivers/gpu/drm/xe/regs/xe_bars.h @@ -7,5 +7,6 @@ #define GTTMMADR_BAR 0 /* MMIO + GTT */ #define LMEM_BAR 2 /* VRAM */ +#define VF_LMEM_BAR 9 /* VF VRAM */ #endif diff --git a/drivers/gpu/drm/xe/xe_pci_sriov.c b/drivers/gpu/drm/xe/xe_pci_sriov.c index 447a7867eecb..af05db07162e 100644 --- a/drivers/gpu/drm/xe/xe_pci_sriov.c +++ b/drivers/gpu/drm/xe/xe_pci_sriov.c @@ -3,6 +3,10 @@ * Copyright © 2023-2024 Intel Corporation */ +#include +#include + +#include "regs/xe_bars.h" #include "xe_assert.h" #include "xe_device.h" #include "xe_gt_sriov_pf_config.h" @@ -128,6 +132,18 @@ static void pf_engine_activity_stats(struct xe_device *xe, unsigned int num_vfs, } } +static int resize_vf_vram_bar(struct xe_device *xe, int num_vfs) +{ + struct pci_dev *pdev = to_pci_dev(xe->drm.dev); + u32 sizes; + + sizes = pci_iov_vf_bar_get_sizes(pdev, VF_LMEM_BAR, num_vfs); + if (!sizes) + return 0; + + return pci_iov_vf_bar_set_size(pdev, VF_LMEM_BAR, __fls(sizes)); +} + static int pf_enable_vfs(struct xe_device *xe, int num_vfs) { struct pci_dev *pdev = to_pci_dev(xe->drm.dev); @@ -158,6 +174,12 @@ static int pf_enable_vfs(struct xe_device *xe, int num_vfs) if (err < 0) goto failed; + if (IS_DGFX(xe)) { + err = resize_vf_vram_bar(xe, num_vfs); + if (err) + xe_sriov_info(xe, "Failed to set VF LMEM BAR size: %d\n", err); + } + err = pci_enable_sriov(pdev, num_vfs); if (err < 0) goto failed; From 1548549e17e374a126e9a4e9edab8bb041fbd67e Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 13 Aug 2025 13:16:33 +0200 Subject: [PATCH 131/246] MAINTAINERS: update s390/net Remove Thorsten Winkler as maintainer and add Aswin Karuvally as reviewer. Thank you Thorsten for your support, welcome Aswin! Signed-off-by: Alexandra Winter Acked-by: Thorsten Winkler Acked-by: Aswin Karuvally Link: https://patch.msgid.link/20250813111633.241111-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..2720544cd91f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22174,7 +22174,7 @@ F: arch/s390/mm S390 NETWORK DRIVERS M: Alexandra Winter -M: Thorsten Winkler +R: Aswin Karuvally L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported From fd980bf6e9cdae885105685259421164f843ca55 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Wed, 13 Aug 2025 19:25:59 +0530 Subject: [PATCH 132/246] net: xilinx: axienet: Fix RX skb ring management in DMAengine mode Submit multiple descriptors in axienet_rx_cb() to fill Rx skb ring. This ensures the ring "catches up" on previously missed allocations. Increment Rx skb ring head pointer after BD is successfully allocated. Previously, head pointer was incremented before verifying if descriptor is successfully allocated and has valid entries, which could lead to ring state inconsistency if descriptor setup failed. These changes improve reliability by maintaining adequate descriptor availability and ensuring proper ring buffer state management. Fixes: 6a91b846af85 ("net: axienet: Introduce dmaengine support") Signed-off-by: Suraj Gupta Link: https://patch.msgid.link/20250813135559.1555652-1-suraj.gupta2@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 6011d7eae0c7..0d8a05fe541a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1160,6 +1160,7 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) struct axienet_local *lp = data; struct sk_buff *skb; u32 *app_metadata; + int i; skbuf_dma = axienet_get_rx_desc(lp, lp->rx_ring_tail++); skb = skbuf_dma->skb; @@ -1178,7 +1179,10 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) u64_stats_add(&lp->rx_packets, 1); u64_stats_add(&lp->rx_bytes, rx_len); u64_stats_update_end(&lp->rx_stat_sync); - axienet_rx_submit_desc(lp->ndev); + + for (i = 0; i < CIRC_SPACE(lp->rx_ring_head, lp->rx_ring_tail, + RX_BUF_NUM_DEFAULT); i++) + axienet_rx_submit_desc(lp->ndev); dma_async_issue_pending(lp->rx_chan); } @@ -1457,7 +1461,6 @@ static void axienet_rx_submit_desc(struct net_device *ndev) if (!skbuf_dma) return; - lp->rx_ring_head++; skb = netdev_alloc_skb(ndev, lp->max_frm_size); if (!skb) return; @@ -1482,6 +1485,7 @@ static void axienet_rx_submit_desc(struct net_device *ndev) skbuf_dma->desc = dma_rx_desc; dma_rx_desc->callback_param = lp; dma_rx_desc->callback_result = axienet_dma_rx_cb; + lp->rx_ring_head++; dmaengine_submit(dma_rx_desc); return; From d1547bf460baec718b3398365f8de33d25c5f36f Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Wed, 13 Aug 2025 10:10:54 +0800 Subject: [PATCH 133/246] net: bridge: fix soft lockup in br_multicast_query_expired() When set multicast_query_interval to a large value, the local variable 'time' in br_multicast_send_query() may overflow. If the time is smaller than jiffies, the timer will expire immediately, and then call mod_timer() again, which creates a loop and may trigger the following soft lockup issue. watchdog: BUG: soft lockup - CPU#1 stuck for 221s! [rb_consumer:66] CPU: 1 UID: 0 PID: 66 Comm: rb_consumer Not tainted 6.16.0+ #259 PREEMPT(none) Call Trace: __netdev_alloc_skb+0x2e/0x3a0 br_ip6_multicast_alloc_query+0x212/0x1b70 __br_multicast_send_query+0x376/0xac0 br_multicast_send_query+0x299/0x510 br_multicast_query_expired.constprop.0+0x16d/0x1b0 call_timer_fn+0x3b/0x2a0 __run_timers+0x619/0x950 run_timer_softirq+0x11c/0x220 handle_softirqs+0x18e/0x560 __irq_exit_rcu+0x158/0x1a0 sysvec_apic_timer_interrupt+0x76/0x90 This issue can be reproduced with: ip link add br0 type bridge echo 1 > /sys/class/net/br0/bridge/multicast_querier echo 0xffffffffffffffff > /sys/class/net/br0/bridge/multicast_query_interval ip link set dev br0 up The multicast_startup_query_interval can also cause this issue. Similar to the commit 99b40610956a ("net: bridge: mcast: add and enforce query interval minimum"), add check for the query interval maximum to fix this issue. Link: https://lore.kernel.org/netdev/20250806094941.1285944-1-wangliang74@huawei.com/ Link: https://lore.kernel.org/netdev/20250812091818.542238-1-wangliang74@huawei.com/ Fixes: d902eee43f19 ("bridge: Add multicast count/interval sysfs entries") Suggested-by: Nikolay Aleksandrov Signed-off-by: Wang Liang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250813021054.1643649-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 16 ++++++++++++++++ net/bridge/br_private.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1377f31b719c..8ce145938b02 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4818,6 +4818,14 @@ void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX; + } + brmctx->multicast_query_interval = intvl_jiffies; } @@ -4834,6 +4842,14 @@ void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX; + } + brmctx->multicast_startup_query_interval = intvl_jiffies; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b159aae594c0..8de0904b9627 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN +#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */ +#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX #define BR_HWDOM_MAX BITS_PER_LONG From 52bf272636bda69587952b35ae97690b8dc89941 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:57:57 +0000 Subject: [PATCH 134/246] net/sched: Fix backlog accounting in qdisc_dequeue_internal This issue applies for the following qdiscs: hhf, fq, fq_codel, and fq_pie, and occurs in their change handlers when adjusting to the new limit. The problem is the following in the values passed to the subsequent qdisc_tree_reduce_backlog call given a tbf parent: When the tbf parent runs out of tokens, skbs of these qdiscs will be placed in gso_skb. Their peek handlers are qdisc_peek_dequeued, which accounts for both qlen and backlog. However, in the case of qdisc_dequeue_internal, ONLY qlen is accounted for when pulling from gso_skb. This means that these qdiscs are missing a qdisc_qstats_backlog_dec when dropping packets to satisfy the new limit in their change handlers. One can observe this issue with the following (with tc patched to support a limit of 0): export TARGET=fq tc qdisc del dev lo root tc qdisc add dev lo root handle 1: tbf rate 8bit burst 100b latency 1ms tc qdisc replace dev lo handle 3: parent 1:1 $TARGET limit 1000 echo ''; echo 'add child'; tc -s -d qdisc show dev lo ping -I lo -f -c2 -s32 -W0.001 127.0.0.1 2>&1 >/dev/null echo ''; echo 'after ping'; tc -s -d qdisc show dev lo tc qdisc change dev lo handle 3: parent 1:1 $TARGET limit 0 echo ''; echo 'after limit drop'; tc -s -d qdisc show dev lo tc qdisc replace dev lo handle 2: parent 1:1 sfq echo ''; echo 'post graft'; tc -s -d qdisc show dev lo The second to last show command shows 0 packets but a positive number (74) of backlog bytes. The problem becomes clearer in the last show command, where qdisc_purge_queue triggers qdisc_tree_reduce_backlog with the positive backlog and causes an underflow in the tbf parent's backlog (4096 Mb instead of 0). To fix this issue, the codepath for all clients of qdisc_dequeue_internal has been simplified: codel, pie, hhf, fq, fq_pie, and fq_codel. qdisc_dequeue_internal handles the backlog adjustments for all cases that do not directly use the dequeue handler. The old fq_codel_change limit adjustment loop accumulated the arguments to the subsequent qdisc_tree_reduce_backlog call through the cstats field. However, this is confusing and error prone as fq_codel_dequeue could also potentially mutate this field (which qdisc_dequeue_internal calls in the non gso_skb case), so we have unified the code here with other qdiscs. Fixes: 2d3cbfd6d54a ("net_sched: Flush gso_skb list too during ->change()") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235725.45243-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 11 ++++++++--- net/sched/sch_codel.c | 12 +++++++----- net/sched/sch_fq.c | 12 +++++++----- net/sched/sch_fq_codel.c | 12 +++++++----- net/sched/sch_fq_pie.c | 12 +++++++----- net/sched/sch_hhf.c | 12 +++++++----- net/sched/sch_pie.c | 12 +++++++----- 7 files changed, 50 insertions(+), 33 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 638948be4c50..738cd5b13c62 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1038,12 +1038,17 @@ static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool dir skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); return skb; } - if (direct) - return __qdisc_dequeue_head(&sch->q); - else + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { return sch->dequeue(sch); + } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c93761040c6e..fa0314679e43 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { static int codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, @@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->params.ecn, !!nla_get_u32(tb[TCA_CODEL_ECN])); - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 902ff5470607..fee922da2f99 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1013,11 +1013,11 @@ static int fq_load_priomap(struct fq_sched_data *q, static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; - int err, drop_count = 0; - unsigned drop_len = 0; u32 fq_log; + int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, NULL); @@ -1135,16 +1135,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_resize(sch, fq_log); sch_tree_lock(sch); } + while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; - drop_len += qdisc_pkt_len(skb); + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - drop_count++; } - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a0f3a513bfa..a14142392939 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; @@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - q->cstats.drop_len += qdisc_pkt_len(skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - q->cstats.drop_count++; } - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); - q->cstats.drop_count = 0; - q->cstats.drop_len = 0; + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index b0e34daf1f75..7b96bc3ff891 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -287,10 +287,9 @@ begin: static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; - unsigned int len_dropped = 0; - unsigned int num_dropped = 0; int err; err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); @@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - len_dropped += qdisc_pkt_len(skb); - num_dropped += 1; + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 5aa434b46707..2d4855e28a28 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, usecs_to_jiffies(us)); } - qlen = sch->q.qlen; - prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, - prev_backlog - sch->qstats.backlog); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ad46ee3ed5a9..0a377313b6a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, @@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; From 8c06cbdcbaea34d7b96d76df4d6669275c1d291a Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:58:26 +0000 Subject: [PATCH 135/246] selftests/tc-testing: Check backlog stats in gso_skb case Add tests to ensure proper backlog accounting in hhf, codel, pie, fq, fq_pie, and fq_codel qdiscs. We check for the bug pattern originally found in fq, fq_pie, and fq_codel, which was an underflow in the tbf parent backlog stats upon child qdisc removal. Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235808.45281-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 23a61e5b99d0..998e5a2f4579 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -185,6 +185,204 @@ "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] }, + { + "id": "34c0", + "name": "Test TBF with HHF Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "hhf" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 hhf limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 hhf limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "fd68", + "name": "Test TBF with CODEL Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "514e", + "name": "Test TBF with PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "6c97", + "name": "Test TBF with FQ Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "5d0b", + "name": "Test TBF with FQ_CODEL Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq_codel" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_codel limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_codel limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "21c3", + "name": "Test TBF with FQ_PIE Backlog Accounting in gso_skb case against underflow", + "category": [ + "qdisc", + "tbf", + "fq_pie" + ], + "plugins": { + "requires": [ + "nsPlugin" + ] + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 8bit burst 100b latency 100ms", + "$TC qdisc replace dev $DUMMY handle 2: parent 1:1 fq_pie limit 1000", + [ + "ping -I $DUMMY -c2 10.10.11.11", + 1 + ], + "$TC qdisc change dev $DUMMY handle 2: parent 1:1 fq_pie limit 1" + ], + "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 2: parent 1:1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "backlog 0b 0p", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, { "id": "a4bb", "name": "Test FQ_CODEL with HTB parent - force packet drop with empty queue", From 065c31f2c6915b38f45b1c817b31f41f62eaa774 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Wed, 13 Aug 2025 15:16:31 +0800 Subject: [PATCH 136/246] rtase: Fix Rx descriptor CRC error bit definition The CRC error bit is located at bit 17 in the Rx descriptor, but the driver was incorrectly using bit 16. Fix it. Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Justin Lai Link: https://patch.msgid.link/20250813071631.7566-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index 20decdeb9fdb..b9209eb6ea73 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -241,7 +241,7 @@ union rtase_rx_desc { #define RTASE_RX_RES BIT(20) #define RTASE_RX_RUNT BIT(19) #define RTASE_RX_RWT BIT(18) -#define RTASE_RX_CRC BIT(16) +#define RTASE_RX_CRC BIT(17) #define RTASE_RX_V6F BIT(31) #define RTASE_RX_V4F BIT(30) #define RTASE_RX_UDPT BIT(29) From d73915fdc0011d536c03856be7ec451f6a5fb4ff Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 15:42:18 -0700 Subject: [PATCH 137/246] lib/crypto: sha: Update Kconfig help for SHA1 and SHA256 Update the help text for CRYPTO_LIB_SHA1 and CRYPTO_LIB_SHA256 to reflect the addition of HMAC support, and to be consistent with CRYPTO_LIB_SHA512. Link: https://lore.kernel.org/r/20250731224218.137947-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c2b65b6a9bb6..1e6b008f8fca 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -140,8 +140,8 @@ config CRYPTO_LIB_CHACHA20POLY1305 config CRYPTO_LIB_SHA1 tristate help - The SHA-1 library functions. Select this if your module uses any of - the functions from . + The SHA-1 and HMAC-SHA1 library functions. Select this if your module + uses any of the functions from . config CRYPTO_LIB_SHA1_ARCH bool @@ -157,9 +157,9 @@ config CRYPTO_LIB_SHA1_ARCH config CRYPTO_LIB_SHA256 tristate help - Enable the SHA-256 library interface. This interface may be fulfilled - by either the generic implementation or an arch-specific one, if one - is available and enabled. + The SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions. + Select this if your module uses any of these functions from + . config CRYPTO_LIB_SHA256_ARCH bool From fd7e5de4b2eddd34e3567cd419812d8869ef4f13 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 14 Aug 2025 02:51:57 -0400 Subject: [PATCH 138/246] lib/crypto: ensure generated *.S files are removed on make clean make clean does not check the kernel config when removing files. As such, additions to clean-files under CONFIG_ARM or CONFIG_ARM64 are not evaluated. For example, when building on arm64, this means that lib/crypto/arm64/sha{256,512}-core.S are left over after make clean. Set clean-files unconditionally to ensure that make clean removes these files. Fixes: e96cb9507f2d ("lib/crypto: sha256: Consolidate into single module") Fixes: 24c91b62ac50 ("lib/crypto: arm/sha512: Migrate optimized SHA-512 code to library") Fixes: 60e3f1e9b7a5 ("lib/crypto: arm64/sha512: Migrate optimized SHA-512 code to library") Signed-off-by: Tal Zussman Link: https://lore.kernel.org/r/20250814-crypto_clean-v2-1-659a2dc86302@columbia.edu Signed-off-by: Eric Biggers --- lib/crypto/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index e4151be2ebd4..539d5d59a50e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -100,7 +100,6 @@ ifeq ($(CONFIG_ARM),y) libsha256-y += arm/sha256-ce.o arm/sha256-core.o $(obj)/arm/sha256-core.S: $(src)/arm/sha256-armv4.pl $(call cmd,perlasm) -clean-files += arm/sha256-core.S AFLAGS_arm/sha256-core.o += $(aflags-thumb2-y) endif @@ -108,7 +107,6 @@ ifeq ($(CONFIG_ARM64),y) libsha256-y += arm64/sha256-core.o $(obj)/arm64/sha256-core.S: $(src)/arm64/sha2-armv8.pl $(call cmd,perlasm_with_args) -clean-files += arm64/sha256-core.S libsha256-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha256-ce.o endif @@ -132,7 +130,6 @@ ifeq ($(CONFIG_ARM),y) libsha512-y += arm/sha512-core.o $(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl $(call cmd,perlasm) -clean-files += arm/sha512-core.S AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y) endif @@ -140,7 +137,6 @@ ifeq ($(CONFIG_ARM64),y) libsha512-y += arm64/sha512-core.o $(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl $(call cmd,perlasm_with_args) -clean-files += arm64/sha512-core.S libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o endif @@ -167,3 +163,7 @@ obj-$(CONFIG_PPC) += powerpc/ obj-$(CONFIG_RISCV) += riscv/ obj-$(CONFIG_S390) += s390/ obj-$(CONFIG_X86) += x86/ + +# clean-files must be defined unconditionally +clean-files += arm/sha256-core.S arm/sha512-core.S +clean-files += arm64/sha256-core.S arm64/sha512-core.S From de5d7d3f27ddd4046736f558a40e252ddda82013 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 28 Jul 2025 17:08:44 +0800 Subject: [PATCH 139/246] Bluetooth: hci_sync: Avoid adding default advertising on startup list_empty(&hdev->adv_instances) is always true during startup, so an advertising instance is added by default. Call trace: dump_backtrace+0x94/0xec show_stack+0x18/0x24 dump_stack_lvl+0x48/0x60 dump_stack+0x18/0x24 hci_setup_ext_adv_instance_sync+0x17c/0x328 hci_powered_update_adv_sync+0xb4/0x12c hci_powered_update_sync+0x54/0x70 hci_power_on_sync+0xe4/0x278 hci_set_powered_sync+0x28/0x34 set_powered_sync+0x40/0x58 hci_cmd_sync_work+0x94/0x100 process_one_work+0x168/0x444 worker_thread+0x378/0x3f4 kthread+0x108/0x10c ret_from_fork+0x10/0x20 Link: https://github.com/bluez/bluez/issues/1442 Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 2b4f21fbf9c1..7397b6b50ccb 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3344,7 +3344,7 @@ static int hci_powered_update_adv_sync(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); From ca88be1a2725a42f8dbad579181611d9dcca8e88 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:43:18 -0400 Subject: [PATCH 140/246] Bluetooth: hci_sync: Fix scan state after PA Sync has been established Passive scanning is used to program the address of the peer to be synchronized, so once HCI_EV_LE_PA_SYNC_ESTABLISHED is received it needs to be updated after clearing HCI_PA_SYNC then call hci_update_passive_scan_sync to return it to its original state. Fixes: 6d0417e4e1cf ("Bluetooth: hci_conn: Fix not setting conn_timeout for Broadcast Receiver") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7397b6b50ccb..387c128f2ba0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6985,8 +6985,6 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - if (!hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); @@ -7080,6 +7078,11 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + /* Update passive scan since HCI_PA_SYNC flag has been cleared */ + hci_update_passive_scan_sync(hdev); + return err; } From aee29c18a38d479c2f058c9b6a39b0527cf81d10 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:36:27 -0400 Subject: [PATCH 141/246] Bluetooth: ISO: Fix getname not returning broadcast fields getname shall return iso_bc fields for both BIS_LINK and PA_LINK since the likes of bluetoothd do use the getpeername to retrieve the SID both when enumerating the broadcasters and when synchronizing. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7bd3aa0a6db9..eaffd25570e3 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1347,7 +1347,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; - if (hcon && hcon->type == BIS_LINK) { + if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) { sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, From d36349ea73d805bb72cbc24ab90cb1da4ad5c379 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 28 Jul 2025 13:51:01 -0400 Subject: [PATCH 142/246] Bluetooth: hci_conn: Fix running bis_cleanup for hci_conn->type PA_LINK Connections with type of PA_LINK shall be considered temporary just to track the lifetime of PA Sync setup, once the BIG Sync is established and connection are created with BIS_LINK the existing PA_LINK connection shall not longer use bis_cleanup otherwise it terminates the PA Sync when that shall be left to BIS_LINK connection to do it. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 12 +++++++++++- net/bluetooth/hci_event.c | 7 ++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7d1e79f69cd1..f8b20b609a03 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -830,7 +830,17 @@ static void bis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ - bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECTED, + HCI_ROLE_MASTER); + if (bis) + return; + + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECT, + HCI_ROLE_MASTER); if (bis) return; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8aa5039b975a..4f0a6116291e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6957,9 +6957,14 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, continue; } - if (ev->status != 0x42) + if (ev->status != 0x42) { /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); + /* Reset cleanup callback of PA Sync so it doesn't + * terminate the sync when deleting the connection. + */ + conn->cleanup = NULL; + } bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; From 3ba486c5f3ce2c22ffd29c0103404cdbe21912b3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 29 Jul 2025 12:11:09 -0400 Subject: [PATCH 143/246] Bluetooth: hci_conn: Fix not cleaning up Broadcaster/Broadcast Source This fixes Broadcaster/Broadcast Source not sending HCI_OP_LE_TERM_BIG because HCI_CONN_PER_ADV where not being set. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f8b20b609a03..ab6fe5b0cc0f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2259,7 +2259,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ From 099799fa9b76c5c02b49e07005a85117a25b01ea Mon Sep 17 00:00:00 2001 From: Jiande Lu Date: Thu, 24 Jul 2025 16:51:17 +0800 Subject: [PATCH 144/246] Bluetooth: btmtk: Fix wait_on_bit_timeout interruption during shutdown During the shutdown process, an interrupt occurs that prematurely terminates the wait for the expected event. This change replaces TASK_INTERRUPTIBLE with TASK_UNINTERRUPTIBLE in the wait_on_bit_timeout call to ensure the shutdown process completes as intended without being interrupted by signals. Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c") Signed-off-by: Jiande Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 4390fd571dbd..a8c520dc09e1 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -642,12 +642,7 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, * WMT command. */ err = wait_on_bit_timeout(&data->flags, BTMTK_TX_WAIT_VND_EVT, - TASK_INTERRUPTIBLE, HCI_INIT_TIMEOUT); - if (err == -EINTR) { - bt_dev_err(hdev, "Execution of wmt command interrupted"); - clear_bit(BTMTK_TX_WAIT_VND_EVT, &data->flags); - goto err_free_wc; - } + TASK_UNINTERRUPTIBLE, HCI_INIT_TIMEOUT); if (err) { bt_dev_err(hdev, "Execution of wmt command timed out"); From 709788b154caf042874d765628ffa860f0bb0d1e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 09:54:05 -0400 Subject: [PATCH 145/246] Bluetooth: hci_core: Fix using {cis,bis}_capable for current settings {cis,bis}_capable only indicates the controller supports the feature since it doesn't check that LE is enabled so it shall not be used for current setting, instead this introduces {cis,bis}_enabled macros that can be used to indicate that these features are currently enabled. Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Fixes: ae7533613133 ("Bluetooth: Check for ISO support in controller") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++-- include/net/bluetooth/hci_core.h | 13 ++++++++++++- net/bluetooth/hci_sync.c | 4 ++-- net/bluetooth/iso.c | 14 +++++++------- net/bluetooth/mgmt.c | 10 +++++----- 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ada5b56a4413..e5751f3070b8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -647,7 +647,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -659,7 +659,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 4dc11c66f7b8..bc29f2e2e16f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1915,6 +1915,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1981,14 +1983,23 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 387c128f2ba0..aa7d7a8ec3ee 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4531,14 +4531,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; - if (!cis_capable(hdev)) + if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; - cp.bit_value = 1; + cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index eaffd25570e3..5ce823ca3aaf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2483,11 +2483,11 @@ static const struct net_proto_family iso_sock_family_ops = { .create = iso_sock_create, }; -static bool iso_inited; +static bool inited; -bool iso_enabled(void) +bool iso_inited(void) { - return iso_inited; + return inited; } int iso_init(void) @@ -2496,7 +2496,7 @@ int iso_init(void) BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); - if (iso_inited) + if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); @@ -2524,7 +2524,7 @@ int iso_init(void) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - iso_inited = true; + inited = true; return 0; @@ -2535,7 +2535,7 @@ error: int iso_exit(void) { - if (!iso_inited) + if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); @@ -2549,7 +2549,7 @@ int iso_exit(void) proto_unregister(&iso_proto); - iso_inited = false; + inited = false; return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1ce682038b51..c42dffe77daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -922,16 +922,16 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; - if (cis_central_capable(hdev)) + if (cis_central_enabled(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; - if (cis_peripheral_capable(hdev)) + if (cis_peripheral_enabled(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; - if (bis_capable(hdev)) + if (bis_enabled(hdev)) settings |= MGMT_SETTING_ISO_BROADCASTER; - if (sync_recv_capable(hdev)) + if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; if (ll_privacy_capable(hdev)) @@ -4513,7 +4513,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } if (IS_ENABLED(CONFIG_BT_LE)) { - flags = iso_enabled() ? BIT(0) : 0; + flags = iso_inited() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; From 3dcf7175f2c04bd3a7d50db3fa42a0bd933b6e23 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 14:05:03 -0400 Subject: [PATCH 146/246] Bluetooth: hci_core: Fix using ll_privacy_capable for current settings ll_privacy_capable only indicates that the controller supports the feature but it doesnt' check that LE is enabled so it end up being marked as active in the current settings when it shouldn't. Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bc29f2e2e16f..bb30bde6f0e8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1934,6 +1934,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ ((dev)->commands[39] & 0x04)) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c42dffe77daf..3166f5fb876b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -934,7 +934,7 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; - if (ll_privacy_capable(hdev)) + if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; return settings; From 4d19cd228bbe8ff84a63fe7b11bc756b4b4370c7 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 7 Aug 2025 15:56:03 +0800 Subject: [PATCH 147/246] Bluetooth: hci_sync: Prevent unintended PA sync when SID is 0xFF After LE Extended Scan times out, conn->sid remains 0xFF, so the PA sync creation process should be aborted. Btmon snippet from PA sync with SID=0xFF: < HCI Command: LE Set Extended.. (0x08|0x0042) plen 6 #74726 [hci0] 863.107927 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 #74727 [hci0] 863.109389 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Success (0x00) < HCI Command: LE Periodic Ad.. (0x08|0x0044) plen 14 #74728 [hci0] 865.141168 Options: 0x0000 Use advertising SID, Advertiser Address Type and address Reporting initially enabled SID: 0xff Adv address type: Random (0x01) Adv address: 0D:D7:2C:E7:42:46 (Non-Resolvable) Skip: 0x0000 Sync timeout: 20000 msec (0x07d0) Sync CTE type: 0x0000 > HCI Event: Command Status (0x0f) plen 4 #74729 [hci0] 865.143223 LE Periodic Advertising Create Sync (0x08|0x0044) ncmd 1 Status: Success (0x00) Fixes: e2d471b7806b ("Bluetooth: ISO: Fix not using SID from adv report") Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index aa7d7a8ec3ee..31d72b9683ef 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7045,10 +7045,13 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. */ - if (conn->sid == HCI_SID_INVALID) - __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, - HCI_EV_LE_EXT_ADV_REPORT, - conn->conn_timeout, NULL); + if (conn->sid == HCI_SID_INVALID) { + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + goto done; + } memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; @@ -7078,6 +7081,7 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); +done: hci_dev_clear_flag(hdev, HCI_PA_SYNC); /* Update passive scan since HCI_PA_SYNC flag has been cleared */ From 0b3725dbf61b51e7c663834811b3691157ae17d6 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 9 Aug 2025 11:36:20 +0300 Subject: [PATCH 148/246] Bluetooth: hci_event: fix MTU for BN == 0 in CIS Established BN == 0x00 in CIS Established means no isochronous data for the corresponding direction (Core v6.1 pp. 2394). In this case SDU MTU should be 0. However, the specification does not say the Max_PDU_C_To_P or P_To_C are then zero. Intel AX210 in Framed CIS mode sets nonzero Max_PDU for direction with zero BN. This causes failure later when we try to LE Setup ISO Data Path for disabled direction, which is disallowed (Core v6.1 pp. 2750). Fix by setting SDU MTU to 0 if BN == 0. Fixes: 2be22f1941d5f ("Bluetooth: hci_event: Fix parsing of CIS Established Event") Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4f0a6116291e..fe7cdd67ad2a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6745,8 +6745,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.in.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.out.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.in.phy = ev->c_phy; qos->ucast.out.phy = ev->p_phy; break; @@ -6760,8 +6760,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.out.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.in.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.out.phy = ev->c_phy; qos->ucast.in.phy = ev->p_phy; break; From b5ca88927e353185b3d9ac4362d33e5aeb25771f Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Thu, 14 Aug 2025 17:54:28 -0600 Subject: [PATCH 149/246] fhandle: do_handle_open() should get FD with user flags In f07c7cc4684a, do_handle_open() was switched to use the automatic cleanup method for getting a FD. In that change it was also switched to pass O_CLOEXEC unconditionally to get_unused_fd_flags() instead of passing the user-specified flags. I don't see anything in that commit description that indicates this was intentional, so I am assuming it was an oversight. With this fix, the FD will again be opened with, or without, O_CLOEXEC according to what the user requested. Fixes: f07c7cc4684a ("fhandle: simplify error handling") Signed-off-by: Thomas Bertschinger Link: https://lore.kernel.org/20250814235431.995876-4-tahbertschinger@gmail.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 7c236f64cdea..68a7d2861c58 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -402,7 +402,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, if (retval) return retval; - CLASS(get_unused_fd, fd)(O_CLOEXEC); + CLASS(get_unused_fd, fd)(open_flag); if (fd < 0) return fd; From a3de58b12ce074ec05b8741fa28d62ccb1070468 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Aug 2025 22:45:50 +0100 Subject: [PATCH 150/246] netfs: Fix unbuffered write error handling If all the subrequests in an unbuffered write stream fail, the subrequest collector doesn't update the stream->transferred value and it retains its initial LONG_MAX value. Unfortunately, if all active streams fail, then we take the smallest value of { LONG_MAX, LONG_MAX, ... } as the value to set in wreq->transferred - which is then returned from ->write_iter(). LONG_MAX was chosen as the initial value so that all the streams can be quickly assessed by taking the smallest value of all stream->transferred - but this only works if we've set any of them. Fix this by adding a flag to indicate whether the value in stream->transferred is valid and checking that when we integrate the values. stream->transferred can then be initialised to zero. This was found by running the generic/750 xfstest against cifs with cache=none. It splices data to the target file. Once (if) it has used up all the available scratch space, the writes start failing with ENOSPC. This causes ->write_iter() to fail. However, it was returning wreq->transferred, i.e. LONG_MAX, rather than an error (because it thought the amount transferred was non-zero) and iter_file_splice_write() would then try to clean up that amount of pipe bufferage - leading to an oops when it overran. The kernel log showed: CIFS: VFS: Send error in write = -28 followed by: BUG: kernel NULL pointer dereference, address: 0000000000000008 with: RIP: 0010:iter_file_splice_write+0x3a4/0x520 do_splice+0x197/0x4e0 or: RIP: 0010:pipe_buf_release (include/linux/pipe_fs_i.h:282) iter_file_splice_write (fs/splice.c:755) Also put a warning check into splice to announce if ->write_iter() returned that it had written more than it was asked to. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Reported-by: Xiaoli Feng Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220445 Signed-off-by: David Howells Link: https://lore.kernel.org/915443.1755207950@warthog.procyon.org.uk cc: Paulo Alcantara cc: Steve French cc: Shyam Prasad N cc: netfs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 4 +++- fs/netfs/write_collect.c | 10 ++++++++-- fs/netfs/write_issue.c | 4 ++-- fs/splice.c | 3 +++ include/linux/netfs.h | 1 + 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 3e804da1e1eb..a95e7aadafd0 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -281,8 +281,10 @@ reassess: } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) { notes |= MADE_PROGRESS; } else { - if (!stream->failed) + if (!stream->failed) { stream->transferred += transferred; + stream->transferred_valid = true; + } if (front->transferred < front->len) set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags); notes |= MADE_PROGRESS; diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 0f3a36852a4d..cbf3d9194c7b 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -254,6 +254,7 @@ reassess_streams: if (front->start + front->transferred > stream->collected_to) { stream->collected_to = front->start + front->transferred; stream->transferred = stream->collected_to - wreq->start; + stream->transferred_valid = true; notes |= MADE_PROGRESS; } if (test_bit(NETFS_SREQ_FAILED, &front->flags)) { @@ -356,6 +357,7 @@ bool netfs_write_collection(struct netfs_io_request *wreq) { struct netfs_inode *ictx = netfs_inode(wreq->inode); size_t transferred; + bool transferred_valid = false; int s; _enter("R=%x", wreq->debug_id); @@ -376,12 +378,16 @@ bool netfs_write_collection(struct netfs_io_request *wreq) continue; if (!list_empty(&stream->subrequests)) return false; - if (stream->transferred < transferred) + if (stream->transferred_valid && + stream->transferred < transferred) { transferred = stream->transferred; + transferred_valid = true; + } } /* Okay, declare that all I/O is complete. */ - wreq->transferred = transferred; + if (transferred_valid) + wreq->transferred = transferred; trace_netfs_rreq(wreq, netfs_rreq_trace_write_done); if (wreq->io_streams[1].active && diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 50bee2c4130d..0584cba1a043 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -118,12 +118,12 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, wreq->io_streams[0].prepare_write = ictx->ops->prepare_write; wreq->io_streams[0].issue_write = ictx->ops->issue_write; wreq->io_streams[0].collected_to = start; - wreq->io_streams[0].transferred = LONG_MAX; + wreq->io_streams[0].transferred = 0; wreq->io_streams[1].stream_nr = 1; wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE; wreq->io_streams[1].collected_to = start; - wreq->io_streams[1].transferred = LONG_MAX; + wreq->io_streams[1].transferred = 0; if (fscache_resources_valid(&wreq->cache_resources)) { wreq->io_streams[1].avail = true; wreq->io_streams[1].active = true; diff --git a/fs/splice.c b/fs/splice.c index 4d6df083e0c0..f5094b6d00a0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -739,6 +739,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, sd.pos = kiocb.ki_pos; if (ret <= 0) break; + WARN_ONCE(ret > sd.total_len - left, + "Splice Exceeded! ret=%zd tot=%zu left=%zu\n", + ret, sd.total_len, left); sd.num_spliced += ret; sd.total_len -= ret; diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 185bd8196503..98c96d649bf9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -150,6 +150,7 @@ struct netfs_io_stream { bool active; /* T if stream is active */ bool need_retry; /* T if this stream needs retrying */ bool failed; /* T if this stream failed */ + bool transferred_valid; /* T is ->transferred is valid */ }; /* From 0b2d71a7c82628bb36fd43e80193bcc2693c239a Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Thu, 14 Aug 2025 17:44:53 +0800 Subject: [PATCH 151/246] pidfs: Fix memory leak in pidfd_info() After running the program 'ioctl_pidfd03' of Linux Test Project (LTP) or the program 'pidfd_info_test' in 'tools/testing/selftests/pidfd' of the kernel source, kmemleak reports the following memory leaks: # cat /sys/kernel/debug/kmemleak unreferenced object 0xff110020e5988000 (size 8216): comm "ioctl_pidfd03", pid 10853, jiffies 4294800031 hex dump (first 32 bytes): 02 40 00 00 00 00 00 00 10 00 00 00 00 00 00 00 .@.............. 00 00 00 00 af 01 00 00 80 00 00 00 00 00 00 00 ................ backtrace (crc 69483047): kmem_cache_alloc_node_noprof+0x2fb/0x410 copy_process+0x178/0x1740 kernel_clone+0x99/0x3b0 __do_sys_clone3+0xbe/0x100 do_syscall_64+0x7b/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ... unreferenced object 0xff11002097b70000 (size 8216): comm "pidfd_info_test", pid 11840, jiffies 4294889165 hex dump (first 32 bytes): 06 40 00 00 00 00 00 00 10 00 00 00 00 00 00 00 .@.............. 00 00 00 00 b5 00 00 00 80 00 00 00 00 00 00 00 ................ backtrace (crc a6286bb7): kmem_cache_alloc_node_noprof+0x2fb/0x410 copy_process+0x178/0x1740 kernel_clone+0x99/0x3b0 __do_sys_clone3+0xbe/0x100 do_syscall_64+0x7b/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ... The leak occurs because pidfd_info() obtains a task_struct via get_pid_task() but never calls put_task_struct() to drop the reference, leaving task->usage unbalanced. Fix the issue by adding '__free(put_task) = NULL' to the local variable 'task', ensuring that put_task_struct() is automatically invoked when the variable goes out of scope. Fixes: 7477d7dce48a ("pidfs: allow to retrieve exit information") Signed-off-by: Adrian Huang (Lenovo) Link: https://lore.kernel.org/20250814094453.15232-1-adrianhuang0701@gmail.com Signed-off-by: Christian Brauner --- fs/pidfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..108e7527f837 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -296,12 +296,12 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags) static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + struct task_struct *task __free(put_task) = NULL; struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; - struct task_struct *task; struct pidfs_attr *attr; const struct cred *c; __u64 mask; From 0eaf7c7e85da7495c0e03a99375707fc954f5e7b Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Tue, 5 Aug 2025 22:14:51 +0300 Subject: [PATCH 152/246] Bluetooth: hci_conn: do return error from hci_enhanced_setup_sync() The commit e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") missed to update the *return* statement under the *case* of BT_CODEC_TRANSPARENT in hci_enhanced_setup_sync(), which led to returning success (0) instead of the negative error code (-EINVAL). However, the result of hci_enhanced_setup_sync() seems to be ignored anyway, since NULL gets passed to hci_cmd_sync_queue() as the last argument in that case and the only function interested in that result is specified by that argument. Fixes: e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") Signed-off-by: Sergey Shtylyov Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ab6fe5b0cc0f..7a879290dd28 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -339,7 +339,8 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) - return false; + return -EINVAL; + param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; From e489317d2fd9a51a81bdcbe15a73ddde8246e6d6 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Mon, 4 Aug 2025 16:00:15 +0530 Subject: [PATCH 153/246] Bluetooth: btnxpuart: Uses threaded IRQ for host wakeup handling This replaces devm_request_irq() with devm_request_threaded_irq(). On iMX93 11x11 EVK platform, the BT chip's BT_WAKE_OUT pin is connected to an I2C GPIO expander instead of directly been connected to iMX GPIO. When I2C GPIO expander's (PCAL6524) host driver receives an interrupt on it's INTR line, the driver's interrupt handler needs to query the interrupt source with PCAL6524 first, and then call the actual interrupt handler, in this case the IRQ handler in BTNXPUART. In order to handle interrupts when such I2C GPIO expanders are between the host and interrupt source, devm_request_threaded_irq() is needed. This commit also removes the IRQF_TRIGGER_FALLING flag, to allow setting the IRQ trigger type from the device tree setting instead of hardcoding in the driver. Signed-off-by: Neeraj Sanjay Kale Reviewed-by: Sherry Sun Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btnxpuart.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 73a4a325c867..76e7f857fb7d 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -543,10 +543,10 @@ static int ps_setup(struct hci_dev *hdev) } if (psdata->wakeup_source) { - ret = devm_request_irq(&serdev->dev, psdata->irq_handler, - ps_host_wakeup_irq_handler, - IRQF_ONESHOT | IRQF_TRIGGER_FALLING, - dev_name(&serdev->dev), nxpdev); + ret = devm_request_threaded_irq(&serdev->dev, psdata->irq_handler, + NULL, ps_host_wakeup_irq_handler, + IRQF_ONESHOT, + dev_name(&serdev->dev), nxpdev); if (ret) bt_dev_info(hdev, "error setting wakeup IRQ handler, ignoring\n"); disable_irq(psdata->irq_handler); From 9d4b01a0bf8d2163ae129c9c537cb0753ad5a2aa Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 14 Aug 2025 11:57:19 -0400 Subject: [PATCH 154/246] Bluetooth: hci_core: Fix not accounting for BIS/CIS/PA links separately This fixes the likes of hci_conn_num(CIS_LINK) returning the total of ISO connection which includes BIS_LINK as well, so this splits the iso_num into each link type and introduces hci_iso_num that can be used in places where the total number of ISO connection still needs to be used. Fixes: 23205562ffc8 ("Bluetooth: separate CIS_LINK and BIS_LINK link types") Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bb30bde6f0e8..6906af7a8f24 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -129,7 +129,9 @@ struct hci_conn_hash { struct list_head list; unsigned int acl_num; unsigned int sco_num; - unsigned int iso_num; + unsigned int cis_num; + unsigned int bis_num; + unsigned int pa_num; unsigned int le_num; unsigned int le_num_peripheral; }; @@ -1014,9 +1016,13 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) h->sco_num++; break; case CIS_LINK: + h->cis_num++; + break; case BIS_LINK: + h->bis_num++; + break; case PA_LINK: - h->iso_num++; + h->pa_num++; break; } } @@ -1042,9 +1048,13 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) h->sco_num--; break; case CIS_LINK: + h->cis_num--; + break; case BIS_LINK: + h->bis_num--; + break; case PA_LINK: - h->iso_num--; + h->pa_num--; break; } } @@ -1061,9 +1071,11 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case ESCO_LINK: return h->sco_num; case CIS_LINK: + return h->cis_num; case BIS_LINK: + return h->bis_num; case PA_LINK: - return h->iso_num; + return h->pa_num; default: return 0; } @@ -1073,7 +1085,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->sco_num + c->le_num + c->iso_num; + return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num + + c->pa_num; +} + +static inline unsigned int hci_iso_count(struct hci_dev *hdev) +{ + struct hci_conn_hash *c = &hdev->conn_hash; + + return c->cis_num + c->bis_num; } static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) From c08ba63078dd6046c279df37795cb77e784e1ec9 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 16 Jul 2025 15:41:35 -0500 Subject: [PATCH 155/246] virt: sev-guest: Satisfy linear mapping requirement in get_derived_key() Commit 7ffeb2fc2670 ("x86/sev: Document requirement for linear mapping of guest request buffers") added a check that requires the guest request buffers to be in the linear mapping. The get_derived_key() function was passing a buffer that was allocated on the stack, resulting in the call to snp_send_guest_request() returning an error. Update the get_derived_key() function to use an allocated buffer instead of a stack buffer. Fixes: 7ffeb2fc2670 ("x86/sev: Document requirement for linear mapping of guest request buffers") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Cc: Link: https://lore.kernel.org/9b764ca9fc79199a091aac684c4926e2080ca7a8.1752698495.git.thomas.lendacky@amd.com --- drivers/virt/coco/sev-guest/sev-guest.c | 27 +++++++++++-------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c index d2b3ae7113ab..b01ec99106cd 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.c +++ b/drivers/virt/coco/sev-guest/sev-guest.c @@ -116,13 +116,11 @@ e_free: static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_request_ioctl *arg) { + struct snp_derived_key_resp *derived_key_resp __free(kfree) = NULL; struct snp_derived_key_req *derived_key_req __free(kfree) = NULL; - struct snp_derived_key_resp derived_key_resp = {0}; struct snp_msg_desc *mdesc = snp_dev->msg_desc; struct snp_guest_req req = {}; int rc, resp_len; - /* Response data is 64 bytes and max authsize for GCM is 16 bytes. */ - u8 buf[64 + 16]; if (!arg->req_data || !arg->resp_data) return -EINVAL; @@ -132,8 +130,9 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque * response payload. Make sure that it has enough space to cover the * authtag. */ - resp_len = sizeof(derived_key_resp.data) + mdesc->ctx->authsize; - if (sizeof(buf) < resp_len) + resp_len = sizeof(derived_key_resp->data) + mdesc->ctx->authsize; + derived_key_resp = kzalloc(resp_len, GFP_KERNEL_ACCOUNT); + if (!derived_key_resp) return -ENOMEM; derived_key_req = kzalloc(sizeof(*derived_key_req), GFP_KERNEL_ACCOUNT); @@ -149,23 +148,21 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque req.vmpck_id = mdesc->vmpck_id; req.req_buf = derived_key_req; req.req_sz = sizeof(*derived_key_req); - req.resp_buf = buf; + req.resp_buf = derived_key_resp; req.resp_sz = resp_len; req.exit_code = SVM_VMGEXIT_GUEST_REQUEST; rc = snp_send_guest_request(mdesc, &req); arg->exitinfo2 = req.exitinfo2; - if (rc) - return rc; - - memcpy(derived_key_resp.data, buf, sizeof(derived_key_resp.data)); - if (copy_to_user((void __user *)arg->resp_data, &derived_key_resp, - sizeof(derived_key_resp))) - rc = -EFAULT; + if (!rc) { + if (copy_to_user((void __user *)arg->resp_data, derived_key_resp, + sizeof(derived_key_resp->data))) + rc = -EFAULT; + } /* The response buffer contains the sensitive data, explicitly clear it. */ - memzero_explicit(buf, sizeof(buf)); - memzero_explicit(&derived_key_resp, sizeof(derived_key_resp)); + memzero_explicit(derived_key_resp, sizeof(*derived_key_resp)); + return rc; } From 3ee9cebd0a5e7ea47eb35cec95eaa1a866af982d Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 13 Aug 2025 10:26:59 -0500 Subject: [PATCH 156/246] x86/sev: Ensure SVSM reserved fields in a page validation entry are initialized to zero In order to support future versions of the SVSM_CORE_PVALIDATE call, all reserved fields within a PVALIDATE entry must be set to zero as an SVSM should be ensuring all reserved fields are zero in order to support future usage of reserved areas based on the protocol version. Fixes: fcd042e86422 ("x86/sev: Perform PVALIDATE using the SVSM when not at VMPL0") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Joerg Roedel Cc: Link: https://lore.kernel.org/7cde412f8b057ea13a646fb166b1ca023f6a5031.1755098819.git.thomas.lendacky@amd.com --- arch/x86/boot/startup/sev-shared.c | 1 + arch/x86/coco/sev/core.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index 7a706db87b93..4ab0dbd043c6 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -785,6 +785,7 @@ static void __head svsm_pval_4k_page(unsigned long paddr, bool validate) pc->entry[0].page_size = RMP_PG_SIZE_4K; pc->entry[0].action = validate; pc->entry[0].ignore_cf = 0; + pc->entry[0].rsvd = 0; pc->entry[0].pfn = paddr >> PAGE_SHIFT; /* Protocol 0, Call ID 1 */ diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index fc59ce78c477..43ecc6b9fb9c 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -227,6 +227,7 @@ static u64 svsm_build_ca_from_pfn_range(u64 pfn, u64 pfn_end, bool action, pe->page_size = RMP_PG_SIZE_4K; pe->action = action; pe->ignore_cf = 0; + pe->rsvd = 0; pe->pfn = pfn; pe++; @@ -257,6 +258,7 @@ static int svsm_build_ca_from_psc_desc(struct snp_psc_desc *desc, unsigned int d pe->page_size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; pe->action = e->operation == SNP_PAGE_STATE_PRIVATE; pe->ignore_cf = 0; + pe->rsvd = 0; pe->pfn = e->gfn; pe++; From ed6c4b657bca3b39f7b11cba1405931aeb490f3d Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Fri, 15 Aug 2025 09:01:54 +0200 Subject: [PATCH 157/246] x86/cpuid: Remove transitional header All CPUID call sites were updated at commit: 968e30006807 ("x86/cpuid: Set as the main CPUID header") to include instead of . The header was still retained as a wrapper, just in case some new code in -next started using it. Now that everything is merged to Linus' tree, remove the header. Signed-off-by: Ahmed S. Darwish Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250815070227.19981-2-darwi@linutronix.de --- arch/x86/include/asm/cpuid.h | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 arch/x86/include/asm/cpuid.h diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h deleted file mode 100644 index d5749b25fa10..000000000000 --- a/arch/x86/include/asm/cpuid.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _ASM_X86_CPUID_H -#define _ASM_X86_CPUID_H - -#include - -#endif /* _ASM_X86_CPUID_H */ From f604d3aaf64ff0d90cc875295474d3abf4155629 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Aug 2025 15:06:40 +0200 Subject: [PATCH 158/246] mlxsw: spectrum: Forward packets with an IPv4 link-local source IP By default, the device does not forward IPv4 packets with a link-local source IP (i.e., 169.254.0.0/16). This behavior does not align with the kernel which does forward them. Fix by instructing the device to forward such packets instead of dropping them. Fixes: ca360db4b825 ("mlxsw: spectrum: Disable DIP_LINK_LOCAL check in hardware pipeline") Reported-by: Zoey Mertes Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://patch.msgid.link/6721e6b2c96feb80269e72ce8d0b426e2f32d99c.1755174341.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 618957d65663..9a2d64a0a858 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2375,6 +2375,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ROUTER_EXP, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD, ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_LINK_LOCAL, FORWARD, + ROUTER_EXP, false), /* Multicast Router Traps */ MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 80ee5c4825dc..9962dc157901 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -94,6 +94,7 @@ enum { MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_SIP_BC = 0x16A, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_DIP_LOCAL_NET = 0x16B, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LINK_LOCAL = 0x16C, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_LINK_LOCAL = 0x16D, MLXSW_TRAP_ID_DISCARD_ROUTER_IRIF_EN = 0x178, MLXSW_TRAP_ID_DISCARD_ROUTER_ERIF_EN = 0x179, MLXSW_TRAP_ID_DISCARD_ROUTER_LPM4 = 0x17B, From 5e0b2177bdba99c2487480e9864825f742b684ee Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Aug 2025 15:06:41 +0200 Subject: [PATCH 159/246] selftest: forwarding: router: Add a test case for IPv4 link-local source IP Add a test case which checks that packets with an IPv4 link-local source IP are forwarded and not dropped. Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Link: https://patch.msgid.link/3c2e0b17d99530f57bef5ddff9af284fa0c9b667.1755174341.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../selftests/net/forwarding/router.sh | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh index b98ea9449b8b..dfb6646cb97b 100755 --- a/tools/testing/selftests/net/forwarding/router.sh +++ b/tools/testing/selftests/net/forwarding/router.sh @@ -18,6 +18,8 @@ # | 2001:db8:1::1/64 2001:db8:2::1/64 | # | | # +-----------------------------------------------------------------+ +# +#shellcheck disable=SC2034 # SC doesn't see our uses of global variables ALL_TESTS=" ping_ipv4 @@ -27,6 +29,7 @@ ALL_TESTS=" ipv4_sip_equal_dip ipv6_sip_equal_dip ipv4_dip_link_local + ipv4_sip_link_local " NUM_NETIFS=4 @@ -330,6 +333,32 @@ ipv4_dip_link_local() tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower } +ipv4_sip_link_local() +{ + local sip=169.254.1.1 + + RET=0 + + # Disable rpfilter to prevent packets to be dropped because of it. + sysctl_set net.ipv4.conf.all.rp_filter 0 + sysctl_set net.ipv4.conf."$rp1".rp_filter 0 + + tc filter add dev "$rp2" egress protocol ip pref 1 handle 101 \ + flower src_ip "$sip" action pass + + $MZ "$h1" -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b "$rp1mac" \ + -A "$sip" -B 198.51.100.2 -q + + tc_check_packets "dev $rp2 egress" 101 5 + check_err $? "Packets were dropped" + + log_test "IPv4 source IP is link-local" + + tc filter del dev "$rp2" egress protocol ip pref 1 handle 101 flower + sysctl_restore net.ipv4.conf."$rp1".rp_filter + sysctl_restore net.ipv4.conf.all.rp_filter +} + trap cleanup EXIT setup_prepare From 12da2b92ad50e6602b4c5e9073d71f2368b70b63 Mon Sep 17 00:00:00 2001 From: Chandra Mohan Sundar Date: Thu, 14 Aug 2025 22:00:10 +0530 Subject: [PATCH 160/246] net: libwx: Fix the size in RSS hash key population While trying to fill a random RSS key, the size of the pointer is being used rather than the actual size of the RSS key. Fix by passing an appropriate value of the RSS key. This issue was reported by static coverity analyser. Fixes: eb4898fde1de8 ("net: libwx: add wangxun vf common api") Signed-off-by: Chandra Mohan Sundar Link: https://patch.msgid.link/20250814163014.613004-1-chandramohan.explore@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c index 5d48df7a849f..3023ea2732ef 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c @@ -192,7 +192,7 @@ void wx_setup_vfmrqc_vf(struct wx *wx) u8 i, j; /* Fill out hash function seeds */ - netdev_rss_key_fill(wx->rss_key, sizeof(wx->rss_key)); + netdev_rss_key_fill(wx->rss_key, WX_RSS_KEY_SIZE); for (i = 0; i < WX_RSS_KEY_SIZE / 4; i++) wr32(wx, WX_VXRSSRK(i), wx->rss_key[i]); From 715c7a36d59f54162a26fac1d1ed8dc087a24cf1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Aug 2025 12:43:23 -0700 Subject: [PATCH 161/246] selftests: tls: make the new data_steal test less flaky The CI has hit a couple of cases of: RUN global.data_steal ... tls.c:2762:data_steal:Expected recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT) (20000) == -1 (-1) data_steal: Test terminated by timeout FAIL global.data_steal Looks like the 2msec sleep is not long enough. Make the sleep longer, and then instead of second sleep wait for the thieving process to exit. That way we can be sure it called recv() before us. While at it also avoid trying to steal more than a record, this seems to be causing issues in manual testing as well. Fixes: d7e82594a45c ("selftests: tls: test TCP stealing data from under the TLS socket") Link: https://patch.msgid.link/20250814194323.2014650-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index d8cfcf9bb825..2b8387a83bc7 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -2748,17 +2748,18 @@ TEST(data_steal) { pid = fork(); ASSERT_GE(pid, 0); if (!pid) { - EXPECT_EQ(recv(cfd, buf, sizeof(buf), MSG_WAITALL), - sizeof(buf)); + EXPECT_EQ(recv(cfd, buf, sizeof(buf) / 2, MSG_WAITALL), + sizeof(buf) / 2); exit(!__test_passed(_metadata)); } - usleep(2000); + usleep(10000); ASSERT_EQ(setsockopt(fd, SOL_TLS, TLS_TX, &tls, tls.len), 0); ASSERT_EQ(setsockopt(cfd, SOL_TLS, TLS_RX, &tls, tls.len), 0); EXPECT_EQ(send(fd, buf, sizeof(buf), 0), sizeof(buf)); - usleep(2000); + EXPECT_EQ(wait(&status), pid); + EXPECT_EQ(status, 0); EXPECT_EQ(recv(cfd, buf2, sizeof(buf2), MSG_DONTWAIT), -1); /* Don't check errno, the error will be different depending * on what random bytes TLS interpreted as the record length. @@ -2766,9 +2767,6 @@ TEST(data_steal) { close(fd); close(cfd); - - EXPECT_EQ(wait(&status), pid); - EXPECT_EQ(status, 0); } static void __attribute__((constructor)) fips_check(void) { From c17b750b3ad9f45f2b6f7e6f7f4679844244f0b9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Aug 2025 15:22:10 -0700 Subject: [PATCH 162/246] Linux 6.17-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6bfe776bf3c5..d1adb78c3596 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* From bac7b996d42e458a94578f4227795a0d4deef6fa Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Tue, 12 Aug 2025 18:45:46 +0200 Subject: [PATCH 163/246] smb: server: split ksmbd_rdma_stop_listening() out of ksmbd_rdma_destroy() We can't call destroy_workqueue(smb_direct_wq); before stop_sessions()! Otherwise already existing connections try to use smb_direct_wq as a NULL pointer. Cc: Namjae Jeon Cc: Steve French Cc: Tom Talpey Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Stefan Metzmacher Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 3 ++- fs/smb/server/transport_rdma.c | 5 ++++- fs/smb/server/transport_rdma.h | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 3f04a2977ba8..67c4f73398df 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -504,7 +504,8 @@ void ksmbd_conn_transport_destroy(void) { mutex_lock(&init_lock); ksmbd_tcp_destroy(); - ksmbd_rdma_destroy(); + ksmbd_rdma_stop_listening(); stop_sessions(); + ksmbd_rdma_destroy(); mutex_unlock(&init_lock); } diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 8d366db5f605..5466aa8c39b1 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -2194,7 +2194,7 @@ int ksmbd_rdma_init(void) return 0; } -void ksmbd_rdma_destroy(void) +void ksmbd_rdma_stop_listening(void) { if (!smb_direct_listener.cm_id) return; @@ -2203,7 +2203,10 @@ void ksmbd_rdma_destroy(void) rdma_destroy_id(smb_direct_listener.cm_id); smb_direct_listener.cm_id = NULL; +} +void ksmbd_rdma_destroy(void) +{ if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h index 77aee4e5c9dc..a2291b77488a 100644 --- a/fs/smb/server/transport_rdma.h +++ b/fs/smb/server/transport_rdma.h @@ -54,13 +54,15 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); +void ksmbd_rdma_stop_listening(void); void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); void init_smbd_max_io_size(unsigned int sz); unsigned int get_smbd_max_read_write_size(void); #else static inline int ksmbd_rdma_init(void) { return 0; } -static inline int ksmbd_rdma_destroy(void) { return 0; } +static inline void ksmbd_rdma_stop_listening(void) { } +static inline void ksmbd_rdma_destroy(void) { } static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } static inline void init_smbd_max_io_size(unsigned int sz) { } static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } From c0d41112f1a5828c194b59cca953114bc3776ef2 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 17 Aug 2025 09:48:40 +0900 Subject: [PATCH 164/246] ksmbd: extend the connection limiting mechanism to support IPv6 Update the connection tracking logic to handle both IPv4 and IPv6 address families. Cc: stable@vger.kernel.org Fixes: e6bb91939740 ("ksmbd: limit repeated connections from clients with the same IP") Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.h | 7 ++++++- fs/smb/server/transport_tcp.c | 26 +++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 31dd1caac1e8..2aa8084bb593 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -46,7 +46,12 @@ struct ksmbd_conn { struct mutex srv_mutex; int status; unsigned int cli_cap; - __be32 inet_addr; + union { + __be32 inet_addr; +#if IS_ENABLED(CONFIG_IPV6) + u8 inet6_addr[16]; +#endif + }; char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index b1df02e321b0..4337df97987d 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -85,7 +85,14 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return NULL; } +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16); + else + conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; +#else conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; +#endif conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops; @@ -229,7 +236,6 @@ static int ksmbd_kthread_fn(void *p) { struct socket *client_sk = NULL; struct interface *iface = (struct interface *)p; - struct inet_sock *csk_inet; struct ksmbd_conn *conn; int ret; @@ -252,13 +258,27 @@ static int ksmbd_kthread_fn(void *p) /* * Limits repeated connections from clients with the same IP. */ - csk_inet = inet_sk(client_sk->sk); down_read(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) - if (csk_inet->inet_daddr == conn->inet_addr) { +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) { + if (memcmp(&client_sk->sk->sk_v6_daddr, + &conn->inet6_addr, 16) == 0) { + ret = -EAGAIN; + break; + } + } else if (inet_sk(client_sk->sk)->inet_daddr == + conn->inet_addr) { ret = -EAGAIN; break; } +#else + if (inet_sk(client_sk->sk)->inet_daddr == + conn->inet_addr) { + ret = -EAGAIN; + break; + } +#endif up_read(&conn_list_lock); if (ret == -EAGAIN) continue; From 89bb430f621124af39bb31763c4a8b504c9651e2 Mon Sep 17 00:00:00 2001 From: Ziyan Xu Date: Sat, 16 Aug 2025 10:20:05 +0900 Subject: [PATCH 165/246] ksmbd: fix refcount leak causing resource not released When ksmbd_conn_releasing(opinfo->conn) returns true,the refcount was not decremented properly, causing a refcount leak that prevents the count from reaching zero and the memory from being released. Cc: stable@vger.kernel.org Signed-off-by: Ziyan Xu Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/oplock.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index d7a8a580d013..a04d5702820d 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1102,8 +1102,10 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - if (ksmbd_conn_releasing(opinfo->conn)) + if (ksmbd_conn_releasing(opinfo->conn)) { + opinfo_put(opinfo); continue; + } oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); @@ -1139,8 +1141,11 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) if (!atomic_inc_not_zero(&opinfo->refcount)) continue; - if (ksmbd_conn_releasing(opinfo->conn)) + if (ksmbd_conn_releasing(opinfo->conn)) { + opinfo_put(opinfo); continue; + } + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); } @@ -1343,8 +1348,10 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (!atomic_inc_not_zero(&brk_op->refcount)) continue; - if (ksmbd_conn_releasing(brk_op->conn)) + if (ksmbd_conn_releasing(brk_op->conn)) { + opinfo_put(brk_op); continue; + } if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | From 5f1c8965e748c150d580a2ea8fbee1bd80d07a24 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Aug 2025 22:11:28 +1000 Subject: [PATCH 166/246] ovl: use I_MUTEX_PARENT when locking parent in ovl_create_temp() ovl_create_temp() treats "workdir" as a parent in which it creates an object so it should use I_MUTEX_PARENT. Prior to the commit identified below the lock was taken by the caller which sometimes used I_MUTEX_PARENT and sometimes used I_MUTEX_NORMAL. The use of I_MUTEX_NORMAL was incorrect but unfortunately copied into ovl_create_temp(). Note to backporters: This patch only applies after the last Fixes given below (post v6.16). To fix the bug in v6.7 and later the inode_lock() call in ovl_copy_up_workdir() needs to nest using I_MUTEX_PARENT. Link: https://lore.kernel.org/all/67a72070.050a0220.3d72c.0022.GAE@google.com/ Cc: stable@vger.kernel.org Reported-by: syzbot+7836a68852a10ec3d790@syzkaller.appspotmail.com Tested-by: syzbot+7836a68852a10ec3d790@syzkaller.appspotmail.com Fixes: c63e56a4a652 ("ovl: do not open/llseek lower file with upper sb_writers held") Fixes: d2c995581c7c ("ovl: Call ovl_create_temp() without lock held.") Signed-off-by: NeilBrown Signed-off-by: Amir Goldstein --- fs/overlayfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 70b8687dc45e..dbd63a74df4b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -225,7 +225,7 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir, struct ovl_cattr *attr) { struct dentry *ret; - inode_lock(workdir->d_inode); + inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT); ret = ovl_create_real(ofs, workdir, ovl_lookup_temp(ofs, workdir), attr); inode_unlock(workdir->d_inode); From e8bd877fb76bb9f35253e8f41ce0c772269934dd Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 18 Aug 2025 11:23:55 +0200 Subject: [PATCH 167/246] ovl: fix possible double unlink commit 9d23967b18c6 ("ovl: simplify an error path in ovl_copy_up_workdir()") introduced the helper ovl_cleanup_unlocked(), which is later used in several following patches to re-acquire the parent inode lock and unlink a dentry that was earlier found using lookup. This helper was eventually renamed to ovl_cleanup(). The helper ovl_parent_lock() is used to re-acquire the parent inode lock. After acquiring the parent inode lock, the helper verifies that the dentry has not since been moved to another parent, but it failed to verify that the dentry wasn't unlinked from the parent. This means that now every call to ovl_cleanup() could potentially race with another thread, unlinking the dentry to be cleaned up underneath overlayfs and trigger a vfs assertion. Reported-by: syzbot+ec9fab8b7f0386b98a17@syzkaller.appspotmail.com Tested-by: syzbot+ec9fab8b7f0386b98a17@syzkaller.appspotmail.com Fixes: 9d23967b18c6 ("ovl: simplify an error path in ovl_copy_up_workdir()") Suggested-by: NeilBrown Signed-off-by: Amir Goldstein --- fs/overlayfs/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index a33115e7384c..41033bac96cb 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1552,7 +1552,8 @@ void ovl_copyattr(struct inode *inode) int ovl_parent_lock(struct dentry *parent, struct dentry *child) { inode_lock_nested(parent->d_inode, I_MUTEX_PARENT); - if (!child || child->d_parent == parent) + if (!child || + (!d_unhashed(child) && child->d_parent == parent)) return 0; inode_unlock(parent->d_inode); From 864e3396976ef41de6cc7bc366276bf4e084fff2 Mon Sep 17 00:00:00 2001 From: Jakub Ramaseuski Date: Thu, 14 Aug 2025 12:51:19 +0200 Subject: [PATCH 168/246] net: gso: Forbid IPv6 TSO with extensions on devices with only IPV6_CSUM When performing Generic Segmentation Offload (GSO) on an IPv6 packet that contains extension headers, the kernel incorrectly requests checksum offload if the egress device only advertises NETIF_F_IPV6_CSUM feature, which has a strict contract: it supports checksum offload only for plain TCP or UDP over IPv6 and explicitly does not support packets with extension headers. The current GSO logic violates this contract by failing to disable the feature for packets with extension headers, such as those used in GREoIPv6 tunnels. This violation results in the device being asked to perform an operation it cannot support, leading to a `skb_warn_bad_offload` warning and a collapse of network throughput. While device TSO/USO is correctly bypassed in favor of software GSO for these packets, the GSO stack must be explicitly told not to request checksum offload. Mask NETIF_F_IPV6_CSUM, NETIF_F_TSO6 and NETIF_F_GSO_UDP_L4 in gso_features_check if the IPv6 header contains extension headers to compute checksum in software. The exception is a BIG TCP extension, which, as stated in commit 68e068cabd2c6c53 ("net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets"): "The feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices." kernel log output (truncated): WARNING: CPU: 1 PID: 5273 at net/core/dev.c:3535 skb_warn_bad_offload+0x81/0x140 ... Call Trace: skb_checksum_help+0x12a/0x1f0 validate_xmit_skb+0x1a3/0x2d0 validate_xmit_skb_list+0x4f/0x80 sch_direct_xmit+0x1a2/0x380 __dev_xmit_skb+0x242/0x670 __dev_queue_xmit+0x3fc/0x7f0 ip6_finish_output2+0x25e/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_tnl_xmit+0x608/0xc00 [ip6_tunnel] ip6gre_tunnel_xmit+0x1c0/0x390 [ip6_gre] dev_hard_start_xmit+0x63/0x1c0 __dev_queue_xmit+0x6d0/0x7f0 ip6_finish_output2+0x214/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 inet6_csk_xmit+0xeb/0x150 __tcp_transmit_skb+0x555/0xa80 tcp_write_xmit+0x32a/0xe90 tcp_sendmsg_locked+0x437/0x1110 tcp_sendmsg+0x2f/0x50 ... skb linear: 00000000: e4 3d 1a 7d ec 30 e4 3d 1a 7e 5d 90 86 dd 60 0e skb linear: 00000010: 00 0a 1b 34 3c 40 20 11 00 00 00 00 00 00 00 00 skb linear: 00000020: 00 00 00 00 00 12 20 11 00 00 00 00 00 00 00 00 skb linear: 00000030: 00 00 00 00 00 11 2f 00 04 01 04 01 01 00 00 00 skb linear: 00000040: 86 dd 60 0e 00 0a 1b 00 06 40 20 23 00 00 00 00 skb linear: 00000050: 00 00 00 00 00 00 00 00 00 12 20 23 00 00 00 00 skb linear: 00000060: 00 00 00 00 00 00 00 00 00 11 bf 96 14 51 13 f9 skb linear: 00000070: ae 27 a0 a8 2b e3 80 18 00 40 5b 6f 00 00 01 01 skb linear: 00000080: 08 0a 42 d4 50 d5 4b 70 f8 1a Fixes: 04c20a9356f283da ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: Tianhao Zhao Suggested-by: Michal Schmidt Suggested-by: Willem de Bruijn Signed-off-by: Jakub Ramaseuski Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250814105119.1525687-1-jramaseu@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 5a3c0f40a93f..93a25d87b86b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3779,6 +3779,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. + */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } From 84967deee9d9870b15bc4c3acb50f1d401807902 Mon Sep 17 00:00:00 2001 From: Minhong He Date: Fri, 15 Aug 2025 14:38:45 +0800 Subject: [PATCH 169/246] ipv6: sr: validate HMAC algorithm ID in seg6_hmac_info_add The seg6_genl_sethmac() directly uses the algorithm ID provided by the userspace without verifying whether it is an HMAC algorithm supported by the system. If an unsupported HMAC algorithm ID is configured, packets using SRv6 HMAC will be dropped during encapsulation or decapsulation. Fixes: 4f4853dc1c9c ("ipv6: sr: implement API to control SR HMAC structure") Signed-off-by: Minhong He Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250815063845.85426-1-heminhong@kylinos.cn Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index f78ecb6ad838..d77b52523b6a 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -304,6 +304,9 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; + if (!__hmac_get_algo(hinfo->alg_id)) + return -EINVAL; + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); From ccab044697980c6c01ab51f43f48f13b8a3e5c33 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Fri, 15 Aug 2025 19:28:19 +0200 Subject: [PATCH 170/246] mptcp: drop skb if MPTCP skb extension allocation fails When skb_ext_add(skb, SKB_EXT_MPTCP) fails in mptcp_incoming_options(), we used to return true, letting the segment proceed through the TCP receive path without a DSS mapping. Such segments can leave inconsistent mapping state and trigger a mid-stream fallback to TCP, which in testing collapsed (by artificially forcing failures in skb_ext_add) throughput to zero. Return false instead so the TCP input path drops the skb (see tcp_data_queue() and step-7 processing). This is the safer choice under memory pressure: it preserves MPTCP correctness and provides backpressure to the sender. Control packets remain unaffected: ACK updates and DATA_FIN handling happen before attempting the extension allocation, and tcp_reset() continues to ignore the return value. With this change, MPTCP continues to work at high throughput if we artificially inject failures into skb_ext_add. Fixes: 6787b7e350d3 ("mptcp: avoid processing packet if a subflow reset") Cc: stable@vger.kernel.org Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-1-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 70c0ab0ecf90..2a8ea28442b2 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1118,7 +1118,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, return hmac == mp_opt->ahmac; } -/* Return false if a subflow has been reset, else return true */ +/* Return false in case of error (or subflow has been reset), + * else return true. + */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1222,7 +1224,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) - return true; + return false; memset(mpext, 0, sizeof(*mpext)); From 68fc0f4b0d25692940cdc85c68e366cae63e1757 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:20 +0200 Subject: [PATCH 171/246] mptcp: pm: kernel: flush: do not reset ADD_ADDR limit A flush of the MPTCP endpoints should not affect the MPTCP limits. In other words, 'ip mptcp endpoint flush' should not change 'ip mptcp limits'. But it was the case: the MPTCP_PM_ATTR_RCV_ADD_ADDRS (add_addr_accepted) limit was reset by accident. Removing the reset of this counter during a flush fixes this issue. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reported-by: Thomas Dreibholz Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/579 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-2-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index d39e7c178460..667803d72b64 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -1085,7 +1085,6 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } From 452690be7de2f91cc0de68cb9e95252875b33503 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:21 +0200 Subject: [PATCH 172/246] selftests: mptcp: pm: check flush doesn't reset limits This modification is linked to the parent commit where the received ADD_ADDR limit was accidentally reset when the endpoints were flushed. To validate that, the test is now flushing endpoints after having set new limits, and before checking them. The 'Fixes' tag here below is the same as the one from the previous commit: this patch here is not fixing anything wrong in the selftests, but it validates the previous fix for an issue introduced by this commit ID. Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-3-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 2e6648a2b2c0..ac7ec6f94023 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -198,6 +198,7 @@ set_limits 1 9 2>/dev/null check "get_limits" "${default_limits}" "subflows above hard limit" set_limits 8 8 +flush_endpoint ## to make sure it doesn't affect the limits check "get_limits" "$(format_limits 8 8)" "set limits" flush_endpoint From 5d13349472ac8abcbcb94407969aa0fdc2e1f1be Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:22 +0200 Subject: [PATCH 173/246] mptcp: remove duplicate sk_reset_timer call sk_reset_timer() was called twice in mptcp_pm_alloc_anno_list. Simplify the code by using a 'goto' statement to eliminate the duplication. Note that this is not a fix, but it will help backporting the following patch. The same "Fixes" tag has been added for this reason. Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-4-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 420d416e2603..c5f6a53ce5f1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -353,9 +353,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; + goto reset_timer; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -369,6 +367,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); +reset_timer: sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); From f5ce0714623cffd00bf2a83e890d09c609b7f50a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:23 +0200 Subject: [PATCH 174/246] mptcp: disable add_addr retransmission when timeout is 0 When add_addr_timeout was set to 0, this caused the ADD_ADDR to be retransmitted immediately, which looks like a buggy behaviour. Instead, interpret 0 as "no retransmissions needed". The documentation is updated to explicitly state that setting the timeout to 0 disables retransmission. Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-5-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 2 ++ net/mptcp/pm.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 5bfab01eff5a..1683c139821e 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -12,6 +12,8 @@ add_addr_timeout - INTEGER (seconds) resent to an MPTCP peer that has not acknowledged a previous ADD_ADDR message. + Do not retransmit if set to 0. + The default value matches TCP_RTO_MAX. This is a per-namespace sysctl. diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index c5f6a53ce5f1..136a380602ca 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -274,6 +274,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + unsigned int timeout; pr_debug("msk=%p\n", msk); @@ -291,6 +292,10 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } + timeout = mptcp_get_add_addr_timeout(sock_net(sk)); + if (!timeout) + goto out; + spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { @@ -302,7 +307,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + jiffies + timeout); spin_unlock_bh(&msk->pm.lock); @@ -344,6 +349,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); + unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -368,8 +374,9 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); + timeout = mptcp_get_add_addr_timeout(net); + if (timeout) + sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); return true; } From f92199f551e617fae028c5c5905ddd63e3616e18 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:24 +0200 Subject: [PATCH 175/246] selftests: mptcp: disable add_addr retrans in endpoint_tests To prevent test instability in the "delete re-add signal" test caused by ADD_ADDR retransmissions, disable retransmissions for this test by setting net.mptcp.add_addr_timeout to 0. Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-6-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_join.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index b8af65373b3a..82cae37d9c20 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -3842,6 +3842,7 @@ endpoint_tests() # remove and re-add if reset_with_events "delete re-add signal" && mptcp_lib_kallsyms_has "subflow_rebuild_header$"; then + ip netns exec $ns1 sysctl -q net.mptcp.add_addr_timeout=0 pm_nl_set_limits $ns1 0 3 pm_nl_set_limits $ns2 3 3 pm_nl_add_endpoint $ns1 10.0.2.1 id 1 flags signal From 2eefbed30d46d5e68593baf6b52923e00e7678af Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:25 +0200 Subject: [PATCH 176/246] selftests: mptcp: connect: fix C23 extension warning GCC was complaining about the new label: mptcp_connect.c:187:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 187 | int err = getaddrinfo(node, service, hints, res); | ^ Simply declare 'err' before the label to avoid this warning. Fixes: a862771d1aa4 ("selftests: mptcp: use IPPROTO_MPTCP for getaddrinfo") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-7-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index ac1349c4b9e5..4f07ac9fa207 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -183,9 +183,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; From 3259889fd3c0cc165b7e9ee375c789875dd32326 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:26 +0200 Subject: [PATCH 177/246] selftests: mptcp: sockopt: fix C23 extension warning GCC was complaining about the new label: mptcp_inq.c:79:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 79 | int err = getaddrinfo(node, service, hints, res); | ^ mptcp_sockopt.c:166:2: warning: label followed by a declaration is a C23 extension [-Wc23-extensions] 166 | int err = getaddrinfo(node, service, hints, res); | ^ Simply declare 'err' before the label to avoid this warning. Fixes: dd367e81b79a ("selftests: mptcp: sockopt: use IPPROTO_MPTCP for getaddrinfo") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-8-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_inq.c | 5 +++-- tools/testing/selftests/net/mptcp/mptcp_sockopt.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c index 3cf1e2a612ce..f3bcaa48df8f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_inq.c +++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c @@ -75,9 +75,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index 9934a68df237..e934dd26a59d 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -162,9 +162,10 @@ static void xgetaddrinfo(const char *node, const char *service, struct addrinfo *hints, struct addrinfo **res) { -again: - int err = getaddrinfo(node, service, hints, res); + int err; +again: + err = getaddrinfo(node, service, hints, res); if (err) { const char *errstr; From 89eb9a62aed77b409663ba1eac152e8f758815b7 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 15 Aug 2025 22:18:09 +0200 Subject: [PATCH 178/246] net: dsa: b53: fix reserved register access in b53_fdb_dump() When BCM5325 support was added in c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365"), the register used for ARL access was made conditional on the chip. But in b53_fdb_dump(), instead of the register argument the page argument was replaced, causing it to write to a reserved page 0x50 on !BCM5325*. Writing to this page seems to completely lock the switch up: [ 89.680000] b53-switch spi0.1 lan2: Link is Down [ 89.680000] WARNING: CPU: 1 PID: 26 at drivers/net/phy/phy.c:1350 _phy_state_machine+0x1bc/0x454 [ 89.720000] phy_check_link_status+0x0/0x114: returned: -5 [ 89.730000] Modules linked in: nft_fib_inet nf_flow_table_inet nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 cls_flower sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact vrf md5 crc32c_cryptoapi [ 89.780000] CPU: 1 UID: 0 PID: 26 Comm: kworker/u10:0 Tainted: G W 6.16.0-rc1+ #0 NONE [ 89.780000] Tainted: [W]=WARN [ 89.780000] Hardware name: Netgear DGND3700 v1 [ 89.780000] Workqueue: events_power_efficient phy_state_machine [ 89.780000] Stack : 809c762c 8006b050 00000001 820a9ce3 0000114c 000affff 805d22d0 8200ba00 [ 89.780000] 82005000 6576656e 74735f70 6f776572 5f656666 10008b00 820a9cb8 82088700 [ 89.780000] 00000000 00000000 809c762c 820a9a98 00000000 00000000 ffffefff 80a7a76c [ 89.780000] 80a70000 820a9af8 80a70000 80a70000 80a70000 00000000 809c762c 820a9dd4 [ 89.780000] 00000000 805d1494 80a029e4 80a70000 00000003 00000000 00000004 81a60004 [ 89.780000] ... [ 89.780000] Call Trace: [ 89.780000] [<800228b8>] show_stack+0x38/0x118 [ 89.780000] [<8001afc4>] dump_stack_lvl+0x6c/0xac [ 89.780000] [<80046b90>] __warn+0x9c/0x114 [ 89.780000] [<80046da8>] warn_slowpath_fmt+0x1a0/0x1b0 [ 89.780000] [<805d1494>] _phy_state_machine+0x1bc/0x454 [ 89.780000] [<805d22fc>] phy_state_machine+0x2c/0x70 [ 89.780000] [<80066b08>] process_one_work+0x1e8/0x3e0 [ 89.780000] [<80067a1c>] worker_thread+0x354/0x4e4 [ 89.780000] [<800706cc>] kthread+0x130/0x274 [ 89.780000] [<8001d808>] ret_from_kernel_thread+0x14/0x1c And any further accesses fail: [ 120.790000] b53-switch spi0.1: timeout waiting for ARL to finish: 0x81 [ 120.800000] b53-switch spi0.1: port 2 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 [ 121.010000] b53-switch spi0.1: timeout waiting for ARL to finish: 0xbf [ 121.020000] b53-switch spi0.1: port 3 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 Restore the correct page B53_ARLIO_PAGE again, and move the offset argument to the correct place. *On BCM5325, this became a write to the MIB page of Port 1. Still a reserved offset, but likely less brokenness from that write. Fixes: c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250815201809.549195-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9942fb6f7f4b..829b1f087e9e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2078,7 +2078,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, /* Start search operation */ reg = ARL_SRCH_STDN; - b53_write8(priv, offset, B53_ARL_SRCH_CTL, reg); + b53_write8(priv, B53_ARLIO_PAGE, offset, reg); do { ret = b53_arl_search_wait(priv); From 4611d88a37cfc18cbabc6978aaf7325d1ae3f53a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 16 Aug 2025 11:38:50 -0700 Subject: [PATCH 179/246] bnxt_en: Fix lockdep warning during rmmod The commit under the Fixes tag added a netdev_assert_locked() in bnxt_free_ntp_fltrs(). The lock should be held during normal run-time but the assert will be triggered (see below) during bnxt_remove_one() which should not need the lock. The netdev is already unregistered by then. Fix it by calling netdev_assert_locked_or_invisible() which will not assert if the netdev is unregistered. WARNING: CPU: 5 PID: 2241 at ./include/net/netdev_lock.h:17 bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Modules linked in: rpcrdma rdma_cm iw_cm ib_cm configfs ib_core bnxt_en(-) bridge stp llc x86_pkg_temp_thermal xfs tg3 [last unloaded: bnxt_re] CPU: 5 UID: 0 PID: 2241 Comm: rmmod Tainted: G S W 6.16.0 #2 PREEMPT(voluntary) Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 RIP: 0010:bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Code: 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 48 8b 47 60 be ff ff ff ff 48 8d b8 28 0c 00 00 e8 d0 cf 41 c3 85 c0 0f 85 2e ff ff ff <0f> 0b e9 27 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffa92082387da0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9e5b593d8000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff83dc9a70 RDI: ffffffff83e1a1cf RBP: ffff9e5b593d8c80 R08: 0000000000000000 R09: ffffffff8373a2b3 R10: 000000008100009f R11: 0000000000000001 R12: 0000000000000001 R13: ffffffffc01c4478 R14: dead000000000122 R15: dead000000000100 FS: 00007f3a8a52c740(0000) GS:ffff9e631ad1c000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bb289419c8 CR3: 000000011274e001 CR4: 00000000003706f0 Call Trace: bnxt_remove_one+0x57/0x180 [bnxt_en] pci_device_remove+0x39/0xc0 device_release_driver_internal+0xa5/0x130 driver_detach+0x42/0x90 bus_remove_driver+0x61/0xc0 pci_unregister_driver+0x38/0x90 bnxt_exit+0xc/0x7d0 [bnxt_en] Fixes: 004b5008016a ("eth: bnxt: remove most dependencies on RTNL") Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250816183850.4125033-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2800a90fba1f..207a8bb36ae5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5332,7 +5332,7 @@ static void bnxt_free_ntp_fltrs(struct bnxt *bp, bool all) { int i; - netdev_assert_locked(bp->dev); + netdev_assert_locked_or_invisible(bp->dev); /* Under netdev instance lock and all our NAPIs have been disabled. * It's safe to delete the hash table. From 62c30c544359aa18b8fb2734166467a07d435c2d Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:57 +0800 Subject: [PATCH 180/246] net: ethernet: mtk_ppe: add RCU lock around dev_fill_forward_path Ensure ndo_fill_forward_path() is called with RCU lock held. Fixes: 2830e314778d ("net: ethernet: mtk-ppe: fix traffic offload with bridged wlan") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index c855fb799ce1..e9bd32741983 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -101,7 +101,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) return -1; + rcu_read_lock(); err = dev_fill_forward_path(dev, addr, &stack); + rcu_read_unlock(); if (err) return err; From 0417adf367a0af11adf7ace849af4638cfb573f7 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:58 +0800 Subject: [PATCH 181/246] ppp: fix race conditions in ppp_fill_forward_path ppp_fill_forward_path() has two race conditions: 1. The ppp->channels list can change between list_empty() and list_first_entry(), as ppp_lock() is not held. If the only channel is deleted in ppp_disconnect_channel(), list_first_entry() may access an empty head or a freed entry, and trigger a panic. 2. pch->chan can be NULL. When ppp_unregister_channel() is called, pch->chan is set to NULL before pch is removed from ppp->channels. Fix these by using a lockless RCU approach: - Use list_first_or_null_rcu() to safely test and access the first list entry. - Convert list modifications on ppp->channels to their RCU variants and add synchronize_net() after removal. - Check for a NULL pch->chan before dereferencing it. Fixes: f6efc675c9dd ("net: ppp: resolve forwarding path for bridge pppoe devices") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-2-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ppp/ppp_generic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 8c98cbd4b06d..824c8dc4120b 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1598,11 +1599,14 @@ static int ppp_fill_forward_path(struct net_device_path_ctx *ctx, if (ppp->flags & SC_MULTILINK) return -EOPNOTSUPP; - if (list_empty(&ppp->channels)) + pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist); + if (!pch) + return -ENODEV; + + chan = READ_ONCE(pch->chan); + if (!chan) return -ENODEV; - pch = list_first_entry(&ppp->channels, struct channel, clist); - chan = pch->chan; if (!chan->ops->fill_forward_path) return -EOPNOTSUPP; @@ -2994,7 +2998,7 @@ ppp_unregister_channel(struct ppp_channel *chan) */ down_write(&pch->chan_sem); spin_lock_bh(&pch->downl); - pch->chan = NULL; + WRITE_ONCE(pch->chan, NULL); spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); @@ -3515,7 +3519,7 @@ ppp_connect_channel(struct channel *pch, int unit) hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */ if (hdrlen > ppp->dev->hard_header_len) ppp->dev->hard_header_len = hdrlen; - list_add_tail(&pch->clist, &ppp->channels); + list_add_tail_rcu(&pch->clist, &ppp->channels); ++ppp->n_channels; pch->ppp = ppp; refcount_inc(&ppp->file.refcnt); @@ -3545,10 +3549,11 @@ ppp_disconnect_channel(struct channel *pch) if (ppp) { /* remove it from the ppp unit's list */ ppp_lock(ppp); - list_del(&pch->clist); + list_del_rcu(&pch->clist); if (--ppp->n_channels == 0) wake_up_interruptible(&ppp->file.rwait); ppp_unlock(ppp); + synchronize_net(); if (refcount_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); err = 0; From 01792bc3e5bdafa171dd83c7073f00e7de93a653 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Thu, 14 Aug 2025 16:21:06 +0530 Subject: [PATCH 182/246] net: ti: icssg-prueth: Fix HSR and switch offload Enablement during firwmare reload. To enable HSR / Switch offload, certain configurations are needed. Currently they are done inside icssg_change_mode(). This function only gets called if we move from one mode to another without bringing the links up / down. Once in HSR / Switch mode, if we bring the links down and bring it back up again. The callback sequence is, - emac_ndo_stop() Firmwares are stopped - emac_ndo_open() Firmwares are loaded In this path icssg_change_mode() doesn't get called and as a result the configurations needed for HSR / Switch is not done. To fix this, put all these configurations in a separate function icssg_enable_fw_offload() and call this from both icssg_change_mode() and emac_ndo_open() Fixes: 56375086d093 ("net: ti: icssg-prueth: Enable HSR Tx duplication, Tx Tag and Rx Tag offload") Signed-off-by: MD Danish Anwar Link: https://patch.msgid.link/20250814105106.1491871-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 72 +++++++++++--------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c7d776ae4ee..dadce6009791 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -203,6 +203,44 @@ static void prueth_emac_stop(struct prueth *prueth) } } +static void icssg_enable_fw_offload(struct prueth *prueth) +{ + struct prueth_emac *emac; + int mac; + + for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { + emac = prueth->emac[mac]; + if (prueth->is_hsr_offload_mode) { + if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); + else + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); + } + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) { + if (netif_running(emac->ndev)) { + icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, + ICSSG_FDB_ENTRY_P0_MEMBERSHIP | + ICSSG_FDB_ENTRY_P1_MEMBERSHIP | + ICSSG_FDB_ENTRY_P2_MEMBERSHIP | + ICSSG_FDB_ENTRY_BLOCK, + true); + icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, + BIT(emac->port_id) | DEFAULT_PORT_MASK, + BIT(emac->port_id) | DEFAULT_UNTAG_MASK, + true); + if (prueth->is_hsr_offload_mode) + icssg_vtbl_modify(emac, DEFAULT_VID, + DEFAULT_PORT_MASK, + DEFAULT_UNTAG_MASK, true); + icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); + if (prueth->is_switch_mode) + icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); + } + } + } +} + static int prueth_emac_common_start(struct prueth *prueth) { struct prueth_emac *emac; @@ -753,6 +791,7 @@ static int emac_ndo_open(struct net_device *ndev) ret = prueth_emac_common_start(prueth); if (ret) goto free_rx_irq; + icssg_enable_fw_offload(prueth); } flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; @@ -1360,8 +1399,7 @@ static int prueth_emac_restart(struct prueth *prueth) static void icssg_change_mode(struct prueth *prueth) { - struct prueth_emac *emac; - int mac, ret; + int ret; ret = prueth_emac_restart(prueth); if (ret) { @@ -1369,35 +1407,7 @@ static void icssg_change_mode(struct prueth *prueth) return; } - for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { - emac = prueth->emac[mac]; - if (prueth->is_hsr_offload_mode) { - if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); - else - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); - } - - if (netif_running(emac->ndev)) { - icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, - ICSSG_FDB_ENTRY_P0_MEMBERSHIP | - ICSSG_FDB_ENTRY_P1_MEMBERSHIP | - ICSSG_FDB_ENTRY_P2_MEMBERSHIP | - ICSSG_FDB_ENTRY_BLOCK, - true); - icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, - BIT(emac->port_id) | DEFAULT_PORT_MASK, - BIT(emac->port_id) | DEFAULT_UNTAG_MASK, - true); - if (prueth->is_hsr_offload_mode) - icssg_vtbl_modify(emac, DEFAULT_VID, - DEFAULT_PORT_MASK, - DEFAULT_UNTAG_MASK, true); - icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); - if (prueth->is_switch_mode) - icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); - } - } + icssg_enable_fw_offload(prueth); } static int prueth_netdevice_port_link(struct net_device *ndev, From 7375f22495e7cd1c5b3b5af9dcc4f6dffe34ce49 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 11 Aug 2025 22:18:30 +0800 Subject: [PATCH 183/246] fs/buffer: fix use-after-free when call bh_read() helper There's issue as follows: BUG: KASAN: stack-out-of-bounds in end_buffer_read_sync+0xe3/0x110 Read of size 8 at addr ffffc9000168f7f8 by task swapper/3/0 CPU: 3 UID: 0 PID: 0 Comm: swapper/3 Not tainted 6.16.0-862.14.0.6.x86_64 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl+0x55/0x70 print_address_description.constprop.0+0x2c/0x390 print_report+0xb4/0x270 kasan_report+0xb8/0xf0 end_buffer_read_sync+0xe3/0x110 end_bio_bh_io_sync+0x56/0x80 blk_update_request+0x30a/0x720 scsi_end_request+0x51/0x2b0 scsi_io_completion+0xe3/0x480 ? scsi_device_unbusy+0x11e/0x160 blk_complete_reqs+0x7b/0x90 handle_softirqs+0xef/0x370 irq_exit_rcu+0xa5/0xd0 sysvec_apic_timer_interrupt+0x6e/0x90 Above issue happens when do ntfs3 filesystem mount, issue may happens as follows: mount IRQ ntfs_fill_super read_cache_page do_read_cache_folio filemap_read_folio mpage_read_folio do_mpage_readpage ntfs_get_block_vbo bh_read submit_bh wait_on_buffer(bh); blk_complete_reqs scsi_io_completion scsi_end_request blk_update_request end_bio_bh_io_sync end_buffer_read_sync __end_buffer_read_notouch unlock_buffer wait_on_buffer(bh);--> return will return to caller put_bh --> trigger stack-out-of-bounds In the mpage_read_folio() function, the stack variable 'map_bh' is passed to ntfs_get_block_vbo(). Once unlock_buffer() unlocks and wait_on_buffer() returns to continue processing, the stack variable is likely to be reclaimed. Consequently, during the end_buffer_read_sync() process, calling put_bh() may result in stack overrun. If the bh is not allocated on the stack, it belongs to a folio. Freeing a buffer head which belongs to a folio is done by drop_buffers() which will fail to free buffers which are still locked. So it is safe to call put_bh() before __end_buffer_read_notouch(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Ye Bin Link: https://lore.kernel.org/20250811141830.343774-1-yebin@huaweicloud.com Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index ead4dc85debd..6a8752f7bbed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) */ void end_buffer_read_sync(struct buffer_head *bh, int uptodate) { - __end_buffer_read_notouch(bh, uptodate); put_bh(bh); + __end_buffer_read_notouch(bh, uptodate); } EXPORT_SYMBOL(end_buffer_read_sync); From 589c12edcd8a7b3b24f407b58443bab3560125e4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Aug 2025 12:41:15 +0300 Subject: [PATCH 184/246] coredump: Fix return value in coredump_parse() The coredump_parse() function is bool type. It should return true on success and false on failure. The cn_printf() returns zero on success or negative error codes. This mismatch means that when "return err;" here, it is treated as success instead of failure. Change it to return false instead. Fixes: a5715af549b2 ("coredump: make coredump_parse() return bool") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/aKRGu14w5vPSZLgv@stanley.mountain Signed-off-by: Christian Brauner --- fs/coredump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/coredump.c b/fs/coredump.c index fedbead956ed..5dce257c67fc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -345,7 +345,7 @@ static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm, was_space = false; err = cn_printf(cn, "%c", '\0'); if (err) - return err; + return false; (*argv)[(*argc)++] = cn->used; } } From c237aa9884f238e1480897463ca034877ca7530b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 19 Aug 2025 12:08:58 +0200 Subject: [PATCH 185/246] kernfs: don't fail listing extended attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace doesn't expect a failure to list extended attributes: $ ls -lA /sys/ ls: /sys/: No data available ls: /sys/kernel: No data available ls: /sys/power: No data available ls: /sys/class: No data available ls: /sys/devices: No data available ls: /sys/dev: No data available ls: /sys/hypervisor: No data available ls: /sys/fs: No data available ls: /sys/bus: No data available ls: /sys/firmware: No data available ls: /sys/block: No data available ls: /sys/module: No data available total 0 drwxr-xr-x 2 root root 0 Jan 1 1970 block drwxr-xr-x 52 root root 0 Jan 1 1970 bus drwxr-xr-x 88 root root 0 Jan 1 1970 class drwxr-xr-x 4 root root 0 Jan 1 1970 dev drwxr-xr-x 11 root root 0 Jan 1 1970 devices drwxr-xr-x 3 root root 0 Jan 1 1970 firmware drwxr-xr-x 10 root root 0 Jan 1 1970 fs drwxr-xr-x 2 root root 0 Jul 2 09:43 hypervisor drwxr-xr-x 14 root root 0 Jan 1 1970 kernel drwxr-xr-x 251 root root 0 Jan 1 1970 module drwxr-xr-x 3 root root 0 Jul 2 09:43 power Fix it by simply reporting success when no extended attributes are available instead of reporting ENODATA. Link: https://lore.kernel.org/78b13bcdae82ade95e88f315682966051f461dde.camel@linaro.org Fixes: d1f4e9026007 ("kernfs: remove iattr_mutex") # mainline only Reported-by: André Draszik Link: https://lore.kernel.org/20250819-ahndung-abgaben-524a535f8101@brauner Signed-off-by: Christian Brauner --- fs/kernfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 3c293a5a21b1..457f91c412d4 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -142,9 +142,9 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; - attrs = kernfs_iattrs_noalloc(kn); + attrs = kernfs_iattrs(kn); if (!attrs) - return -ENODATA; + return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } From a2c1f82618b0b65f1ef615aa9cfdac8122537d69 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Mon, 18 Aug 2025 21:43:10 +0800 Subject: [PATCH 186/246] signal: Fix memory leak for PIDFD_SELF* sentinels Commit f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") introduced a leak by acquiring a pid reference through get_task_pid(), which increments pid->count but never drops it with put_pid(). As a result, kmemleak reports unreferenced pid objects after running tools/testing/selftests/pidfd/pidfd_test, for example: unreferenced object 0xff1100206757a940 (size 160): comm "pidfd_test", pid 16965, jiffies 4294853028 hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 00 00 00 00 fd 57 50 04 .............WP. 5e 44 00 00 00 00 00 00 18 de 34 17 01 00 11 ff ^D........4..... backtrace (crc cd8844d4): kmem_cache_alloc_noprof+0x2f4/0x3f0 alloc_pid+0x54/0x3d0 copy_process+0xd58/0x1740 kernel_clone+0x99/0x3b0 __do_sys_clone3+0xbe/0x100 do_syscall_64+0x7b/0x2c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix this by calling put_pid() after do_pidfd_send_signal() returns. Fixes: f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") Signed-off-by: Adrian Huang (Lenovo) Link: https://lore.kernel.org/20250818134310.12273-1-adrianhuang0701@gmail.com Tested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Signed-off-by: Christian Brauner --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index e2c928de7d2c..fe9190d84f28 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, { struct pid *pid; enum pid_type type; + int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) @@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, } } - return do_pidfd_send_signal(pid, sig, type, info, flags); + ret = do_pidfd_send_signal(pid, sig, type, info, flags); + put_pid(pid); + + return ret; } static int From 0ddfb62f5d018edcb571a3d8ea30ad5332cf2a69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Aug 2025 15:38:14 -0400 Subject: [PATCH 187/246] fix the softlockups in attach_recursive_mnt() In case when we mounting something on top of a large stack of overmounts, all of them being peers of each other, we get quadratic time by the depth of overmount stack. Easily fixed by doing commit_tree() before reparenting the overmount; simplifies commit_tree() as well - it doesn't need to skip the already mounted stuff that had been reparented on top of the new mounts. Since we are holding mount_lock through both reparenting and call of commit_tree(), the order does not matter from the mount hash point of view. Reported-by: "Lai, Yi" Tested-by: "Lai, Yi" Reviewed-by: Christian Brauner Fixes: 663206854f02 "copy_tree(): don't link the mounts via mnt_list" Signed-off-by: Al Viro --- fs/namespace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..1c97f93d1865 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1197,10 +1197,7 @@ static void commit_tree(struct mount *mnt) if (!mnt_ns_attached(mnt)) { for (struct mount *m = mnt; m; m = next_mnt(m, mnt)) - if (unlikely(mnt_ns_attached(m))) - m = skip_mnt_tree(m); - else - mnt_add_to_ns(n, m); + mnt_add_to_ns(n, m); n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; } @@ -2704,6 +2701,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, lock_mnt_tree(child); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); + commit_tree(child); if (q) { struct mountpoint *mp = root.mp; struct mount *r = child; @@ -2713,7 +2711,6 @@ static int attach_recursive_mnt(struct mount *source_mnt, mp = shorter; mnt_change_mountpoint(r, mp, q); } - commit_tree(child); } unpin_mountpoint(&root); unlock_mount_hash(); From da025cdb97a23c1916d8491925b878f3e1de0bca Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 14 Aug 2025 23:32:26 -0400 Subject: [PATCH 188/246] propagate_umount(): only surviving overmounts should be reparented ... as the comments in reparent() clearly say. As it is, we reparent *all* overmounts of the mounts being taken out, including those that are taken out themselves. It's not only a potentially massive slowdown (on a pathological setup we might end up with O(N^2) time for N mounts being kicked out), it can end up with incorrect ->overmount in the surviving mounts. Fixes: f0d0ba19985d "Rewrite of propagate_umount()" Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/pnode.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/pnode.c b/fs/pnode.c index 81f7599bdac4..1c789f88b3d2 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -637,10 +637,11 @@ void propagate_umount(struct list_head *set) } // now to_umount consists of all acceptable candidates - // deal with reparenting of remaining overmounts on those + // deal with reparenting of surviving overmounts on those list_for_each_entry(m, &to_umount, mnt_list) { - if (m->overmount) - reparent(m->overmount); + struct mount *over = m->overmount; + if (over && !will_be_unmounted(over)) + reparent(over); } // and fold them into the set From cffd0441872e7f6b1fce5e78fb1c99187a291330 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 14 Aug 2025 01:44:31 -0400 Subject: [PATCH 189/246] use uniform permission checks for all mount propagation changes do_change_type() and do_set_group() are operating on different aspects of the same thing - propagation graph. The latter asks for mounts involved to be mounted in namespace(s) the caller has CAP_SYS_ADMIN for. The former is a mess - originally it didn't even check that mount *is* mounted. That got fixed, but the resulting check turns out to be too strict for userland - in effect, we check that mount is in our namespace, having already checked that we have CAP_SYS_ADMIN there. What we really need (in both cases) is * only touch mounts that are mounted. That's a must-have constraint - data corruption happens if it get violated. * don't allow to mess with a namespace unless you already have enough permissions to do so (i.e. CAP_SYS_ADMIN in its userns). That's an equivalent of what do_set_group() does; let's extract that into a helper (may_change_propagation()) and use it in both do_set_group() and do_change_type(). Fixes: 12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts" Acked-by: Andrei Vagin Reviewed-by: Pavel Tikhomirov Tested-by: Pavel Tikhomirov Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 1c97f93d1865..88db58061919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2859,6 +2859,19 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) return attach_recursive_mnt(mnt, p, mp); } +static int may_change_propagation(const struct mount *m) +{ + struct mnt_namespace *ns = m->mnt_ns; + + // it must be mounted in some namespace + if (IS_ERR_OR_NULL(ns)) // is_mounted() + return -EINVAL; + // and the caller must be admin in userns of that namespace + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + /* * Sanity check the flags to change_mnt_propagation. */ @@ -2895,10 +2908,10 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); - if (!check_mnt(mnt)) { - err = -EINVAL; + err = may_change_propagation(mnt); + if (err) goto out_unlock; - } + if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -3344,18 +3357,11 @@ static int do_set_group(struct path *from_path, struct path *to_path) namespace_lock(); - err = -EINVAL; - /* To and From must be mounted */ - if (!is_mounted(&from->mnt)) + err = may_change_propagation(from); + if (err) goto out; - if (!is_mounted(&to->mnt)) - goto out; - - err = -EPERM; - /* We should be allowed to modify mount namespaces of both mounts */ - if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) - goto out; - if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + err = may_change_propagation(to); + if (err) goto out; err = -EINVAL; From fb924b7b8669503582e003dd7b7340ee49029801 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Aug 2025 15:23:08 -0400 Subject: [PATCH 190/246] change_mnt_propagation(): calculate propagation source only if we'll need it We only need it when mount in question was sending events downstream (then recepients need to switch to new master) or the mount is being turned into slave (then we need a new master for it). That wouldn't be a big deal, except that it causes quite a bit of work when umount_tree() is taking a large peer group out. Adding a trivial "don't bother calling propagation_source() unless we are going to use its results" logics improves the things quite a bit. We are still doing unnecessary work on bulk removals from propagation graph, but the full solution for that will have to wait for the next merge window. Fixes: 955336e204ab "do_make_slave(): choose new master sanely" Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/pnode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/pnode.c b/fs/pnode.c index 1c789f88b3d2..6f7d02f3fa98 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -111,7 +111,8 @@ void change_mnt_propagation(struct mount *mnt, int type) return; } if (IS_MNT_SHARED(mnt)) { - m = propagation_source(mnt); + if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list)) + m = propagation_source(mnt); if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { From 4a73a36cb704813f588af13d9842d0ba5a185758 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 14 Aug 2025 17:42:14 +0200 Subject: [PATCH 191/246] cdc_ncm: Flag Intel OEM version of Fibocom L850-GL as WWAN This lets NetworkManager/ModemManager know that this is a modem and needs to be connected first. Signed-off-by: Lubomir Rintel Link: https://patch.msgid.link/20250814154214.250103-1-lkundrak@v3.sk Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index ea0e5e276cd6..5d123df0a866 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -2087,6 +2087,13 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&wwan_info, }, + /* Intel modem (label from OEM reads Fibocom L850-GL) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x8087, 0x095a, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, + }, + /* DisplayLink docking stations */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, From f179f5bc158f07693b74c264f8933c8b0f07503f Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 15 Aug 2025 10:53:17 -0300 Subject: [PATCH 192/246] net/sched: sch_dualpi2: Run prob update timer in softirq to avoid deadlock When a user creates a dualpi2 qdisc it automatically sets a timer. This timer will run constantly and update the qdisc's probability field. The issue is that the timer acquires the qdisc root lock and runs in hardirq. The qdisc root lock is also acquired in dev.c whenever a packet arrives for this qdisc. Since the dualpi2 timer callback runs in hardirq, it may interrupt the packet processing running in softirq. If that happens and it runs on the same CPU, it will acquire the same lock and cause a deadlock. The following splat shows up when running a kernel compiled with lock debugging: [ +0.000224] WARNING: inconsistent lock state [ +0.000224] 6.16.0+ #10 Not tainted [ +0.000169] -------------------------------- [ +0.000029] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ +0.000000] ping/156 [HC0[0]:SC0[2]:HE1:SE0] takes: [ +0.000000] ffff897841242110 (&sch->root_lock_key){?.-.}-{3:3}, at: __dev_queue_xmit+0x86d/0x1140 [ +0.000000] {IN-HARDIRQ-W} state was registered at: [ +0.000000] lock_acquire.part.0+0xb6/0x220 [ +0.000000] _raw_spin_lock+0x31/0x80 [ +0.000000] dualpi2_timer+0x6f/0x270 [ +0.000000] __hrtimer_run_queues+0x1c5/0x360 [ +0.000000] hrtimer_interrupt+0x115/0x260 [ +0.000000] __sysvec_apic_timer_interrupt+0x6d/0x1a0 [ +0.000000] sysvec_apic_timer_interrupt+0x6e/0x80 [ +0.000000] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [ +0.000000] pv_native_safe_halt+0xf/0x20 [ +0.000000] default_idle+0x9/0x10 [ +0.000000] default_idle_call+0x7e/0x1e0 [ +0.000000] do_idle+0x1e8/0x250 [ +0.000000] cpu_startup_entry+0x29/0x30 [ +0.000000] rest_init+0x151/0x160 [ +0.000000] start_kernel+0x6f3/0x700 [ +0.000000] x86_64_start_reservations+0x24/0x30 [ +0.000000] x86_64_start_kernel+0xc8/0xd0 [ +0.000000] common_startup_64+0x13e/0x148 [ +0.000000] irq event stamp: 6884 [ +0.000000] hardirqs last enabled at (6883): [] neigh_resolve_output+0x223/0x270 [ +0.000000] hardirqs last disabled at (6882): [] neigh_resolve_output+0x1e8/0x270 [ +0.000000] softirqs last enabled at (6880): [] neigh_resolve_output+0x1db/0x270 [ +0.000000] softirqs last disabled at (6884): [] __dev_queue_xmit+0x73/0x1140 [ +0.000000] other info that might help us debug this: [ +0.000000] Possible unsafe locking scenario: [ +0.000000] CPU0 [ +0.000000] ---- [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] *** DEADLOCK *** [ +0.000000] 4 locks held by ping/156: [ +0.000000] #0: ffff897842332e08 (sk_lock-AF_INET){+.+.}-{0:0}, at: raw_sendmsg+0x41e/0xf40 [ +0.000000] #1: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_output+0x2c/0x190 [ +0.000000] #2: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_finish_output2+0xad/0x950 [ +0.000000] #3: ffffffffa816f840 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x73/0x1140 I am able to reproduce it consistently when running the following: tc qdisc add dev lo handle 1: root dualpi2 ping -f 127.0.0.1 To fix it, make the timer run in softirq. Fixes: 320d031ad6e4 ("sched: Struct definition and parsing of dualpi2 qdisc") Reviewed-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250815135317.664993-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_dualpi2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 845375ebd4ea..4b975feb52b1 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -927,7 +927,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); @@ -937,7 +938,7 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, } hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); return 0; } From bc1a59cff9f797bfbf8f3104507584d89e9ecf2e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 18 Aug 2025 10:10:29 +0200 Subject: [PATCH 193/246] phy: mscc: Fix timestamping for vsc8584 There was a problem when we received frames and the frames were timestamped. The driver is configured to store the nanosecond part of the timestmap in the ptp reserved bits and it would take the second part by reading the LTC. The problem is that when reading the LTC we are in atomic context and to read the second part will go over mdio bus which might sleep, so we get an error. The fix consists in actually put all the frames in a queue and start the aux work and in that work to read the LTC and then calculate the full received time. Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Reviewed-by: Vadim Fedorenko Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250818081029.1300780-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc.h | 12 ++++++++ drivers/net/phy/mscc/mscc_main.c | 12 ++++++++ drivers/net/phy/mscc/mscc_ptp.c | 49 ++++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 6a3d8a754eb8..58c6d47fbe04 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -362,6 +362,13 @@ struct vsc85xx_hw_stat { u16 mask; }; +struct vsc8531_skb_cb { + u32 ns; +}; + +#define VSC8531_SKB_CB(skb) \ + ((struct vsc8531_skb_cb *)((skb)->cb)) + struct vsc8531_private { int rate_magic; u16 supp_led_modes; @@ -410,6 +417,11 @@ struct vsc8531_private { */ struct mutex ts_lock; struct mutex phc_lock; + + /* list of skbs that were received and need timestamp information but it + * didn't received it yet + */ + struct sk_buff_head rx_skbs_list; }; /* Shared structure between the PHYs of the same package. diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 7ed6522fb0ef..f1c9ce351ab4 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -2335,6 +2335,13 @@ static int vsc85xx_probe(struct phy_device *phydev) return vsc85xx_dt_led_modes_get(phydev, default_mode); } +static void vsc85xx_remove(struct phy_device *phydev) +{ + struct vsc8531_private *priv = phydev->priv; + + skb_queue_purge(&priv->rx_skbs_list); +} + /* Microsemi VSC85xx PHYs */ static struct phy_driver vsc85xx_driver[] = { { @@ -2589,6 +2596,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2614,6 +2622,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2639,6 +2648,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2662,6 +2672,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2685,6 +2696,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index 275706de5847..de6c7312e8f2 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -1194,9 +1194,7 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, { struct vsc8531_private *vsc8531 = container_of(mii_ts, struct vsc8531_private, mii_ts); - struct skb_shared_hwtstamps *shhwtstamps = NULL; struct vsc85xx_ptphdr *ptphdr; - struct timespec64 ts; unsigned long ns; if (!vsc8531->ptp->configured) @@ -1206,27 +1204,52 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, type == PTP_CLASS_NONE) return false; - vsc85xx_gettime(&vsc8531->ptp->caps, &ts); - ptphdr = get_ptp_header_rx(skb, vsc8531->ptp->rx_filter); if (!ptphdr) return false; - shhwtstamps = skb_hwtstamps(skb); - memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); - ns = ntohl(ptphdr->rsrvd2); - /* nsec is in reserved field */ - if (ts.tv_nsec < ns) - ts.tv_sec--; + VSC8531_SKB_CB(skb)->ns = ns; + skb_queue_tail(&vsc8531->rx_skbs_list, skb); - shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, ns); - netif_rx(skb); + ptp_schedule_worker(vsc8531->ptp->ptp_clock, 0); return true; } +static long vsc85xx_do_aux_work(struct ptp_clock_info *info) +{ + struct vsc85xx_ptp *ptp = container_of(info, struct vsc85xx_ptp, caps); + struct skb_shared_hwtstamps *shhwtstamps = NULL; + struct phy_device *phydev = ptp->phydev; + struct vsc8531_private *priv = phydev->priv; + struct sk_buff_head received; + struct sk_buff *rx_skb; + struct timespec64 ts; + unsigned long flags; + + __skb_queue_head_init(&received); + spin_lock_irqsave(&priv->rx_skbs_list.lock, flags); + skb_queue_splice_tail_init(&priv->rx_skbs_list, &received); + spin_unlock_irqrestore(&priv->rx_skbs_list.lock, flags); + + vsc85xx_gettime(info, &ts); + while ((rx_skb = __skb_dequeue(&received)) != NULL) { + shhwtstamps = skb_hwtstamps(rx_skb); + memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); + + if (ts.tv_nsec < VSC8531_SKB_CB(rx_skb)->ns) + ts.tv_sec--; + + shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, + VSC8531_SKB_CB(rx_skb)->ns); + netif_rx(rx_skb); + } + + return -1; +} + static const struct ptp_clock_info vsc85xx_clk_caps = { .owner = THIS_MODULE, .name = "VSC85xx timer", @@ -1240,6 +1263,7 @@ static const struct ptp_clock_info vsc85xx_clk_caps = { .adjfine = &vsc85xx_adjfine, .gettime64 = &vsc85xx_gettime, .settime64 = &vsc85xx_settime, + .do_aux_work = &vsc85xx_do_aux_work, }; static struct vsc8531_private *vsc8584_base_priv(struct phy_device *phydev) @@ -1567,6 +1591,7 @@ int vsc8584_ptp_probe(struct phy_device *phydev) mutex_init(&vsc8531->phc_lock); mutex_init(&vsc8531->ts_lock); + skb_queue_head_init(&vsc8531->rx_skbs_list); /* Retrieve the shared load/save GPIO. Request it as non exclusive as * the same GPIO can be requested by all the PHYs of the same package. From 24ef2f53c07f273bad99173e27ee88d44d135b1c Mon Sep 17 00:00:00 2001 From: Yuichiro Tsuji Date: Mon, 18 Aug 2025 17:45:07 +0900 Subject: [PATCH 194/246] net: usb: asix_devices: Fix PHY address mask in MDIO bus initialization Syzbot reported shift-out-of-bounds exception on MDIO bus initialization. The PHY address should be masked to 5 bits (0-31). Without this mask, invalid PHY addresses could be used, potentially causing issues with MDIO bus operations. Fix this by masking the PHY address with 0x1f (31 decimal) to ensure it stays within the valid range. Fixes: 4faff70959d5 ("net: usb: asix_devices: add phy_mask for ax88772 mdio bus") Reported-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=20537064367a0f98d597 Tested-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Signed-off-by: Yuichiro Tsuji Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250818084541.1958-1-yuichtsu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index d9f5942ccc44..792ddda1ad49 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -676,7 +676,7 @@ static int ax88772_init_mdio(struct usbnet *dev) priv->mdio->read = &asix_mdio_bus_read; priv->mdio->write = &asix_mdio_bus_write; priv->mdio->name = "Asix MDIO Bus"; - priv->mdio->phy_mask = ~(BIT(priv->phy_addr) | BIT(AX_EMBD_PHY_ADDR)); + priv->mdio->phy_mask = ~(BIT(priv->phy_addr & 0x1f) | BIT(AX_EMBD_PHY_ADDR)); /* mii bus name is usb-- */ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d", dev->udev->bus->busnum, dev->udev->devnum); From 75a9a46d67f46d608205888f9b34e315c1786345 Mon Sep 17 00:00:00 2001 From: Jordan Rhee Date: Mon, 18 Aug 2025 14:12:45 -0700 Subject: [PATCH 195/246] gve: prevent ethtool ops after shutdown A crash can occur if an ethtool operation is invoked after shutdown() is called. shutdown() is invoked during system shutdown to stop DMA operations without performing expensive deallocations. It is discouraged to unregister the netdev in this path, so the device may still be visible to userspace and kernel helpers. In gve, shutdown() tears down most internal data structures. If an ethtool operation is dispatched after shutdown(), it will dereference freed or NULL pointers, leading to a kernel panic. While graceful shutdown normally quiesces userspace before invoking the reboot syscall, forced shutdowns (as observed on GCP VMs) can still trigger this path. Fix by calling netif_device_detach() in shutdown(). This marks the device as detached so the ethtool ioctl handler will skip dispatching operations to the driver. Fixes: 974365e51861 ("gve: Implement suspend/resume/shutdown") Signed-off-by: Jordan Rhee Signed-off-by: Jeroen de Borst Link: https://patch.msgid.link/20250818211245.1156919-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 1f411d7c4373..1be1b1ef31ee 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2870,6 +2870,8 @@ static void gve_shutdown(struct pci_dev *pdev) struct gve_priv *priv = netdev_priv(netdev); bool was_up = netif_running(priv->dev); + netif_device_detach(netdev); + rtnl_lock(); netdev_lock(netdev); if (was_up && gve_close(priv->dev)) { From 6d6714bf0c4e8eb2274081b4b023dfa01581c123 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Fri, 15 Aug 2025 10:48:03 +0000 Subject: [PATCH 196/246] net: stmmac: thead: Enable TX clock before MAC initialization The clk_tx_i clock must be supplied to the MAC for successful initialization. On TH1520 SoC, the clock is provided by an internal divider configured through GMAC_PLLCLK_DIV register when using RGMII interface. However, currently we don't setup the divider before initialization of the MAC, resulting in DMA reset failures if the bootloader/firmware doesn't enable the divider, [ 7.839601] thead-dwmac ffe7060000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 [ 7.938338] thead-dwmac ffe7060000.ethernet eth0: PHY [stmmac-0:02] driver [RTL8211F Gigabit Ethernet] (irq=POLL) [ 8.160746] thead-dwmac ffe7060000.ethernet eth0: Failed to reset the dma [ 8.170118] thead-dwmac ffe7060000.ethernet eth0: stmmac_hw_setup: DMA engine initialization failed [ 8.179384] thead-dwmac ffe7060000.ethernet eth0: __stmmac_open: Hw setup failed Let's simply write GMAC_PLLCLK_DIV_EN to GMAC_PLLCLK_DIV to enable the divider before MAC initialization. Note that for reconfiguring the divisor, the divider must be disabled first and re-enabled later to make sure the new divisor take effect. The exact clock rate doesn't affect MAC's initialization according to my test. It's set to the speed required by RGMII when the linkspeed is 1Gbps and could be reclocked later after link is up if necessary. Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC") Signed-off-by: Yao Zi Reviewed-by: Drew Fustini Link: https://patch.msgid.link/20250815104803.55294-1-ziyao@disroot.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c index f2946bea0bc2..6c6c49e4b66f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c @@ -152,7 +152,7 @@ static int thead_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) { struct thead_dwmac *dwmac = plat->bsp_priv; - u32 reg; + u32 reg, div; switch (plat->mac_interface) { case PHY_INTERFACE_MODE_MII: @@ -164,6 +164,13 @@ static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_TXID: /* use pll */ + div = clk_get_rate(plat->stmmac_clk) / rgmii_clock(SPEED_1000); + reg = FIELD_PREP(GMAC_PLLCLK_DIV_EN, 1) | + FIELD_PREP(GMAC_PLLCLK_DIV_NUM, div); + + writel(0, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(reg, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(GMAC_GTXCLK_SEL_PLL, dwmac->apb_base + GMAC_GTXCLK_SEL); reg = GMAC_TX_CLK_EN | GMAC_TX_CLK_N_EN | GMAC_TX_CLK_OUT_EN | GMAC_RX_CLK_EN | GMAC_RX_CLK_N_EN; From d9cef55ed49117bd63695446fb84b4b91815c0b4 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 18 Aug 2025 13:46:18 +0800 Subject: [PATCH 197/246] net/smc: fix UAF on smcsk after smc_listen_out() BPF CI testing report a UAF issue: [ 16.446633] BUG: kernel NULL pointer dereference, address: 000000000000003 0 [ 16.447134] #PF: supervisor read access in kernel mod e [ 16.447516] #PF: error_code(0x0000) - not-present pag e [ 16.447878] PGD 0 P4D 0 [ 16.448063] Oops: Oops: 0000 [#1] PREEMPT SMP NOPT I [ 16.448409] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:1 Tainted: G OE 6.13.0-rc3-g89e8a75fda73-dirty #4 2 [ 16.449124] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODUL E [ 16.449502] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/201 4 [ 16.450201] Workqueue: smc_hs_wq smc_listen_wor k [ 16.450531] RIP: 0010:smc_listen_work+0xc02/0x159 0 [ 16.452158] RSP: 0018:ffffb5ab40053d98 EFLAGS: 0001024 6 [ 16.452526] RAX: 0000000000000001 RBX: 0000000000000002 RCX: 000000000000030 0 [ 16.452994] RDX: 0000000000000280 RSI: 00003513840053f0 RDI: 000000000000000 0 [ 16.453492] RBP: ffffa097808e3800 R08: ffffa09782dba1e0 R09: 000000000000000 5 [ 16.453987] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa0978274640 0 [ 16.454497] R13: 0000000000000000 R14: 0000000000000000 R15: ffffa09782d4092 0 [ 16.454996] FS: 0000000000000000(0000) GS:ffffa097bbc00000(0000) knlGS:000000000000000 0 [ 16.455557] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003 3 [ 16.455961] CR2: 0000000000000030 CR3: 0000000102788004 CR4: 0000000000770ef 0 [ 16.456459] PKRU: 5555555 4 [ 16.456654] Call Trace : [ 16.456832] [ 16.456989] ? __die+0x23/0x7 0 [ 16.457215] ? page_fault_oops+0x180/0x4c 0 [ 16.457508] ? __lock_acquire+0x3e6/0x249 0 [ 16.457801] ? exc_page_fault+0x68/0x20 0 [ 16.458080] ? asm_exc_page_fault+0x26/0x3 0 [ 16.458389] ? smc_listen_work+0xc02/0x159 0 [ 16.458689] ? smc_listen_work+0xc02/0x159 0 [ 16.458987] ? lock_is_held_type+0x8f/0x10 0 [ 16.459284] process_one_work+0x1ea/0x6d 0 [ 16.459570] worker_thread+0x1c3/0x38 0 [ 16.459839] ? __pfx_worker_thread+0x10/0x1 0 [ 16.460144] kthread+0xe0/0x11 0 [ 16.460372] ? __pfx_kthread+0x10/0x1 0 [ 16.460640] ret_from_fork+0x31/0x5 0 [ 16.460896] ? __pfx_kthread+0x10/0x1 0 [ 16.461166] ret_from_fork_asm+0x1a/0x3 0 [ 16.461453] [ 16.461616] Modules linked in: bpf_testmod(OE) [last unloaded: bpf_testmod(OE) ] [ 16.462134] CR2: 000000000000003 0 [ 16.462380] ---[ end trace 0000000000000000 ]--- [ 16.462710] RIP: 0010:smc_listen_work+0xc02/0x1590 The direct cause of this issue is that after smc_listen_out_connected(), newclcsock->sk may be NULL since it will releases the smcsk. Therefore, if the application closes the socket immediately after accept, newclcsock->sk can be NULL. A possible execution order could be as follows: smc_listen_work | userspace ----------------------------------------------------------------- lock_sock(sk) | smc_listen_out_connected() | | \- smc_listen_out | | | \- release_sock | | |- sk->sk_data_ready() | | fd = accept(); | close(fd); | \- socket->sk = NULL; /* newclcsock->sk is NULL now */ SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk)) Since smc_listen_out_connected() will not fail, simply swapping the order of the code can easily fix this issue. Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: D. Wythe Reviewed-by: Guangguan Wang Reviewed-by: Alexandra Winter Reviewed-by: Dust Li Link: https://patch.msgid.link/20250818054618.41615-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9311c38f7abe..e0e48f24cd61 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2568,8 +2568,9 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } - smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + /* smc_listen_out() will release smcsk */ + smc_listen_out_connected(new_smc); goto out_free; out_unlock: From 2462c1b9217246a889ec318b3894d84e4dd709c6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:17 +0300 Subject: [PATCH 198/246] net/mlx5: HWS, fix bad parameter in CQ creation 'cqe_sz' valid value should be 0 for 64-byte CQE. Fixes: 2ca62599aa0b ("net/mlx5: HWS, added send engine and context handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index c4b22be19a9b..b0595c9b09e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -964,7 +964,6 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, return -ENOMEM; MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); - MLX5_SET(cqc, cqc_data, cqe_sz, queue->num_entries); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); From 615b690612b7785ab8632f6a5a941550622e4e36 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:18 +0300 Subject: [PATCH 199/246] net/mlx5: HWS, fix simple rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. This patch fixes the error flow for moving simple rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 61 +++++++++++++------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 92de4b761a83..0219a49b2326 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -74,9 +74,9 @@ static void hws_bwc_matcher_init_attr(struct mlx5hws_bwc_matcher *bwc_matcher, static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) { - bool move_error = false, poll_error = false, drain_error = false; struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; + int drain_error = 0, move_error = 0, poll_error = 0; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_rule_attr rule_attr; struct mlx5hws_bwc_rule *bwc_rule; @@ -99,11 +99,15 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, bwc_rule->rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } pending_rules++; @@ -111,11 +115,19 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, false); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } } @@ -126,17 +138,30 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, true); - if (unlikely(ret && !drain_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: drain failed (%d), attempting to move rest of the rules\n", - ret); - drain_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving bwc rule: timeout draining completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!drain_error) { + mlx5hws_err(ctx, + "Moving bwc rule: drain failed (%d), attempting to move rest of the rules\n", + ret); + drain_error = ret; + } } } } - if (move_error || poll_error || drain_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; + if (unlikely(drain_error)) + return drain_error; return ret; } From 4a842b1bf18a32ee0c25dd6dd98728b786a76fe4 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:19 +0300 Subject: [PATCH 200/246] net/mlx5: HWS, fix complex rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. Similar to the simple rules, complex rules rehash logic suffers from the same problems. This patch fixes the error flow for moving complex rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: 17e0accac577 ("net/mlx5: HWS, support complex matchers") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mlx5/core/steering/hws/bwc_complex.c | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c index ca7501c57468..14e79579c719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c @@ -1328,11 +1328,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) { struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; - bool move_error = false, poll_error = false; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_bwc_rule *tmp_bwc_rule; struct mlx5hws_rule_attr rule_attr; struct mlx5hws_table *isolated_tbl; + int move_error = 0, poll_error = 0; struct mlx5hws_rule *tmp_rule; struct list_head *rules_list; u32 expected_completions = 1; @@ -1391,11 +1391,15 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, tmp_rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } expected_completions = 1; @@ -1403,11 +1407,19 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &expected_completions, true); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving complex BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } /* Done moving the rule to the new matcher, @@ -1422,8 +1434,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) } } - if (move_error || poll_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; return ret; } From 1a72298d27ce4d41b3fd405f6921e8711815767a Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:20 +0300 Subject: [PATCH 201/246] net/mlx5: HWS, prevent rehash from filling up the queues While moving the rules during rehash, CQ is not drained. The flush and drain happens only when all the rules of a certain queue have been moved. This behaviour can lead to accumulating large quantity of rules that haven't got their completion yet, and eventually will fill up the queue and will cause the rehash to fail. Fix this problem by requiring drain once the number of outstanding completions reaches a certain threshold. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 0219a49b2326..2a59be11fe55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -84,6 +84,7 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) struct list_head *rules_list; u32 pending_rules; int i, ret = 0; + bool drain; mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); @@ -111,10 +112,12 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) } pending_rules++; + drain = pending_rules >= + hws_bwc_get_burst_th(ctx, rule_attr.queue_id); ret = mlx5hws_bwc_queue_poll(ctx, rule_attr.queue_id, &pending_rules, - false); + drain); if (unlikely(ret)) { if (ret == -ETIMEDOUT) { mlx5hws_err(ctx, From 7c60952f83584bc4950057cfed2cc3c87343b5db Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:21 +0300 Subject: [PATCH 202/246] net/mlx5: HWS, don't rehash on every kind of insertion failure If rule creation failed due to a full queue, due to timeout in polling for completion, or due to matcher being in resize, don't try to initiate rehash sequence - rehash would have failed anyway. Fixes: 2111bb970c78 ("net/mlx5: HWS, added backward-compatible API handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 2a59be11fe55..adeccc588e5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -1063,6 +1063,21 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, return 0; /* rule inserted successfully */ } + /* Rule insertion could fail due to queue being full, timeout, or + * matcher in resize. In such cases, no point in trying to rehash. + */ + if (ret == -EBUSY || ret == -ETIMEDOUT || ret == -EAGAIN) { + mutex_unlock(queue_lock); + mlx5hws_err(ctx, + "BWC rule insertion failed - %s (%d)\n", + ret == -EBUSY ? "queue is full" : + ret == -ETIMEDOUT ? "timeout" : + ret == -EAGAIN ? "matcher in resize" : "N/A", + ret); + hws_bwc_rule_cnt_dec(bwc_rule); + return ret; + } + /* At this point the rule wasn't added. * It could be because there was collision, or some other problem. * Try rehash by size and insert rule again - last chance. From 8a51507320ebddaab32610199774f69cd7d53e78 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 17 Aug 2025 23:23:22 +0300 Subject: [PATCH 203/246] net/mlx5: HWS, Fix table creation UID During table creation, caller passes a UID using ft_attr. The UID value was ignored, which leads to problems when the caller sets the UID to a non-zero value, such as SHARED_RESOURCE_UID (0xffff) - the internal FT objects will be created with UID=0. Fixes: 0869701cba3d ("net/mlx5: HWS, added FW commands handling") Signed-off-by: Alex Vesker Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/cmd.c | 1 + .../ethernet/mellanox/mlx5/core/steering/hws/cmd.h | 1 + .../mellanox/mlx5/core/steering/hws/fs_hws.c | 1 + .../mellanox/mlx5/core/steering/hws/matcher.c | 5 ++++- .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 1 + .../mellanox/mlx5/core/steering/hws/table.c | 13 ++++++++++--- .../mellanox/mlx5/core/steering/hws/table.h | 3 ++- 7 files changed, 20 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9c83753e4592..0bdcab2e5cf3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -55,6 +55,7 @@ int mlx5hws_cmd_flow_table_create(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); MLX5_SET(create_flow_table_in, in, table_type, ft_attr->type); + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); ft_ctx = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); MLX5_SET(flow_table_context, ft_ctx, level, ft_attr->level); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index fa6bff210266..122ccc671628 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -36,6 +36,7 @@ struct mlx5hws_cmd_set_fte_attr { struct mlx5hws_cmd_ft_create_attr { u8 type; u8 level; + u16 uid; bool rtc_valid; bool decap_en; bool reformat_en; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 57592b92e24b..131e74b2b774 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -267,6 +267,7 @@ static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns, tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB; tbl_attr.level = ft_attr->level; + tbl_attr.uid = ft_attr->uid; tbl = mlx5hws_table_create(ctx, &tbl_attr); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating hws flow_table\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index f3ea09caba2b..32f87fdf3213 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -85,6 +85,7 @@ static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher) ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + 0, &matcher->end_ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n"); @@ -112,7 +113,9 @@ static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher) if (mlx5hws_matcher_is_isolated(matcher)) ret = hws_matcher_create_end_ft_isolated(matcher); else - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + 0, &matcher->end_ft_id); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 59c14745ed0c..2498ceff2060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -75,6 +75,7 @@ struct mlx5hws_context_attr { struct mlx5hws_table_attr { enum mlx5hws_table_type type; u32 level; + u16 uid; }; enum mlx5hws_matcher_flow_src { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 568f691733f3..6113383ae47b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -9,6 +9,7 @@ u32 mlx5hws_table_get_id(struct mlx5hws_table *tbl) } static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, + u16 uid, struct mlx5hws_cmd_ft_create_attr *ft_attr) { ft_attr->type = tbl->fw_ft_type; @@ -16,7 +17,9 @@ static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, ft_attr->level = tbl->ctx->caps->fdb_ft.max_level - 1; else ft_attr->level = tbl->ctx->caps->nic_ft.max_level - 1; + ft_attr->rtc_valid = true; + ft_attr->uid = uid; } static void hws_table_set_cap_attr(struct mlx5hws_table *tbl, @@ -119,12 +122,12 @@ static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id) + u16 uid, u32 *ft_id) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; int ret; - hws_table_init_next_ft_attr(tbl, &ft_attr); + hws_table_init_next_ft_attr(tbl, uid, &ft_attr); hws_table_set_cap_attr(tbl, &ft_attr); ret = mlx5hws_cmd_flow_table_create(mdev, &ft_attr, ft_id); @@ -189,7 +192,10 @@ static int hws_table_init(struct mlx5hws_table *tbl) } mutex_lock(&ctx->ctrl_lock); - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &tbl->ft_id); + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + tbl->uid, + &tbl->ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Failed to create flow table object\n"); mutex_unlock(&ctx->ctrl_lock); @@ -239,6 +245,7 @@ struct mlx5hws_table *mlx5hws_table_create(struct mlx5hws_context *ctx, tbl->ctx = ctx; tbl->type = attr->type; tbl->level = attr->level; + tbl->uid = attr->uid; ret = hws_table_init(tbl); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h index 0400cce0c317..1246f9bd8422 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h @@ -18,6 +18,7 @@ struct mlx5hws_table { enum mlx5hws_table_type type; u32 fw_ft_type; u32 level; + u16 uid; struct list_head matchers_list; struct list_head tbl_list_node; struct mlx5hws_default_miss default_miss; @@ -47,7 +48,7 @@ u32 mlx5hws_table_get_res_fw_ft_type(enum mlx5hws_table_type tbl_type, int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id); + u16 uid, u32 *ft_id); void mlx5hws_table_destroy_default_ft(struct mlx5hws_table *tbl, u32 ft_id); From d2d6f950cb43be6845a41cac5956cb2a10e657e5 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Sun, 17 Aug 2025 23:23:23 +0300 Subject: [PATCH 204/246] net/mlx5: CT: Use the correct counter offset Specifying the counter action is not enough, as it is used by multiple counters that were allocated in a bulk. By omitting the offset, rules will be associated with a different counter from the same bulk. Subsequently, the CT subsystem checks the correct counter, assumes that no traffic has triggered the rule, and ages out the rule. The end result is intermittent offloading of long lived connections, as rules are aged out then promptly re-added. Fix this by specifying the correct offset along with the counter rule. Fixes: 34eea5b12a10 ("net/mlx5e: CT: Add initial support for Hardware Steering") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c index a4263137fef5..01d522b02947 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c @@ -173,6 +173,8 @@ static void mlx5_ct_fs_hmfs_fill_rule_actions(struct mlx5_ct_fs_hmfs *fs_hmfs, memset(rule_actions, 0, NUM_CT_HMFS_RULES * sizeof(*rule_actions)); rule_actions[0].action = mlx5_fc_get_hws_action(fs_hmfs->ctx, attr->counter); + rule_actions[0].counter.offset = + attr->counter->id - attr->counter->bulk->base_id; /* Modify header is special, it may require extra arguments outside the action itself. */ if (mh_action->mh_data) { rule_actions[1].modify_header.offset = mh_action->mh_data->offset; From 1683fd1b2fa79864d3c7a951d9cea0a9ba1a1923 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:13 +0530 Subject: [PATCH 205/246] microchip: lan865x: fix missing netif_start_queue() call on device open This fixes an issue where the transmit queue is started implicitly only the very first time the device is registered. When the device is taken down and brought back up again (using `ip` or `ifconfig`), the transmit queue is not restarted, causing packet transmission to hang. Adding an explicit call to netif_start_queue() in lan865x_net_open() ensures the transmit queue is properly started every time the device is reopened. Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20250818060514.52795-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan865x/lan865x.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index dd436bdff0f8..d03f5a8de58d 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -311,6 +311,8 @@ static int lan865x_net_open(struct net_device *netdev) phy_start(netdev->phydev); + netif_start_queue(netdev); + return 0; } From 2cd58fec912acec273cb155911ab8f06ddbb131a Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:14 +0530 Subject: [PATCH 206/246] microchip: lan865x: fix missing Timer Increment config for Rev.B0/B1 Fix missing configuration for LAN865x silicon revisions B0 and B1 as per Microchip Application Note AN1760 (Rev F, June 2024). The Timer Increment register was not being set, which is required for accurate timestamping. As per the application note, configure the MAC to set timestamping at the end of the Start of Frame Delimiter (SFD), and set the Timer Increment register to 40 ns (corresponding to a 25 MHz internal clock). Link: https://www.microchip.com/en-us/application-notes/an1760 Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250818060514.52795-3-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/microchip/lan865x/lan865x.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index d03f5a8de58d..84c41f193561 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -32,6 +32,10 @@ /* MAC Specific Addr 1 Top Reg */ #define LAN865X_REG_MAC_H_SADDR1 0x00010023 +/* MAC TSU Timer Increment Register */ +#define LAN865X_REG_MAC_TSU_TIMER_INCR 0x00010077 +#define MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS 0x0028 + struct lan865x_priv { struct work_struct multicast_work; struct net_device *netdev; @@ -346,6 +350,21 @@ static int lan865x_probe(struct spi_device *spi) goto free_netdev; } + /* LAN865x Rev.B0/B1 configuration parameters from AN1760 + * As per the Configuration Application Note AN1760 published in the + * link, https://www.microchip.com/en-us/application-notes/an1760 + * Revision F (DS60001760G - June 2024), configure the MAC to set time + * stamping at the end of the Start of Frame Delimiter (SFD) and set the + * Timer Increment reg to 40 ns to be used as a 25 MHz internal clock. + */ + ret = oa_tc6_write_register(priv->tc6, LAN865X_REG_MAC_TSU_TIMER_INCR, + MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS); + if (ret) { + dev_err(&spi->dev, "Failed to config TSU Timer Incr reg: %d\n", + ret); + goto oa_tc6_exit; + } + /* As per the point s3 in the below errata, SPI receive Ethernet frame * transfer may halt when starting the next frame in the same data block * (chunk) as the end of a previous frame. The RFA field should be From a47bc954cf0eb51f2828e1607d169d487df7f11f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 20 Aug 2025 22:23:15 +0800 Subject: [PATCH 207/246] objtool/LoongArch: Get table size correctly if LTO is enabled When compiling with LLVM and CONFIG_LTO_CLANG is set, there exist many objtool warnings "sibling call from callable instruction with modified stack frame". For this special case, the related object file shows that there is no generated relocation section '.rela.discard.tablejump_annotate' for the table jump instruction jirl, thus objtool can not know that what is the actual destination address. It needs to do something on the LLVM side to make sure that there is the relocation section '.rela.discard.tablejump_annotate' if LTO is enabled, but in order to maintain compatibility for the current LLVM compiler, this can be done in the kernel Makefile for now. Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' needs to be passed via '-mllvm' to ld.lld. Before doing the above changes, it should handle the special case of the relocation section '.rela.discard.tablejump_annotate' to get the correct table size first, otherwise there are many objtool warnings and errors if LTO is enabled. There are many different rodata for each function if LTO is enabled, it is necessary to enhance get_rodata_table_size_by_table_annotate(). Fixes: b95f852d3af2 ("objtool/LoongArch: Add support for switch table") Closes: https://lore.kernel.org/loongarch/20250731175655.GA1455142@ax162/ Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/arch/loongarch/special.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index e39f86d97002..a80b75f7b061 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -27,6 +27,7 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, struct table_info *next_table; unsigned long tmp_insn_offset; unsigned long tmp_rodata_offset; + bool is_valid_list = false; rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); if (!rsec) @@ -35,6 +36,12 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, INIT_LIST_HEAD(&table_list); for_each_reloc(rsec, reloc) { + if (reloc->sym->sec->rodata) + continue; + + if (strcmp(insn->sec->name, reloc->sym->sec->name)) + continue; + orig_table = malloc(sizeof(struct table_info)); if (!orig_table) { WARN("malloc failed"); @@ -49,6 +56,22 @@ static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, if (reloc_idx(reloc) + 1 == sec_num_entries(rsec)) break; + + if (strcmp(insn->sec->name, (reloc + 1)->sym->sec->name)) { + list_for_each_entry(orig_table, &table_list, jump_info) { + if (orig_table->insn_offset == insn->offset) { + is_valid_list = true; + break; + } + } + + if (!is_valid_list) { + list_del_init(&table_list); + continue; + } + + break; + } } list_for_each_entry(orig_table, &table_list, jump_info) { From 5dfea6644d201bfeffaa7e0d79d62309856613b7 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 20 Aug 2025 22:23:15 +0800 Subject: [PATCH 208/246] LoongArch: Pass annotate-tablejump option if LTO is enabled When compiling with LLVM and CONFIG_LTO_CLANG is set, there exist many objtool warnings "sibling call from callable instruction with modified stack frame". For this special case, the related object file shows that there is no generated relocation section '.rela.discard.tablejump_annotate' for the table jump instruction jirl, thus objtool can not know that what is the actual destination address. It needs to do something on the LLVM side to make sure that there is the relocation section '.rela.discard.tablejump_annotate' if LTO is enabled, but in order to maintain compatibility for the current LLVM compiler, this can be done in the kernel Makefile for now. Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' needs to be passed via '-mllvm' to ld.lld. Note that it should also pass the compiler option -mannotate-tablejump rather than only pass '-mllvm --loongarch-annotate-tablejump' to ld.lld if LTO is enabled, otherwise there are no jump info for some table jump instructions. Fixes: e20ab7d454ee ("LoongArch: Enable jump table for objtool") Closes: https://lore.kernel.org/loongarch/20250731175655.GA1455142@ax162/ Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Co-developed-by: WANG Rui Signed-off-by: WANG Rui Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index b0703a4e02a2..a3a9759414f4 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -102,7 +102,13 @@ KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma) ifdef CONFIG_OBJTOOL ifdef CONFIG_CC_HAS_ANNOTATE_TABLEJUMP +# The annotate-tablejump option can not be passed to LLVM backend when LTO is enabled. +# Ensure it is aware of linker with LTO, '--loongarch-annotate-tablejump' also needs to +# be passed via '-mllvm' to ld.lld. KBUILD_CFLAGS += -mannotate-tablejump +ifdef CONFIG_LTO_CLANG +KBUILD_LDFLAGS += -mllvm --loongarch-annotate-tablejump +endif else KBUILD_CFLAGS += -fno-jump-tables # keep compatibility with older compilers endif From f7794a4d92ade518c813de69a01b27ca6d8d86f3 Mon Sep 17 00:00:00 2001 From: Ming Wang Date: Wed, 20 Aug 2025 22:23:16 +0800 Subject: [PATCH 209/246] LoongArch: Increase COMMAND_LINE_SIZE up to 4096 The default COMMAND_LINE_SIZE of 512, inherited from asm-generic, is too small for modern use cases. For example, kdump configurations or extensive debugging parameters can easily exceed this limit. Therefore, increase the command line size to 4096 bytes, aligning LoongArch with the MIPS architecture. This change follows a broader trend among architectures to raise this limit to support modern needs; for instance, PowerPC increased its value for similar reasons in the commit a5980d064fe2 ("powerpc: Bump COMMAND_LINE_SIZE to 2048"). Similar to the change made for RISC-V in the commit 61fc1ee8be26 ("riscv: Bump COMMAND_LINE_SIZE value to 1024"), this is considered a safe change. The broader kernel community has reached a consensus that modifying COMMAND_LINE_SIZE from UAPI headers does not constitute a uABI breakage, as well-behaved userspace applications should not rely on this macro. Suggested-by: Huang Cun Signed-off-by: Ming Wang Signed-off-by: Huacai Chen --- arch/loongarch/include/uapi/asm/setup.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 arch/loongarch/include/uapi/asm/setup.h diff --git a/arch/loongarch/include/uapi/asm/setup.h b/arch/loongarch/include/uapi/asm/setup.h new file mode 100644 index 000000000000..d46363ce3e02 --- /dev/null +++ b/arch/loongarch/include/uapi/asm/setup.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_ASM_LOONGARCH_SETUP_H +#define _UAPI_ASM_LOONGARCH_SETUP_H + +#define COMMAND_LINE_SIZE 4096 + +#endif /* _UAPI_ASM_LOONGARCH_SETUP_H */ From 8ef7f3132e4005a103b382e71abea7ad01fbeb86 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 20 Aug 2025 22:23:44 +0800 Subject: [PATCH 210/246] LoongArch: Add cpuhotplug hooks to fix high cpu usage of vCPU threads When the CPU is offline, the timer of LoongArch is not correctly closed. This is harmless for real machines, but resulting in an excessively high cpu usage rate of the offline vCPU thread in the virtual machines. To correctly close the timer, we have made the following modifications: Register the cpu hotplug event (CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING) for LoongArch. This event's hooks will be called to close the timer when the CPU is offline. Clear the timer interrupt when the timer is turned off. Since before the timer is turned off, there may be a timer interrupt that has already been in the pending state due to the interruption of the disabled, which also affects the halt state of the offline vCPU. Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/time.c | 22 ++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 2 files changed, 23 insertions(+) diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c index 367906b10f81..f3092f2de8b5 100644 --- a/arch/loongarch/kernel/time.c +++ b/arch/loongarch/kernel/time.c @@ -5,6 +5,7 @@ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ #include +#include #include #include #include @@ -102,6 +103,23 @@ static int constant_timer_next_event(unsigned long delta, struct clock_event_dev return 0; } +static int arch_timer_starting(unsigned int cpu) +{ + set_csr_ecfg(ECFGF_TIMER); + + return 0; +} + +static int arch_timer_dying(unsigned int cpu) +{ + constant_set_state_shutdown(this_cpu_ptr(&constant_clockevent_device)); + + /* Clear Timer Interrupt */ + write_csr_tintclear(CSR_TINTCLR_TI); + + return 0; +} + static unsigned long get_loops_per_jiffy(void) { unsigned long lpj = (unsigned long)const_clock_freq; @@ -172,6 +190,10 @@ int constant_clockevent_init(void) lpj_fine = get_loops_per_jiffy(); pr_info("Constant clock event device register\n"); + cpuhp_setup_state(CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING, + "clockevents/loongarch/timer:starting", + arch_timer_starting, arch_timer_dying); + return 0; } diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index edfa61d80702..62cd7b35a29c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -168,6 +168,7 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, + CPUHP_AP_LOONGARCH_ARCH_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_REALTEK_TIMER_STARTING, From 63dbd8fb2af3a89466538599a9acb2d11ef65c06 Mon Sep 17 00:00:00 2001 From: Kanglong Wang Date: Wed, 20 Aug 2025 22:23:44 +0800 Subject: [PATCH 211/246] LoongArch: Optimize module load time by optimizing PLT/GOT counting When enabling CONFIG_KASAN, CONFIG_PREEMPT_VOLUNTARY_BUILD and CONFIG_PREEMPT_VOLUNTARY at the same time, there will be soft deadlock, the relevant logs are as follows: rcu: INFO: rcu_sched self-detected stall on CPU ... Call Trace: [<900000000024f9e4>] show_stack+0x5c/0x180 [<90000000002482f4>] dump_stack_lvl+0x94/0xbc [<9000000000224544>] rcu_dump_cpu_stacks+0x1fc/0x280 [<900000000037ac80>] rcu_sched_clock_irq+0x720/0xf88 [<9000000000396c34>] update_process_times+0xb4/0x150 [<90000000003b2474>] tick_nohz_handler+0xf4/0x250 [<9000000000397e28>] __hrtimer_run_queues+0x1d0/0x428 [<9000000000399b2c>] hrtimer_interrupt+0x214/0x538 [<9000000000253634>] constant_timer_interrupt+0x64/0x80 [<9000000000349938>] __handle_irq_event_percpu+0x78/0x1a0 [<9000000000349a78>] handle_irq_event_percpu+0x18/0x88 [<9000000000354c00>] handle_percpu_irq+0x90/0xf0 [<9000000000348c74>] handle_irq_desc+0x94/0xb8 [<9000000001012b28>] handle_cpu_irq+0x68/0xa0 [<9000000001def8c0>] handle_loongarch_irq+0x30/0x48 [<9000000001def958>] do_vint+0x80/0xd0 [<9000000000268a0c>] kasan_mem_to_shadow.part.0+0x2c/0x2a0 [<90000000006344f4>] __asan_load8+0x4c/0x120 [<900000000025c0d0>] module_frob_arch_sections+0x5c8/0x6b8 [<90000000003895f0>] load_module+0x9e0/0x2958 [<900000000038b770>] __do_sys_init_module+0x208/0x2d0 [<9000000001df0c34>] do_syscall+0x94/0x190 [<900000000024d6fc>] handle_syscall+0xbc/0x158 After analysis, this is because the slow speed of loading the amdgpu module leads to the long time occupation of the cpu and then the soft deadlock. When loading a module, module_frob_arch_sections() tries to figure out the number of PLTs/GOTs that will be needed to handle all the RELAs. It will call the count_max_entries() to find in an out-of-order date which counting algorithm has O(n^2) complexity. To make it faster, we sort the relocation list by info and addend. That way, to check for a duplicate relocation, it just needs to compare with the previous entry. This reduces the complexity of the algorithm to O(n log n), as done in commit d4e0340919fb ("arm64/module: Optimize module load time by optimizing PLT counting"). This gives sinificant reduction in module load time for modules with large number of relocations. After applying this patch, the soft deadlock problem has been solved, and the kernel starts normally without "Call Trace". Using the default configuration to test some modules, the results are as follows: Module Size ip_tables 36K fat 143K radeon 2.5MB amdgpu 16MB Without this patch: Module Module load time (ms) Count(PLTs/GOTs) ip_tables 18 59/6 fat 0 162/14 radeon 54 1221/84 amdgpu 1411 4525/1098 With this patch: Module Module load time (ms) Count(PLTs/GOTs) ip_tables 18 59/6 fat 0 162/14 radeon 22 1221/84 amdgpu 45 4525/1098 Fixes: fcdfe9d22bed ("LoongArch: Add ELF and module support") Signed-off-by: Kanglong Wang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/module-sections.c | 38 ++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/loongarch/kernel/module-sections.c b/arch/loongarch/kernel/module-sections.c index e2f30ff9afde..a43ba7f9f987 100644 --- a/arch/loongarch/kernel/module-sections.c +++ b/arch/loongarch/kernel/module-sections.c @@ -8,6 +8,7 @@ #include #include #include +#include Elf_Addr module_emit_got_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr val) { @@ -61,39 +62,38 @@ Elf_Addr module_emit_plt_entry(struct module *mod, Elf_Shdr *sechdrs, Elf_Addr v return (Elf_Addr)&plt[nr]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) + +static int compare_rela(const void *x, const void *y) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; -} + int ret; + const Elf_Rela *rela_x = x, *rela_y = y; -static bool duplicate_rela(const Elf_Rela *rela, int idx) -{ - int i; + ret = cmp_3way(rela_x->r_info, rela_y->r_info); + if (ret == 0) + ret = cmp_3way(rela_x->r_addend, rela_y->r_addend); - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - - return false; + return ret; } static void count_max_entries(Elf_Rela *relas, int num, unsigned int *plts, unsigned int *gots) { - unsigned int i, type; + unsigned int i; + + sort(relas, num, sizeof(Elf_Rela), compare_rela, NULL); for (i = 0; i < num; i++) { - type = ELF_R_TYPE(relas[i].r_info); - switch (type) { + if (i && !compare_rela(&relas[i-1], &relas[i])) + continue; + + switch (ELF_R_TYPE(relas[i].r_info)) { case R_LARCH_SOP_PUSH_PLT_PCREL: case R_LARCH_B26: - if (!duplicate_rela(relas, i)) - (*plts)++; + (*plts)++; break; case R_LARCH_GOT_PC_HI20: - if (!duplicate_rela(relas, i)) - (*gots)++; + (*gots)++; break; default: break; /* Do nothing. */ From 112ca94f6c3b3e0b2002a240de43c487a33e0234 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 20 Aug 2025 22:23:44 +0800 Subject: [PATCH 212/246] LoongArch: Save LBT before FPU in setup_sigcontext() Now if preemption happens between protected_save_fpu_context() and protected_save_lbt_context(), FTOP context is lost. Because FTOP is saved by protected_save_lbt_context() but protected_save_fpu_context() disables TM before that. So save LBT before FPU in setup_sigcontext() to avoid this potential risk. Signed-off-by: Hanlu Li Signed-off-by: Huacai Chen --- arch/loongarch/kernel/signal.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c index 4740cb5b2388..c9f7ca778364 100644 --- a/arch/loongarch/kernel/signal.c +++ b/arch/loongarch/kernel/signal.c @@ -677,6 +677,11 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, for (i = 1; i < 32; i++) err |= __put_user(regs->regs[i], &sc->sc_regs[i]); +#ifdef CONFIG_CPU_HAS_LBT + if (extctx->lbt.addr) + err |= protected_save_lbt_context(extctx); +#endif + if (extctx->lasx.addr) err |= protected_save_lasx_context(extctx); else if (extctx->lsx.addr) @@ -684,11 +689,6 @@ static int setup_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, else if (extctx->fpu.addr) err |= protected_save_fpu_context(extctx); -#ifdef CONFIG_CPU_HAS_LBT - if (extctx->lbt.addr) - err |= protected_save_lbt_context(extctx); -#endif - /* Set the "end" magic */ info = (struct sctx_info *)extctx->end.addr; err |= __put_user(0, &info->magic); From 0078e94a4733454d1ffa3888afe88bf19c81b91c Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 20 Aug 2025 22:23:44 +0800 Subject: [PATCH 213/246] LoongArch: Rename GCC_PLUGIN_STACKLEAK to KSTACK_ERASE Commit 57fbad15c2eee772 ("stackleak: Rename STACKLEAK to KSTACK_ERASE") misses the stackframe.h part for LoongArch, so fix it. Fixes: 57fbad15c2eee772 ("stackleak: Rename STACKLEAK to KSTACK_ERASE") Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/stackframe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 3eda298702b1..5cb568a60cf8 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -58,7 +58,7 @@ .endm .macro STACKLEAK_ERASE -#ifdef CONFIG_GCC_PLUGIN_STACKLEAK +#ifdef CONFIG_KSTACK_ERASE bl stackleak_erase_on_task_stack #endif .endm From ec879e1a0be8007aa232ffedcf6a6445dfc1a3d7 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 16 Aug 2025 23:10:51 +0900 Subject: [PATCH 214/246] tracing: fprobe-event: Sanitize wildcard for fprobe event name Fprobe event accepts wildcards for the target functions, but unless user specifies its event name, it makes an event with the wildcards. /sys/kernel/tracing # echo 'f mutex*' >> dynamic_events /sys/kernel/tracing # cat dynamic_events f:fprobes/mutex*__entry mutex* /sys/kernel/tracing # ls events/fprobes/ enable filter mutex*__entry To fix this, replace the wildcard ('*') with an underscore. Link: https://lore.kernel.org/all/175535345114.282990.12294108192847938710.stgit@devnote2/ Fixes: 334e5519c375 ("tracing/probes: Add fprobe events for tracing function entry and exit.") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dbf1d3cf2f1..5a6688832da8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2204,7 +2204,7 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } From 4be8cefc132606b4a6e851f37f8e8c40c406c910 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 20 Aug 2025 22:51:14 +0800 Subject: [PATCH 215/246] LoongArch: KVM: Make function kvm_own_lbt() robust Add the flag KVM_LARCH_LBT checking in function kvm_own_lbt(), so that it can be called safely rather than duplicated enabling again. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index d1b8c50941ca..ce478151466c 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1283,9 +1283,11 @@ int kvm_own_lbt(struct kvm_vcpu *vcpu) return -EINVAL; preempt_disable(); - set_csr_euen(CSR_EUEN_LBTEN); - _restore_lbt(&vcpu->arch.lbt); - vcpu->arch.aux_inuse |= KVM_LARCH_LBT; + if (!(vcpu->arch.aux_inuse & KVM_LARCH_LBT)) { + set_csr_euen(CSR_EUEN_LBTEN); + _restore_lbt(&vcpu->arch.lbt); + vcpu->arch.aux_inuse |= KVM_LARCH_LBT; + } preempt_enable(); return 0; From 5c68549c81bcca70fc464e305ffeefd9af968287 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 20 Aug 2025 22:51:15 +0800 Subject: [PATCH 216/246] LoongArch: KVM: Fix stack protector issue in send_ipi_data() Function kvm_io_bus_read() is called in function send_ipi_data(), buffer size of parameter *val should be at least 8 bytes. Since some emulation functions like loongarch_ipi_readl() and kvm_eiointc_read() will write the buffer *val with 8 bytes signed extension regardless parameter len. Otherwise there will be buffer overflow issue when CONFIG_STACKPROTECTOR is enabled. The bug report is shown as follows: Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: send_ipi_data+0x194/0x1a0 [kvm] CPU: 11 UID: 107 PID: 2692 Comm: CPU 0/KVM Not tainted 6.17.0-rc1+ #102 PREEMPT(full) Stack : 9000000005901568 0000000000000000 9000000003af371c 900000013c68c000 900000013c68f850 900000013c68f858 0000000000000000 900000013c68f998 900000013c68f990 900000013c68f990 900000013c68f6c0 fffffffffffdb058 fffffffffffdb0e0 900000013c68f858 911e1d4d39cf0ec2 9000000105657a00 0000000000000001 fffffffffffffffe 0000000000000578 282049464555206e 6f73676e6f6f4c20 0000000000000001 00000000086b4000 0000000000000000 0000000000000000 0000000000000000 9000000005709968 90000000058f9000 900000013c68fa68 900000013c68fab4 90000000029279f0 900000010153f940 900000010001f360 0000000000000000 9000000003af3734 000000004390000c 00000000000000b0 0000000000000004 0000000000000000 0000000000071c1d ... Call Trace: [<9000000003af3734>] show_stack+0x5c/0x180 [<9000000003aed168>] dump_stack_lvl+0x6c/0x9c [<9000000003ad0ab0>] vpanic+0x108/0x2c4 [<9000000003ad0ca8>] panic+0x3c/0x40 [<9000000004eb0a1c>] __stack_chk_fail+0x14/0x18 [] send_ipi_data+0x190/0x1a0 [kvm] [] __kvm_io_bus_write+0xa4/0xe8 [kvm] [] kvm_io_bus_write+0x54/0x90 [kvm] [] kvm_emu_iocsr+0x180/0x310 [kvm] [] kvm_handle_gspr+0x280/0x478 [kvm] [] kvm_handle_exit+0xc0/0x130 [kvm] Cc: stable@vger.kernel.org Fixes: daee2f9cae551 ("LoongArch: KVM: Add IPI read and write function") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/ipi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index e658d5b37c04..7925651d2ccf 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -99,7 +99,7 @@ static void write_mailbox(struct kvm_vcpu *vcpu, int offset, uint64_t data, int static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) { int i, idx, ret; - uint32_t val = 0, mask = 0; + uint64_t val = 0, mask = 0; /* * Bit 27-30 is mask for byte writing. @@ -108,7 +108,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) if ((data >> 27) & 0xf) { /* Read the old val */ idx = srcu_read_lock(&vcpu->kvm->srcu); - ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); + ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, 4, &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) { kvm_err("%s: : read data from addr %llx failed\n", __func__, addr); @@ -124,7 +124,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) } val |= ((uint32_t)(data >> 32) & ~mask); idx = srcu_read_lock(&vcpu->kvm->srcu); - ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); + ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, 4, &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) kvm_err("%s: : write data to addr %llx failed\n", __func__, addr); From 0dfd9ea7bf80fabe11f5b775d762a5cd168cdf41 Mon Sep 17 00:00:00 2001 From: Song Gao Date: Wed, 20 Aug 2025 22:51:15 +0800 Subject: [PATCH 217/246] LoongArch: KVM: Use kvm_get_vcpu_by_id() instead of kvm_get_vcpu() Since using kvm_get_vcpu() may fail to retrieve the vCPU context, kvm_get_vcpu_by_id() should be used instead. Fixes: 8e3054261bc3 ("LoongArch: KVM: Add IPI user mode read and write function") Fixes: 3956a52bc05b ("LoongArch: KVM: Add EIOINTC read and write functions") Reviewed-by: Yanteng Si Signed-off-by: Song Gao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/eiointc.c | 7 ++++++- arch/loongarch/kvm/intc/ipi.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index a3a12af9ecbf..026b139dcff2 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -45,7 +45,12 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) } cpu = s->sw_coremap[irq]; - vcpu = kvm_get_vcpu(s->kvm, cpu); + vcpu = kvm_get_vcpu_by_id(s->kvm, cpu); + if (unlikely(vcpu == NULL)) { + kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); + return; + } + if (level) { /* if not enable return false */ if (!test_bit(irq, (unsigned long *)s->enable.reg_u32)) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index 7925651d2ccf..5a8481dda052 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -298,7 +298,7 @@ static int kvm_ipi_regs_access(struct kvm_device *dev, cpu = (attr->attr >> 16) & 0x3ff; addr = attr->attr & 0xff; - vcpu = kvm_get_vcpu(dev->kvm, cpu); + vcpu = kvm_get_vcpu_by_id(dev->kvm, cpu); if (unlikely(vcpu == NULL)) { kvm_err("%s: invalid target cpu: %d\n", __func__, cpu); return -EINVAL; From 538c06e3964a8e94b645686cc58ccc4a06fa6330 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 20 Aug 2025 22:51:15 +0800 Subject: [PATCH 218/246] LoongArch: KVM: Add address alignment check in pch_pic register access With pch_pic device, its register is based on MMIO address space, different access size 1/2/4/8 is supported. And base address should be naturally aligned with its access size, here add alignment check in its register access emulation function. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/pch_pic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/loongarch/kvm/intc/pch_pic.c b/arch/loongarch/kvm/intc/pch_pic.c index 6f00ffe05c54..119290bcea79 100644 --- a/arch/loongarch/kvm/intc/pch_pic.c +++ b/arch/loongarch/kvm/intc/pch_pic.c @@ -195,6 +195,11 @@ static int kvm_pch_pic_read(struct kvm_vcpu *vcpu, return -EINVAL; } + if (addr & (len - 1)) { + kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len); + return -EINVAL; + } + /* statistics of pch pic reading */ vcpu->stat.pch_pic_read_exits++; ret = loongarch_pch_pic_read(s, addr, len, val); @@ -302,6 +307,11 @@ static int kvm_pch_pic_write(struct kvm_vcpu *vcpu, return -EINVAL; } + if (addr & (len - 1)) { + kvm_err("%s: pch pic not aligned addr %llx len %d\n", __func__, addr, len); + return -EINVAL; + } + /* statistics of pch pic writing */ vcpu->stat.pch_pic_write_exits++; ret = loongarch_pch_pic_write(s, addr, len, val); From 729dc340a4ed1267774fc8518284e976e2210bdc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 17 Aug 2025 16:21:46 +0200 Subject: [PATCH 219/246] bootconfig: Fix negative seeks on 32-bit with LFS enabled Commit 26dda5769509 "tools/bootconfig: Cleanup bootconfig footer size calculations" replaced some expressions of type int with the BOOTCONFIG_FOOTER_SIZE macro, which expands to an expression of type size_t, which is unsigned. On 32-bit architectures with LFS enabled (i.e. off_t is 64-bit), the seek offset of -BOOTCONFIG_FOOTER_SIZE now turns into a positive value. Fix this by casting the size to off_t before negating it. Just in case someone changes BOOTCONFIG_MAGIC_LEN to have type size_t later, do the same thing to the seek offset of -BOOTCONFIG_MAGIC_LEN. Link: https://lore.kernel.org/all/aKHlevxeg6Y7UQrz@decadent.org.uk/ Fixes: 26dda5769509 ("tools/bootconfig: Cleanup bootconfig footer size calculations") Signed-off-by: Ben Hutchings Signed-off-by: Masami Hiramatsu (Google) --- tools/bootconfig/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 57c669d2aa90..55d59ed507d5 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -193,7 +193,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (stat.st_size < BOOTCONFIG_FOOTER_SIZE) return 0; - if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) return pr_errno("Failed to lseek for magic", -errno); if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0) @@ -203,7 +203,7 @@ static int load_xbc_from_initrd(int fd, char **buf) if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0) return 0; - if (lseek(fd, -BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) + if (lseek(fd, -(off_t)BOOTCONFIG_FOOTER_SIZE, SEEK_END) < 0) return pr_errno("Failed to lseek for size", -errno); if (read(fd, &size, sizeof(uint32_t)) < 0) From 4d4d9ef9dfee877d494e5418f68a1016ef08cad6 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 19 Aug 2025 15:19:57 -0700 Subject: [PATCH 220/246] ixgbe: xsk: resolve the negative overflow of budget in ixgbe_xmit_zc Resolve the budget negative overflow which leads to returning true in ixgbe_xmit_zc even when the budget of descs are thoroughly consumed. Before this patch, when the budget is decreased to zero and finishes sending the last allowed desc in ixgbe_xmit_zc, it will always turn back and enter into the while() statement to see if it should keep processing packets, but in the meantime it unexpectedly decreases the value again to 'unsigned int (0--)', namely, UINT_MAX. Finally, the ixgbe_xmit_zc returns true, showing 'we complete cleaning the budget'. That also means 'clean_complete = true' in ixgbe_poll. The true theory behind this is if that budget number of descs are consumed, it implies that we might have more descs to be done. So we should return false in ixgbe_xmit_zc to tell napi poll to find another chance to start polling to handle the rest of descs. On the contrary, returning true here means job done and we know we finish all the possible descs this time and we don't intend to start a new napi poll. It is apparently against our expectations. Please also see how ixgbe_clean_tx_irq() handles the problem: it uses do..while() statement to make sure the budget can be decreased to zero at most and the negative overflow never happens. The patch adds 'likely' because we rarely would not hit the loop condition since the standard budget is 256. Fixes: 8221c5eba8c1 ("ixgbe: add AF_XDP zero-copy Tx support") Signed-off-by: Jason Xing Reviewed-by: Larysa Zaremba Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Priya Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-4-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index ac58964b2f08..7b941505a9d0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -398,7 +398,7 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) dma_addr_t dma; u32 cmd_type; - while (budget-- > 0) { + while (likely(budget)) { if (unlikely(!ixgbe_desc_unused(xdp_ring))) { work_done = false; break; @@ -433,6 +433,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) xdp_ring->next_to_use++; if (xdp_ring->next_to_use == xdp_ring->count) xdp_ring->next_to_use = 0; + + budget--; } if (tx_desc) { From f3d9f7fa7f5dbfd4fdb1e69c25fc5627700d19dd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 19 Aug 2025 15:19:58 -0700 Subject: [PATCH 221/246] ixgbe: fix ndo_xdp_xmit() workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ixgbe driver checks periodically in its watchdog subtask if there is anything to be transmitted (considering both Tx and XDP rings) under state of carrier not being 'ok'. Such event is interpreted as Tx hang and therefore results in interface reset. This is currently problematic for ndo_xdp_xmit() as it is allowed to produce descriptors when interface is going through reset or its carrier is turned off. Furthermore, XDP rings should not really be objects of Tx hang detection. This mechanism is rather a matter of ndo_tx_timeout() being called from dev_watchdog against Tx rings exposed to networking stack. Taking into account issues described above, let us have a two fold fix - do not respect XDP rings in local ixgbe watchdog and do not produce Tx descriptors in ndo_xdp_xmit callback when there is some problem with carrier currently. For now, keep the Tx hang checks in clean Tx irq routine, but adjust it to not execute for XDP rings. Cc: Tobias Böhm Reported-by: Marcus Wichelmann Closes: https://lore.kernel.org/netdev/eca1880f-253a-4955-afe6-732d7c6926ee@hetzner-cloud.de/ Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect") Fixes: 33fdc82f0883 ("ixgbe: add support for XDP_TX action") Reviewed-by: Aleksandr Loktionov Tested-by: Marcus Wichelmann Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-5-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6122a0abb41f..80e6a2ef1350 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -968,10 +968,6 @@ static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter) for (i = 0; i < adapter->num_tx_queues; i++) clear_bit(__IXGBE_HANG_CHECK_ARMED, &adapter->tx_ring[i]->state); - - for (i = 0; i < adapter->num_xdp_queues; i++) - clear_bit(__IXGBE_HANG_CHECK_ARMED, - &adapter->xdp_ring[i]->state); } static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter) @@ -1214,7 +1210,7 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbe_hw *hw = &adapter->hw; - e_err(drv, "Detected Tx Unit Hang%s\n" + e_err(drv, "Detected Tx Unit Hang\n" " Tx Queue <%d>\n" " TDH, TDT <%x>, <%x>\n" " next_to_use <%x>\n" @@ -1222,16 +1218,14 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, "tx_buffer_info[next_to_clean]\n" " time_stamp <%lx>\n" " jiffies <%lx>\n", - ring_is_xdp(tx_ring) ? " (XDP)" : "", tx_ring->queue_index, IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)), IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)), tx_ring->next_to_use, next, tx_ring->tx_buffer_info[next].time_stamp, jiffies); - if (!ring_is_xdp(tx_ring)) - netif_stop_subqueue(tx_ring->netdev, - tx_ring->queue_index); + netif_stop_subqueue(tx_ring->netdev, + tx_ring->queue_index); } /** @@ -1451,6 +1445,9 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, total_bytes); adapter->tx_ipsec += total_ipsec; + if (ring_is_xdp(tx_ring)) + return !!budget; + if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) { if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_handle_mdd_event(adapter, tx_ring); @@ -1468,9 +1465,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, return true; } - if (ring_is_xdp(tx_ring)) - return !!budget; - #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index); if (!__netif_txq_completed_wake(txq, total_packets, total_bytes, @@ -7974,12 +7968,9 @@ static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter) return; /* Force detection of hung controller */ - if (netif_carrier_ok(adapter->netdev)) { + if (netif_carrier_ok(adapter->netdev)) for (i = 0; i < adapter->num_tx_queues; i++) set_check_for_tx_hang(adapter->tx_ring[i]); - for (i = 0; i < adapter->num_xdp_queues; i++) - set_check_for_tx_hang(adapter->xdp_ring[i]); - } if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) { /* @@ -8199,13 +8190,6 @@ static bool ixgbe_ring_tx_pending(struct ixgbe_adapter *adapter) return true; } - for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *ring = adapter->xdp_ring[i]; - - if (ring->next_to_use != ring->next_to_clean) - return true; - } - return false; } @@ -11005,6 +10989,10 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n, if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; + if (!netif_carrier_ok(adapter->netdev) || + !netif_running(adapter->netdev)) + return -ENETDOWN; + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; From 1468c1f97cf32418e34dbb40b784ed9333b9e123 Mon Sep 17 00:00:00 2001 From: ValdikSS Date: Tue, 19 Aug 2025 15:19:59 -0700 Subject: [PATCH 222/246] igc: fix disabling L1.2 PCI-E link substate on I226 on init Device ID comparison in igc_is_device_id_i226 is performed before the ID is set, resulting in always failing check on init. Before the patch: * L1.2 is not disabled on init * L1.2 is properly disabled after suspend-resume cycle With the patch: * L1.2 is properly disabled both on init and after suspend-resume How to test: Connect to the 1G link with 300+ mbit/s Internet speed, and run the download speed test, such as: curl -o /dev/null http://speedtest.selectel.ru/1GB Without L1.2 disabled, the speed would be no more than ~200 mbit/s. With L1.2 disabled, the speed would reach 1 gbit/s. Note: it's required that the latency between your host and the remote be around 3-5 ms, the test inside LAN (<1 ms latency) won't trigger the issue. Link: https://lore.kernel.org/intel-wired-lan/15248b4f-3271-42dd-8e35-02bfc92b25e1@intel.com Fixes: 0325143b59c6 ("igc: disable L1.2 PCI-E link substate to avoid performance issue") Signed-off-by: ValdikSS Reviewed-by: Vitaly Lifshits Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-6-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 458e5eaa92e5..e79b14d50b24 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7149,6 +7149,13 @@ static int igc_probe(struct pci_dev *pdev, adapter->port_num = hw->bus.func; adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + /* PCI config space info */ + hw->vendor_id = pdev->vendor; + hw->device_id = pdev->device; + hw->revision_id = pdev->revision; + hw->subsystem_vendor_id = pdev->subsystem_vendor; + hw->subsystem_device_id = pdev->subsystem_device; + /* Disable ASPM L1.2 on I226 devices to avoid packet loss */ if (igc_is_device_id_i226(hw)) pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2); @@ -7175,13 +7182,6 @@ static int igc_probe(struct pci_dev *pdev, netdev->mem_start = pci_resource_start(pdev, 0); netdev->mem_end = pci_resource_end(pdev, 0); - /* PCI config space info */ - hw->vendor_id = pdev->vendor; - hw->device_id = pdev->device; - hw->revision_id = pdev->revision; - hw->subsystem_vendor_id = pdev->subsystem_vendor; - hw->subsystem_device_id = pdev->subsystem_device; - /* Copy the default MAC and PHY function pointers */ memcpy(&hw->mac.ops, ei->mac_ops, sizeof(hw->mac.ops)); memcpy(&hw->phy.ops, ei->phy_ops, sizeof(hw->phy.ops)); From e318cd6714592fb762fcab59c5684a442243a12f Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 18 Aug 2025 18:04:57 -0700 Subject: [PATCH 223/246] net: dsa: microchip: Fix KSZ9477 HSR port setup issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ksz9477_hsr_join() is called once to setup the HSR port membership, but the port can be enabled later, or disabled and enabled back and the port membership is not set correctly inside ksz_update_port_member(). The added code always use the correct HSR port membership for HSR port that is enabled. Fixes: 2d61298fdd7b ("net: dsa: microchip: Enable HSR offloading for KSZ9477") Reported-by: Frieder Schrempf Signed-off-by: Tristram Ha Reviewed-by: Łukasz Majewski Tested-by: Frieder Schrempf Reviewed-by: Frieder Schrempf Link: https://patch.msgid.link/20250819010457.563286-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 4cb14288ff0f..9568cc391fe3 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2457,6 +2457,12 @@ static void ksz_update_port_member(struct ksz_device *dev, int port) dev->dev_ops->cfg_port_member(dev, i, val | cpu_port); } + /* HSR ports are setup once so need to use the assigned membership + * when the port is enabled. + */ + if (!port_member && p->stp_state == BR_STATE_FORWARDING && + (dev->hsr_ports & BIT(port))) + port_member = dev->hsr_ports; dev->dev_ops->cfg_port_member(dev, port, port_member | cpu_port); } From 15de71d06a400f7fdc15bf377a2552b0ec437cf5 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:28 +0000 Subject: [PATCH 224/246] net/sched: Make cake_enqueue return NET_XMIT_CN when past buffer_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following setup can trigger a WARNING in htb_activate due to the condition: !cl->leaf.q->q.qlen tc qdisc del dev lo root tc qdisc add dev lo root handle 1: htb default 1 tc class add dev lo parent 1: classid 1:1 \ htb rate 64bit tc qdisc add dev lo parent 1:1 handle f: \ cake memlimit 1b ping -I lo -f -c1 -s64 -W0.001 127.0.0.1 This is because the low memlimit leads to a low buffer_limit, which causes packet dropping. However, cake_enqueue still returns NET_XMIT_SUCCESS, causing htb_enqueue to call htb_activate with an empty child qdisc. We should return NET_XMIT_CN when packets are dropped from the same tin and flow. I do not believe return value of NET_XMIT_CN is necessary for packet drops in the case of ack filtering, as that is meant to optimize performance, not to signal congestion. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Acked-by: Toke Høiland-Jørgensen Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250819033601.579821-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index dbcfb948c867..32bacfc314c2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1750,7 +1750,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx; + u32 idx, tin; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); @@ -1760,6 +1760,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; } + tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; @@ -1927,13 +1928,22 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->buffer_max_used = q->buffer_used; if (q->buffer_used > q->buffer_limit) { + bool same_flow = false; u32 dropped = 0; + u32 drop_id; while (q->buffer_used > q->buffer_limit) { dropped++; - cake_drop(sch, to_free); + drop_id = cake_drop(sch, to_free); + + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; } b->drop_overlimit += dropped; + + if (same_flow) + return NET_XMIT_CN; } return NET_XMIT_SUCCESS; } From 2c2192e5f9c7c2892fe2363244d1387f62710d83 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:59 +0000 Subject: [PATCH 225/246] net/sched: Remove unnecessary WARNING condition for empty child qdisc in htb_activate The WARN_ON trigger based on !cl->leaf.q->q.qlen is unnecessary in htb_activate. htb_dequeue_tree already accounts for that scenario. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250819033632.579854-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c968ea763774..b5e40c51655a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -592,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; From 7af76e9d18a9fd6f8611b3313c86c190f9b6a5a7 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Tue, 19 Aug 2025 08:28:42 +0000 Subject: [PATCH 226/246] net, hsr: reject HSR frame if skb can't hold tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Receiving HSR frame with insufficient space to hold HSR tag in the skb can result in a crash (kernel BUG): [ 45.390915] skbuff: skb_under_panic: text:ffffffff86f32cac len:26 put:14 head:ffff888042418000 data:ffff888042417ff4 tail:0xe end:0x180 dev:bridge_slave_1 [ 45.392559] ------------[ cut here ]------------ [ 45.392912] kernel BUG at net/core/skbuff.c:211! [ 45.393276] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 45.393809] CPU: 1 UID: 0 PID: 2496 Comm: reproducer Not tainted 6.15.0 #12 PREEMPT(undef) [ 45.394433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 45.395273] RIP: 0010:skb_panic+0x15b/0x1d0 [ 45.402911] Call Trace: [ 45.403105] [ 45.404470] skb_push+0xcd/0xf0 [ 45.404726] br_dev_queue_push_xmit+0x7c/0x6c0 [ 45.406513] br_forward_finish+0x128/0x260 [ 45.408483] __br_forward+0x42d/0x590 [ 45.409464] maybe_deliver+0x2eb/0x420 [ 45.409763] br_flood+0x174/0x4a0 [ 45.410030] br_handle_frame_finish+0xc7c/0x1bc0 [ 45.411618] br_handle_frame+0xac3/0x1230 [ 45.413674] __netif_receive_skb_core.constprop.0+0x808/0x3df0 [ 45.422966] __netif_receive_skb_one_core+0xb4/0x1f0 [ 45.424478] __netif_receive_skb+0x22/0x170 [ 45.424806] process_backlog+0x242/0x6d0 [ 45.425116] __napi_poll+0xbb/0x630 [ 45.425394] net_rx_action+0x4d1/0xcc0 [ 45.427613] handle_softirqs+0x1a4/0x580 [ 45.427926] do_softirq+0x74/0x90 [ 45.428196] This issue was found by syzkaller. The panic happens in br_dev_queue_push_xmit() once it receives a corrupted skb with ETH header already pushed in linear data. When it attempts the skb_push() call, there's not enough headroom and skb_push() panics. The corrupted skb is put on the queue by HSR layer, which makes a sequence of unintended transformations when it receives a specific corrupted HSR frame (with incomplete TAG). Fix it by dropping and consuming frames that are not long enough to contain both ethernet and hsr headers. Alternative fix would be to check for enough headroom before skb_push() in br_dev_queue_push_xmit(). In the reproducer, this is injected via AF_PACKET, but I don't easily see why it couldn't be sent over the wire from adjacent network. Further Details: In the reproducer, the following network interface chain is set up: ┌────────────────┐ ┌────────────────┐ │ veth0_to_hsr ├───┤ hsr_slave0 ┼───┐ └────────────────┘ └────────────────┘ │ │ ┌──────┐ ├─┤ hsr0 ├───┐ │ └──────┘ │ ┌────────────────┐ ┌────────────────┐ │ │┌────────┐ │ veth1_to_hsr ┼───┤ hsr_slave1 ├───┘ └┤ │ └────────────────┘ └────────────────┘ ┌┼ bridge │ ││ │ │└────────┘ │ ┌───────┐ │ │ ... ├──────┘ └───────┘ To trigger the events leading up to crash, reproducer sends a corrupted HSR frame with incomplete TAG, via AF_PACKET socket on 'veth0_to_hsr'. The first HSR-layer function to process this frame is hsr_handle_frame(). It and then checks if the protocol is ETH_P_PRP or ETH_P_HSR. If it is, it calls skb_set_network_header(skb, ETH_HLEN + HSR_HLEN), without checking that the skb is long enough. For the crashing frame it is not, and hence the skb->network_header and skb->mac_len fields are set incorrectly, pointing after the end of the linear buffer. I will call this a BUG#1 and it is what is addressed by this patch. In the crashing scenario before the fix, the skb continues to go down the hsr path as follows. hsr_handle_frame() then calls this sequence hsr_forward_skb() fill_frame_info() hsr->proto_ops->fill_frame_info() hsr_fill_frame_info() hsr_fill_frame_info() contains a check that intends to check whether the skb actually contains the HSR header. But the check relies on the skb->mac_len field which was erroneously setup due to BUG#1, so the check passes and the execution continues back in the hsr_forward_skb(): hsr_forward_skb() hsr_forward_do() hsr->proto_ops->get_untagged_frame() hsr_get_untagged_frame() create_stripped_skb_hsr() In create_stripped_skb_hsr(), a copy of the skb is created and is further corrupted by operation that attempts to strip the HSR tag in a call to __pskb_copy(). The skb enters create_stripped_skb_hsr() with ethernet header pushed in linear buffer. The skb_pull(skb_in, HSR_HLEN) thus pulls 6 bytes of ethernet header into the headroom, creating skb_in with a headroom of size 8. The subsequent __pskb_copy() then creates an skb with headroom of just 2 and skb->len of just 12, this is how it looks after the copy: gdb) p skb->len $10 = 12 (gdb) p skb->data $11 = (unsigned char *) 0xffff888041e45382 "\252\252\252\252\252!\210\373", (gdb) p skb->head $12 = (unsigned char *) 0xffff888041e45380 "" It seems create_stripped_skb_hsr() assumes that ETH header is pulled in the headroom when it's entered, because it just pulls HSR header on top. But that is not the case in our code-path and we end up with the corrupted skb instead. I will call this BUG#2 *I got confused here because it seems that under no conditions can create_stripped_skb_hsr() work well, the assumption it makes is not true during the processing of hsr frames - since the skb_push() in hsr_handle_frame to skb_pull in hsr_deliver_master(). I wonder whether I missed something here.* Next, the execution arrives in hsr_deliver_master(). It calls skb_pull(ETH_HLEN), which just returns NULL - the SKB does not have enough space for the pull (as it only has 12 bytes in total at this point). *The skb_pull() here further suggests that ethernet header is meant to be pushed through the whole hsr processing and create_stripped_skb_hsr() should pull it before doing the HSR header pull.* hsr_deliver_master() then puts the corrupted skb on the queue, it is then picked up from there by bridge frame handling layer and finally lands in br_dev_queue_push_xmit where it panics. Cc: stable@kernel.org Fixes: 48b491a5cc74 ("net: hsr: fix mac_len checks") Reported-by: syzbot+a81f2759d022496b40ab@syzkaller.appspotmail.com Signed-off-by: Jakub Acs Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250819082842.94378-1-acsjakub@amazon.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_slave.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index b87b6a6fe070..102eccf5ead7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -63,8 +63,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || - protocol == htons(ETH_P_HSR)) + protocol == htons(ETH_P_HSR)) { + if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { + kfree_skb(skb); + goto finish_consume; + } + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a From a458b2902115b26a25d67393b12ddd57d1216aaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:27:24 -0700 Subject: [PATCH 227/246] ipv6: sr: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Andrea Mayer Link: https://patch.msgid.link/20250818202724.15713-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index d77b52523b6a..fd58426f222b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -280,7 +281,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; - if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; From c42be534547d6e45c155c347dd792b6ad9c24def Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Tue, 19 Aug 2025 09:32:30 -0700 Subject: [PATCH 228/246] Revert "net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag" This reverts commit db400061b5e7cc55f9b4dd15443e9838964119ea. This commit can cause a Devicetree ABI break for older DTS files that rely this flag for RMII configuration. Adding this back in ensures that the older DTBs will not break. Fixes: db400061b5e7 ("net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag") Signed-off-by: Ryan Wanner Link: https://patch.msgid.link/20250819163236.100680-1-Ryan.Wanner@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ce95fad8cedd..9693f0289435 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5113,7 +5113,8 @@ static const struct macb_config sama7g5_gem_config = { static const struct macb_config sama7g5_emac_config = { .caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | - MACB_CAPS_MIIONRGMII | MACB_CAPS_GEM_HAS_PTP, + MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_MIIONRGMII | + MACB_CAPS_GEM_HAS_PTP, .dma_burst_length = 16, .clk_init = macb_clk_init, .init = macb_init, From b64d035f77b1f02ab449393342264b44950a75ae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:58 +0000 Subject: [PATCH 229/246] bonding: update LACP activity flag after setting lacp_active The port's actor_oper_port_state activity flag should be updated immediately after changing the lacp_active option to reflect the current mode correctly. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 25 +++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 1 + include/net/bond_3ad.h | 1 + 3 files changed, 27 insertions(+) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 2fca8e84ab10..414fecfd2a0e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2883,6 +2883,31 @@ void bond_3ad_update_lacp_rate(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_3ad_update_lacp_active - change the lacp active + * @bond: bonding struct + * + * Update actor_oper_port_state when lacp_active is modified. + */ +void bond_3ad_update_lacp_active(struct bonding *bond) +{ + struct port *port = NULL; + struct list_head *iter; + struct slave *slave; + int lacp_active; + + lacp_active = bond->params.lacp_active; + spin_lock_bh(&bond->mode_lock); + bond_for_each_slave(bond, slave, iter) { + port = &(SLAVE_AD_INFO(slave)->port); + if (lacp_active) + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + else + port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY; + } + spin_unlock_bh(&bond->mode_lock); +} + size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 1d639a3be6ba..3b6f815c55ff 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1660,6 +1660,7 @@ static int bond_option_lacp_active_set(struct bonding *bond, netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; + bond_3ad_update_lacp_active(bond); return 0; } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 2053cd8e788a..dba369a2cf27 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -307,6 +307,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); size_t bond_3ad_stats_size(void); From 0599640a21e98f0d6a3e9ff85c0a687c90a8103b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:59 +0000 Subject: [PATCH 230/246] bonding: send LACPDUs periodically in passive mode after receiving partner's LACPDU When `lacp_active` is set to `off`, the bond operates in passive mode, meaning it only "speaks when spoken to." However, the current kernel implementation only sends an LACPDU in response when the partner's state changes. As a result, once LACP negotiation succeeds, the actor stops sending LACPDUs until the partner times out and sends an "expired" LACPDU. This causes continuous LACP state flapping. According to IEEE 802.1AX-2014, 6.4.13 Periodic Transmission machine. The values of Partner_Oper_Port_State.LACP_Activity and Actor_Oper_Port_State.LACP_Activity determine whether periodic transmissions take place. If either or both parameters are set to Active LACP, then periodic transmissions occur; if both are set to Passive LACP, then periodic transmissions do not occur. To comply with this, we remove the `!bond->params.lacp_active` check in `ad_periodic_machine()`. Instead, we initialize the actor's port's `LACP_STATE_LACP_ACTIVITY` state based on `lacp_active` setting. Additionally, we avoid setting the partner's state to `LACP_STATE_LACP_ACTIVITY` in the EXPIRED state, since we should not assume the partner is active by default. This ensures that in passive mode, the bond starts sending periodic LACPDUs after receiving one from the partner, and avoids flapping due to inactivity. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 42 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 414fecfd2a0e..4edc8e6b6b64 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -95,13 +95,13 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); +static void ad_periodic_machine(struct port *port); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); -static void ad_initialize_port(struct port *port, int lacp_fast); +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); @@ -1307,10 +1307,16 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ - port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; + /* Based on IEEE 8021AX-2014, Figure 6-18 - Receive + * machine state diagram, the statue should be + * Partner_Oper_Port_State.Synchronization = FALSE; + * Partner_Oper_Port_State.LACP_Timeout = Short Timeout; + * start current_while_timer(Short Timeout); + * Actor_Oper_Port_State.Expired = TRUE; + */ + port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; - port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; @@ -1417,11 +1423,10 @@ static void ad_tx_machine(struct port *port) /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at - * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) +static void ad_periodic_machine(struct port *port) { periodic_states_t last_state; @@ -1430,8 +1435,7 @@ static void ad_periodic_machine(struct port *port, struct bond_params *bond_para /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || - (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || - !bond_params->lacp_active) { + (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -1955,16 +1959,16 @@ static void ad_initialize_agg(struct aggregator *aggregator) /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at - * @lacp_fast: boolean. whether fast periodic should be used + * @bond_params: bond parameters we will use */ -static void ad_initialize_port(struct port *port, int lacp_fast) +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, - .port_state = 1, + .port_state = 0, }; static const struct lacpdu lacpdu = { .subtype = 0x01, @@ -1982,12 +1986,14 @@ static void ad_initialize_port(struct port *port, int lacp_fast) port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; - port->actor_admin_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; - port->actor_oper_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; + port->actor_admin_port_state = LACP_STATE_AGGREGATION; + port->actor_oper_port_state = LACP_STATE_AGGREGATION; + if (bond_params->lacp_active) { + port->actor_admin_port_state |= LACP_STATE_LACP_ACTIVITY; + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + } - if (lacp_fast) + if (bond_params->lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); @@ -2201,7 +2207,7 @@ void bond_3ad_bind_slave(struct slave *slave) /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); - ad_initialize_port(port, bond->params.lacp_fast); + ad_initialize_port(port, &bond->params); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; @@ -2513,7 +2519,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port, &bond->params); + ad_periodic_machine(port); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); From 87951b566446da04eed1fe8100f99a512ef02756 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:20:00 +0000 Subject: [PATCH 231/246] selftests: bonding: add test for passive LACP mode Add a selftest to verify bonding behavior when `lacp_active` is set to `off`. The test checks the following: - The passive LACP bond should not send LACPDUs before receiving a partner's LACPDU. - The transmitted LACPDUs must not include the active flag. - After transitioning to EXPIRED and DEFAULTED states, the passive side should still not initiate LACPDUs. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- .../selftests/drivers/net/bonding/Makefile | 3 +- .../drivers/net/bonding/bond_passive_lacp.sh | 105 ++++++++++++++++++ .../selftests/drivers/net/bonding/config | 1 + 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh diff --git a/tools/testing/selftests/drivers/net/bonding/Makefile b/tools/testing/selftests/drivers/net/bonding/Makefile index 2b10854e4b1e..44b98f17f8ff 100644 --- a/tools/testing/selftests/drivers/net/bonding/Makefile +++ b/tools/testing/selftests/drivers/net/bonding/Makefile @@ -10,7 +10,8 @@ TEST_PROGS := \ mode-2-recovery-updelay.sh \ bond_options.sh \ bond-eth-type-change.sh \ - bond_macvlan_ipvlan.sh + bond_macvlan_ipvlan.sh \ + bond_passive_lacp.sh TEST_FILES := \ lag_lib.sh \ diff --git a/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh new file mode 100755 index 000000000000..9c3b089813df --- /dev/null +++ b/tools/testing/selftests/drivers/net/bonding/bond_passive_lacp.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test if a bond interface works with lacp_active=off. + +# shellcheck disable=SC2034 +REQUIRE_MZ=no +NUM_NETIFS=0 +lib_dir=$(dirname "$0") +# shellcheck disable=SC1091 +source "$lib_dir"/../../../net/forwarding/lib.sh + +# shellcheck disable=SC2317 +check_port_state() +{ + local netns=$1 + local port=$2 + local state=$3 + + ip -n "${netns}" -d -j link show "$port" | \ + jq -e ".[].linkinfo.info_slave_data.ad_actor_oper_port_state_str | index(\"${state}\") != null" > /dev/null +} + +check_pkt_count() +{ + RET=0 + local ns="$1" + local iface="$2" + + # wait 65s, one per 30s + slowwait_for_counter 65 2 tc_rule_handle_stats_get \ + "dev ${iface} egress" 101 ".packets" "-n ${ns}" &> /dev/null +} + +setup() { + setup_ns c_ns s_ns + + # shellcheck disable=SC2154 + ip -n "${c_ns}" link add eth0 type veth peer name eth0 netns "${s_ns}" + ip -n "${c_ns}" link add eth1 type veth peer name eth1 netns "${s_ns}" + + # Add tc filter to count the pkts + tc -n "${c_ns}" qdisc add dev eth0 clsact + tc -n "${c_ns}" filter add dev eth0 egress handle 101 protocol 0x8809 matchall action pass + tc -n "${s_ns}" qdisc add dev eth1 clsact + tc -n "${s_ns}" filter add dev eth1 egress handle 101 protocol 0x8809 matchall action pass + + ip -n "${s_ns}" link add bond0 type bond mode 802.3ad lacp_active on lacp_rate fast + ip -n "${s_ns}" link set eth0 master bond0 + ip -n "${s_ns}" link set eth1 master bond0 + + ip -n "${c_ns}" link add bond0 type bond mode 802.3ad lacp_active off lacp_rate fast + ip -n "${c_ns}" link set eth0 master bond0 + ip -n "${c_ns}" link set eth1 master bond0 + +} + +trap cleanup_all_ns EXIT +setup + +# The bond will send 2 lacpdu pkts during init time, let's wait at least 2s +# after interface up +ip -n "${c_ns}" link set bond0 up +sleep 2 + +# 1. The passive side shouldn't send LACPDU. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "802.3ad lacp_active off" "init port" + +ip -n "${s_ns}" link set bond0 up +# 2. The passive side should not have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${c_ns}" "eth0" "active" && RET=1 +log_test "802.3ad lacp_active off" "port state active" + +# 3. The active side should have the 'active' flag. +RET=0 +slowwait 2 check_port_state "${s_ns}" "eth0" "active" || RET=1 +log_test "802.3ad lacp_active on" "port state active" + +# 4. Make sure the connection is not expired. +RET=0 +slowwait 5 check_port_state "${s_ns}" "eth0" "distributing" +slowwait 10 check_port_state "${s_ns}" "eth0" "expired" && RET=1 +log_test "bond 802.3ad lacp_active off" "port connection" + +# After testing, disconnect one port on each side to check the state. +ip -n "${s_ns}" link set eth0 nomaster +ip -n "${s_ns}" link set eth0 up +ip -n "${c_ns}" link set eth1 nomaster +ip -n "${c_ns}" link set eth1 up +# Due to Periodic Machine and Rx Machine state change, the bond will still +# send lacpdu pkts in a few seconds. sleep at lease 5s to make sure +# negotiation finished +sleep 5 + +# 5. The active side should keep sending LACPDU. +check_pkt_count "${s_ns}" "eth1" || RET=1 +log_test "bond 802.3ad lacp_active on" "port pkt after disconnect" + +# 6. The passive side shouldn't send LACPDU anymore. +check_pkt_count "${c_ns}" "eth0" && RET=1 +log_test "bond 802.3ad lacp_active off" "port pkt after disconnect" + +exit "$EXIT_STATUS" diff --git a/tools/testing/selftests/drivers/net/bonding/config b/tools/testing/selftests/drivers/net/bonding/config index dad4e5fda4db..4d16a69ffc65 100644 --- a/tools/testing/selftests/drivers/net/bonding/config +++ b/tools/testing/selftests/drivers/net/bonding/config @@ -6,6 +6,7 @@ CONFIG_MACVLAN=y CONFIG_IPVLAN=y CONFIG_NET_ACT_GACT=y CONFIG_NET_CLS_FLOWER=y +CONFIG_NET_CLS_MATCHALL=m CONFIG_NET_SCH_INGRESS=y CONFIG_NLMON=y CONFIG_VETH=y From 9f6b606b6b37e61427412708411e8e04b1a858e8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 18 Aug 2025 11:58:25 +0200 Subject: [PATCH 232/246] net: airoha: ppe: Do not invalid PPE entries in case of SW hash collision SW hash computed by airoha_ppe_foe_get_entry_hash routine (used for foe_flow hlist) can theoretically produce collisions between two different HW PPE entries. In airoha_ppe_foe_insert_entry() if the collision occurs we will mark the second PPE entry in the list as stale (setting the hw hash to 0xffff). Stale entries are no more updated in airoha_ppe_foe_flow_entry_update routine and so they are removed by Netfilter. Fix the problem not marking the second entry as stale in airoha_ppe_foe_insert_entry routine if we have already inserted the brand new entry in the PPE table and let Netfilter remove real stale entries according to their timestamp. Please note this is just a theoretical issue spotted reviewing the code and not faced running the system. Fixes: cd53f622611f9 ("net: airoha: Add L2 hw acceleration support") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250818-airoha-en7581-hash-collision-fix-v1-1-d190c4b53d1c@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/airoha/airoha_ppe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 47411d2cbd28..88694b08afa1 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -736,10 +736,8 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, continue; } - if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) { - e->hash = 0xffff; + if (!airoha_ppe_foe_compare_entry(e, hwe)) continue; - } airoha_ppe_foe_commit_entry(ppe, &e->data, hash); commit_done = true; From 62708b9452f8eb77513115b17c4f8d1a22ebf843 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:19:51 -0700 Subject: [PATCH 233/246] tls: fix handling of zero-length records on the rx_list Each recvmsg() call must process either - only contiguous DATA records (any number of them) - one non-DATA record If the next record has different type than what has already been processed we break out of the main processing loop. If the record has already been decrypted (which may be the case for TLS 1.3 where we don't know type until decryption) we queue the pending record to the rx_list. Next recvmsg() will pick it up from there. Queuing the skb to rx_list after zero-copy decrypt is not possible, since in that case we decrypted directly to the user space buffer, and we don't have an skb to queue (darg.skb points to the ciphertext skb for access to metadata like length). Only data records are allowed zero-copy, and we break the processing loop after each non-data record. So we should never zero-copy and then find out that the record type has changed. The corner case we missed is when the initial record comes from rx_list, and it's zero length. Reported-by: Muhammad Alifa Ramdhan Reported-by: Billy Jheng Bing-Jhong Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 51c98a007dda..bac65d0d4e3e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1808,6 +1808,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout) return tls_decrypt_sg(sk, NULL, sgout, &darg); } +/* All records returned from a recvmsg() call must have the same type. + * 0 is not a valid content type. Use it as "no type reported, yet". + */ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { @@ -2051,8 +2054,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto end; + /* process_rx_list() will set @control if it processed any records */ copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) + if (len <= copied || rx_more || + (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); From a61a3e961baff65b0a49f862fe21ce304f279b24 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:19:52 -0700 Subject: [PATCH 234/246] selftests: tls: add tests for zero-length records Test various combinations of zero-length records. Unfortunately, kernel cannot be coerced into producing those, so hardcode the ciphertext messages in the test. Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 300 +++++++++++++++++++++++++++++- 1 file changed, 295 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 2b8387a83bc7..0f5640d8dc7f 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -181,13 +181,12 @@ static int tls_send_cmsg(int fd, unsigned char record_type, return sendmsg(fd, &msg, flags); } -static int tls_recv_cmsg(struct __test_metadata *_metadata, - int fd, unsigned char record_type, - void *data, size_t len, int flags) +static int __tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char *ctype, + void *data, size_t len, int flags) { char cbuf[CMSG_SPACE(sizeof(char))]; struct cmsghdr *cmsg; - unsigned char ctype; struct msghdr msg; struct iovec vec; int n; @@ -206,7 +205,20 @@ static int tls_recv_cmsg(struct __test_metadata *_metadata, EXPECT_NE(cmsg, NULL); EXPECT_EQ(cmsg->cmsg_level, SOL_TLS); EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE); - ctype = *((unsigned char *)CMSG_DATA(cmsg)); + if (ctype) + *ctype = *((unsigned char *)CMSG_DATA(cmsg)); + + return n; +} + +static int tls_recv_cmsg(struct __test_metadata *_metadata, + int fd, unsigned char record_type, + void *data, size_t len, int flags) +{ + unsigned char ctype; + int n; + + n = __tls_recv_cmsg(_metadata, fd, &ctype, data, len, flags); EXPECT_EQ(ctype, record_type); return n; @@ -2164,6 +2176,284 @@ TEST_F(tls, rekey_poll_delay) } } +struct raw_rec { + unsigned int plain_len; + unsigned char plain_data[100]; + unsigned int cipher_len; + unsigned char cipher_data[128]; +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: 'Hello world' */ +static const struct raw_rec id0_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x26, 0xa2, 0x33, + 0xde, 0x8d, 0x94, 0xf0, 0x29, 0x6c, 0xb1, 0xaf, + 0x6a, 0x75, 0xb2, 0x93, 0xad, 0x45, 0xd5, 0xfd, + 0x03, 0x51, 0x57, 0x8f, 0xf9, 0xcc, 0x3b, 0x42, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:0, plaintext: '' */ +static const struct raw_rec id0_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x38, 0x7b, + 0xa6, 0x1c, 0xdd, 0xa7, 0x19, 0x33, 0xab, 0xae, + 0x88, 0xe1, 0xd2, 0x08, 0x4f, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:0, plaintext: '' */ +static const struct raw_rec id0_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xc5, 0x37, 0x90, + 0x70, 0x45, 0x89, 0xfb, 0x5c, 0xc7, 0x89, 0x03, + 0x68, 0x80, 0xd3, 0xd8, 0xcc, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: 'Hello world' */ +static const struct raw_rec id1_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3a, 0x1a, 0x9c, + 0xd0, 0xa8, 0x9a, 0xd6, 0x69, 0xd6, 0x1a, 0xe3, + 0xb5, 0x1f, 0x0d, 0x2c, 0xe2, 0x97, 0x46, 0xff, + 0x2b, 0xcc, 0x5a, 0xc4, 0xa3, 0xb9, 0xef, 0xba, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:1, plaintext: '' */ +static const struct raw_rec id1_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x3e, 0xf0, 0xfe, + 0xee, 0xd9, 0xe2, 0x5d, 0xc7, 0x11, 0x4c, 0xe6, + 0xb4, 0x7e, 0xef, 0x40, 0x2b, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:1, plaintext: '' */ +static const struct raw_rec id1_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0xce, 0xfc, 0x86, + 0xc8, 0xf0, 0x55, 0xf9, 0x47, 0x3f, 0x74, 0xdc, + 0xc9, 0xbf, 0xfe, 0x5b, 0xb1, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_ctrl_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x2a, 0x04, 0x11, 0x3d, 0xf8, 0x64, 0x5f, 0x36, + 0x8b, 0xa8, 0xee, 0x4c, 0x6d, 0x62, 0xa5, 0x00, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: 'Hello world' */ +static const struct raw_rec id2_data_l11 = { + .plain_len = 11, + .plain_data = { + 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, + 0x72, 0x6c, 0x64, + }, + .cipher_len = 40, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xe5, 0x3d, 0x19, + 0x3d, 0xca, 0xb8, 0x16, 0xb6, 0xff, 0x79, 0x87, + 0x8e, 0xa1, 0xd0, 0xcd, 0x33, 0xb5, 0x86, 0x2b, + 0x17, 0xf1, 0x52, 0x2a, 0x55, 0x62, 0x65, 0x11, + }, +}; + +/* TLS 1.2, AES_CCM, ctrl, seqno:2, plaintext: '' */ +static const struct raw_rec id2_ctrl_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x16, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xdc, 0x5c, 0x0e, + 0x41, 0xdd, 0xba, 0xd3, 0xcc, 0xcf, 0x6d, 0xd9, + 0x06, 0xdb, 0x79, 0xe5, 0x5d, + }, +}; + +/* TLS 1.2, AES_CCM, data, seqno:2, plaintext: '' */ +static const struct raw_rec id2_data_l0 = { + .plain_len = 0, + .plain_data = { + }, + .cipher_len = 29, + .cipher_data = { + 0x17, 0x03, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0xc3, 0xca, 0x26, + 0x22, 0xe4, 0x25, 0xfb, 0x5f, 0x6d, 0xbf, 0x83, + 0x30, 0x48, 0x69, 0x1a, 0x47, + }, +}; + +FIXTURE(zero_len) +{ + int fd, cfd; + bool notls; +}; + +FIXTURE_VARIANT(zero_len) +{ + const struct raw_rec *recs[4]; + ssize_t recv_ret[4]; +}; + +FIXTURE_VARIANT_ADD(zero_len, data_data_data) +{ + .recs = { &id0_data_l11, &id1_data_l11, &id2_data_l11, }, + .recv_ret = { 33, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0ctrl_data) +{ + .recs = { &id0_data_l11, &id1_ctrl_l0, &id2_data_l11, }, + .recv_ret = { 11, 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l11, }, + .recv_ret = { 0, 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_0ctrl) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0ctrl_0ctrl_0ctrl) +{ + .recs = { &id0_ctrl_l0, &id1_ctrl_l0, &id2_ctrl_l0, }, + .recv_ret = { 0, 0, 0, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, 0data_0data_data) +{ + .recs = { &id0_data_l0, &id1_data_l0, &id2_data_l11, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_VARIANT_ADD(zero_len, data_0data_0data) +{ + .recs = { &id0_data_l11, &id1_data_l0, &id2_data_l0, }, + .recv_ret = { 11, -EAGAIN, }, +}; + +FIXTURE_SETUP(zero_len) +{ + struct tls_crypto_info_keys tls12; + int ret; + + tls_crypto_info_init(TLS_1_2_VERSION, TLS_CIPHER_AES_CCM_128, + &tls12, 0); + + ulp_sock_pair(_metadata, &self->fd, &self->cfd, &self->notls); + if (self->notls) + return; + + /* Don't install keys on fd, we'll send raw records */ + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); + ASSERT_EQ(ret, 0); +} + +FIXTURE_TEARDOWN(zero_len) +{ + close(self->fd); + close(self->cfd); +} + +TEST_F(zero_len, test) +{ + const struct raw_rec *const *rec; + unsigned char buf[128]; + int rec_off; + int i; + + for (i = 0; i < 4 && variant->recs[i]; i++) + EXPECT_EQ(send(self->fd, variant->recs[i]->cipher_data, + variant->recs[i]->cipher_len, 0), + variant->recs[i]->cipher_len); + + rec = &variant->recs[0]; + rec_off = 0; + for (i = 0; i < 4; i++) { + int j, ret; + + ret = variant->recv_ret[i] >= 0 ? variant->recv_ret[i] : -1; + EXPECT_EQ(__tls_recv_cmsg(_metadata, self->cfd, NULL, + buf, sizeof(buf), MSG_DONTWAIT), ret); + if (ret == -1) + EXPECT_EQ(errno, -variant->recv_ret[i]); + if (variant->recv_ret[i] == -EAGAIN) + break; + + for (j = 0; j < ret; j++) { + while (rec_off == (*rec)->plain_len) { + rec++; + rec_off = 0; + } + EXPECT_EQ(buf[j], (*rec)->plain_data[rec_off]); + rec_off++; + } + } +}; + FIXTURE(tls_err) { int fd, cfd; From 8c5d95988c34f0aeba1f34cd5e4ba69494c90c5f Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Wed, 20 Aug 2025 12:09:18 +0530 Subject: [PATCH 235/246] Octeontx2-af: Skip overlap check for SPI field Octeontx2/CN10K silicon supports generating a 256-bit key per packet. The specific fields to be extracted from a packet for key generation are configurable via a Key Extraction (MKEX) Profile. The AF driver scans the configured extraction profile to ensure that fields from upper layers do not overwrite fields from lower layers in the key. Example Packet Field Layout: LA: DMAC + SMAC LB: VLAN LC: IPv4/IPv6 LD: TCP/UDP Valid MKEX Profile Configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[30-31] Invalid MKEX profile configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[2-3] // Overlaps with DMAC field In another scenario, if the MKEX profile is configured to extract the SPI field from both AH and ESP headers at the same key offset, the driver rejecting this configuration. In a regular traffic, ipsec packet will be having either AH(LD) or ESP (LE). This patch relaxes the check for the same. Fixes: 12aa0a3b93f3 ("octeontx2-af: Harden rule validation.") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250820063919.1463518-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index 1b765045aa63..b56395ac5a74 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -606,8 +606,8 @@ static void npc_set_features(struct rvu *rvu, int blkaddr, u8 intf) if (!npc_check_field(rvu, blkaddr, NPC_LB, intf)) *features &= ~BIT_ULL(NPC_OUTER_VID); - /* Set SPI flag only if AH/ESP and IPSEC_SPI are in the key */ - if (npc_check_field(rvu, blkaddr, NPC_IPSEC_SPI, intf) && + /* Allow extracting SPI field from AH and ESP headers at same offset */ + if (npc_is_field_present(rvu, NPC_IPSEC_SPI, intf) && (*features & (BIT_ULL(NPC_IPPROTO_ESP) | BIT_ULL(NPC_IPPROTO_AH)))) *features |= BIT_ULL(NPC_IPSEC_SPI); From 1c67f9c54cdc70627e3f6472b89cd3d895df974c Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:27:07 +0200 Subject: [PATCH 236/246] net: pse-pd: pd692x0: Fix power budget leak in manager setup error path Fix a resource leak where manager power budgets were freed on both success and error paths during manager setup. Power budgets should only be freed on error paths after regulator registration or during driver removal. Refactor cleanup logic by extracting OF node cleanup and power budget freeing into separate helper functions for better maintainability. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250820132708.837255-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 59 +++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 399ce9febda4..395f6c662175 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1162,12 +1162,44 @@ pd692x0_write_ports_matrix(struct pd692x0_priv *priv, return 0; } +static void pd692x0_of_put_managers(struct pd692x0_priv *priv, + struct pd692x0_manager *manager, + int nmanagers) +{ + int i, j; + + for (i = 0; i < nmanagers; i++) { + for (j = 0; j < manager[i].nports; j++) + of_node_put(manager[i].port_node[j]); + of_node_put(manager[i].node); + } +} + +static void pd692x0_managers_free_pw_budget(struct pd692x0_priv *priv) +{ + int i; + + for (i = 0; i < PD692X0_MAX_MANAGERS; i++) { + struct regulator *supply; + + if (!priv->manager_reg[i] || !priv->manager_pw_budget[i]) + continue; + + supply = priv->manager_reg[i]->supply; + if (!supply) + continue; + + regulator_free_power_budget(supply, + priv->manager_pw_budget[i]); + } +} + static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) { struct pd692x0_manager *manager __free(kfree) = NULL; struct pd692x0_priv *priv = to_pd692x0_priv(pcdev); struct pd692x0_matrix port_matrix[PD692X0_MAX_PIS]; - int ret, i, j, nmanagers; + int ret, nmanagers; /* Should we flash the port matrix */ if (priv->fw_state != PD692X0_FW_OK && @@ -1185,31 +1217,27 @@ static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) nmanagers = ret; ret = pd692x0_register_managers_regulator(priv, manager, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_configure_managers(priv, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_set_ports_matrix(priv, manager, nmanagers, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; ret = pd692x0_write_ports_matrix(priv, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; -out: - for (i = 0; i < nmanagers; i++) { - struct regulator *supply = priv->manager_reg[i]->supply; + pd692x0_of_put_managers(priv, manager, nmanagers); + return 0; - regulator_free_power_budget(supply, - priv->manager_pw_budget[i]); - - for (j = 0; j < manager[i].nports; j++) - of_node_put(manager[i].port_node[j]); - of_node_put(manager[i].node); - } +err_managers_req_pw: + pd692x0_managers_free_pw_budget(priv); +err_of_managers: + pd692x0_of_put_managers(priv, manager, nmanagers); return ret; } @@ -1748,6 +1776,7 @@ static void pd692x0_i2c_remove(struct i2c_client *client) { struct pd692x0_priv *priv = i2c_get_clientdata(client); + pd692x0_managers_free_pw_budget(priv); firmware_upload_unregister(priv->fwl); } From 7ef353879f714602b43f98662069f4fb86536761 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:33:21 +0200 Subject: [PATCH 237/246] net: pse-pd: pd692x0: Skip power budget configuration when undefined If the power supply's power budget is not defined in the device tree, the current code still requests power and configures the PSE manager with a 0W power limit, which is undesirable behavior. Skip power budget configuration entirely when the budget is zero, avoiding unnecessary power requests and preventing invalid 0W limits from being set on the PSE manager. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250820133321.841054-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 395f6c662175..f4e91ba64a66 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1041,6 +1041,10 @@ pd692x0_configure_managers(struct pd692x0_priv *priv, int nmanagers) int pw_budget; pw_budget = regulator_get_unclaimed_power_budget(supply); + if (!pw_budget) + /* Do nothing if no power budget */ + continue; + /* Max power budget per manager */ if (pw_budget > 6000000) pw_budget = 6000000; From bc17455bc843b2f4b206e0bb8139013eb3d3c08b Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 20 Aug 2025 16:32:02 +0300 Subject: [PATCH 238/246] net/mlx5: Base ECVF devlink port attrs from 0 Adjust the vport number by the base ECVF vport number so the port attributes start at 0. Previously the port attributes would start 1 after the maximum number of host VFs. Fixes: dc13180824b7 ("net/mlx5: Enable devlink port for embedded cpu VF vports") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index b7102e14d23d..c33accadae0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -47,10 +47,12 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch * devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, vport_num - 1, external); } else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) { + u16 base_vport = mlx5_core_ec_vf_vport_base(dev); + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum, - vport_num - 1, false); + vport_num - base_vport, false); } } From 330f0f6713a39581936decac72331e6ab7f13529 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:03 +0300 Subject: [PATCH 239/246] net/mlx5: Remove default QoS group and attach vports directly to root TSAR Currently, the driver creates a default group (`node0`) and attaches all vports to it unless the user explicitly sets a parent group. As a result, when a user configures tx_share on a group and tx_share on a VF, the expectation is for the group and the VF to share bandwidth relatively. However, since the VF is not connected to the same parent (but to the default node), the proportional share logic is not applied correctly. To fix this, remove the default group (`node0`) and instead connect vports directly to the root TSAR when no parent is specified. This ensures that vports and groups share the same root scheduler and their tx_share values are compared directly under the same hierarchy. Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 97 +++++++------------ .../net/ethernet/mellanox/mlx5/core/eswitch.h | 5 - 2 files changed, 33 insertions(+), 69 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 91d863c8c152..cd58d3934596 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -462,6 +462,7 @@ static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *parent = vport_node->parent; u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_core_dev *dev = vport_node->esw->dev; void *attr; @@ -477,7 +478,7 @@ esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport); MLX5_SET(scheduling_context, sched_ctx, parent_element_id, - vport_node->parent->ix); + parent ? parent->ix : vport_node->esw->qos.root_tsar_ix); MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate); @@ -786,48 +787,15 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta return err; } - if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { - esw->qos.node0 = __esw_qos_create_vports_sched_node(esw, NULL, extack); - } else { - /* The eswitch doesn't support scheduling nodes. - * Create a software-only node0 using the root TSAR to attach vport QoS to. - */ - if (!__esw_qos_alloc_node(esw, - esw->qos.root_tsar_ix, - SCHED_NODE_TYPE_VPORTS_TSAR, - NULL)) - esw->qos.node0 = ERR_PTR(-ENOMEM); - else - list_add_tail(&esw->qos.node0->entry, - &esw->qos.domain->nodes); - } - if (IS_ERR(esw->qos.node0)) { - err = PTR_ERR(esw->qos.node0); - esw_warn(dev, "E-Switch create rate node 0 failed (%d)\n", err); - goto err_node0; - } refcount_set(&esw->qos.refcnt, 1); return 0; - -err_node0: - if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, - esw->qos.root_tsar_ix)) - esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n"); - - return err; } static void esw_qos_destroy(struct mlx5_eswitch *esw) { int err; - if (esw->qos.node0->ix != esw->qos.root_tsar_ix) - __esw_qos_destroy_node(esw->qos.node0, NULL); - else - __esw_qos_free_node(esw->qos.node0); - esw->qos.node0 = NULL; - err = mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, esw->qos.root_tsar_ix); @@ -990,13 +958,16 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - int err, new_level, max_level; + struct mlx5_esw_sched_node *parent = vport_node->parent; + int err; if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + int new_level, max_level; + /* Increase the parent's level by 2 to account for both the * TC arbiter and the vports TC scheduling element. */ - new_level = vport_node->parent->level + 2; + new_level = (parent ? parent->level : 2) + 2; max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev, log_esw_max_sched_depth); if (new_level > max_level) { @@ -1033,9 +1004,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, err_sched_nodes: if (type == SCHED_NODE_TYPE_RATE_LIMITER) { esw_qos_node_destroy_sched_element(vport_node, NULL); - list_add_tail(&vport_node->entry, - &vport_node->parent->children); - vport_node->level = vport_node->parent->level + 1; + esw_qos_node_attach_to_parent(vport_node); } else { esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL); } @@ -1083,7 +1052,6 @@ err_out: static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - struct mlx5_esw_sched_node *parent = vport_node->parent; enum sched_node_type curr_type = vport_node->type; if (curr_type == SCHED_NODE_TYPE_VPORT) @@ -1093,7 +1061,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a vport_node->bw_share = 0; list_del_init(&vport_node->entry); - esw_qos_normalize_min_rate(parent->esw, parent, extack); + esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport); } @@ -1103,25 +1071,23 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - esw_qos_node_set_parent(vport->qos.sched_node, parent); - if (type == SCHED_NODE_TYPE_VPORT) { - err = esw_qos_vport_create_sched_element(vport->qos.sched_node, - extack); - } else { + esw_qos_node_set_parent(vport_node, parent); + if (type == SCHED_NODE_TYPE_VPORT) + err = esw_qos_vport_create_sched_element(vport_node, extack); + else err = esw_qos_vport_tc_enable(vport, type, extack); - } if (err) return err; - vport->qos.sched_node->type = type; - esw_qos_normalize_min_rate(parent->esw, parent, extack); - trace_mlx5_esw_vport_qos_create(vport->dev, vport, - vport->qos.sched_node->max_rate, - vport->qos.sched_node->bw_share); + vport_node->type = type; + esw_qos_normalize_min_rate(vport_node->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate, + vport_node->bw_share); return 0; } @@ -1132,6 +1098,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; struct mlx5_esw_sched_node *sched_node; + struct mlx5_eswitch *parent_esw; int err; esw_assert_qos_lock_held(esw); @@ -1139,10 +1106,12 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t if (err) return err; - parent = parent ?: esw->qos.node0; - sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent); + parent_esw = parent ? parent->esw : esw; + sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); if (!sched_node) return -ENOMEM; + if (!parent) + list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); sched_node->max_rate = max_rate; sched_node->min_rate = min_rate; @@ -1168,7 +1137,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) goto unlock; parent = vport->qos.sched_node->parent; - WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node"); + WARN(parent, "Disabling QoS on port before detaching it from node"); esw_qos_vport_disable(vport, NULL); mlx5_esw_qos_vport_qos_free(vport); @@ -1268,7 +1237,6 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - parent = parent ?: curr_parent; if (curr_type == type && curr_parent == parent) return 0; @@ -1306,16 +1274,16 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw esw_assert_qos_lock_held(esw); curr_parent = vport->qos.sched_node->parent; - parent = parent ?: esw->qos.node0; if (curr_parent == parent) return 0; /* Set vport QoS type based on parent node type if different from * default QoS; otherwise, use the vport's current QoS type. */ - if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_RATE_LIMITER; - else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + else if (curr_parent && + curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_VPORT; else type = vport->qos.sched_node->type; @@ -1654,9 +1622,10 @@ static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw, static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport, u32 *tc_bw) { - struct mlx5_eswitch *esw = vport->qos.sched_node ? - vport->qos.sched_node->parent->esw : - vport->dev->priv.eswitch; + struct mlx5_esw_sched_node *node = vport->qos.sched_node; + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw = (node && node->parent) ? node->parent->esw : esw; return esw_qos_validate_unsupported_tc_bw(esw, tc_bw); } @@ -1763,7 +1732,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (disable) { if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, - NULL, extack); + vport_node->parent, extack); goto unlock; } @@ -1775,7 +1744,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, } else { err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_TC_ARBITER_TSAR, - NULL, extack); + vport_node->parent, extack); } if (!err) esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b0b8ef3ec3c4..45506ad56847 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -373,11 +373,6 @@ struct mlx5_eswitch { refcount_t refcnt; u32 root_tsar_ix; struct mlx5_qos_domain *domain; - /* Contains all vports with QoS enabled but no explicit node. - * Cannot be NULL if QoS is enabled, but may be a fake node - * referencing the root TSAR if the esw doesn't support nodes. - */ - struct mlx5_esw_sched_node *node0; } qos; struct mlx5_esw_bridge_offloads *br_offloads; From e8f973576ca5387ffd2917b8ae661d3f9acde526 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:04 +0300 Subject: [PATCH 240/246] net/mlx5e: Preserve tc-bw during parent changes When changing parent of a node/leaf with tc-bw configured, the code saves and restores tc-bw values. However, it was reading the converted hardware bw_share values (where 0 becomes 1) instead of the original user values, causing incorrect tc-bw calculations after parent change. Store original tc-bw values in the node structure and use them directly for save/restore operations. Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index cd58d3934596..4ed5968f1638 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -102,6 +102,8 @@ struct mlx5_esw_sched_node { u8 level; /* Valid only when this node represents a traffic class. */ u8 tc; + /* Valid only for a TC arbiter node or vport TC arbiter. */ + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) @@ -609,10 +611,7 @@ static void esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u32 *tc_bw) { - struct mlx5_esw_sched_node *vports_tc_node; - - list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) - tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share; + memcpy(tc_bw, tc_arbiter_node->tc_bw, sizeof(tc_arbiter_node->tc_bw)); } static void @@ -629,6 +628,7 @@ esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u8 tc = vports_tc_node->tc; u32 bw_share; + tc_arbiter_node->tc_bw[tc] = tc_bw[tc]; bw_share = tc_bw[tc] * fw_max_bw_share; bw_share = esw_qos_calc_bw_share(bw_share, divider, fw_max_bw_share); @@ -1060,6 +1060,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a esw_qos_vport_tc_disable(vport, extack); vport_node->bw_share = 0; + memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw)); list_del_init(&vport_node->entry); esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); @@ -1231,8 +1232,9 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent; - enum sched_node_type curr_type = vport->qos.sched_node->type; + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *curr_parent = vport_node->parent; + enum sched_node_type curr_type = vport_node->type; u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; int err; @@ -1244,10 +1246,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, if (err) return err; - if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node, - curr_tc_bw); - } + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) + esw_qos_tc_arbiter_get_bw_shares(vport_node, curr_tc_bw); esw_qos_vport_disable(vport, extack); @@ -1258,8 +1258,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, } if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node, - curr_tc_bw, extack); + esw_qos_set_tc_arbiter_bw_shares(vport_node, curr_tc_bw, + extack); } return err; From b697ef4d1d136948d282384e6cc3d1af469ea123 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:05 +0300 Subject: [PATCH 241/246] net/mlx5: Destroy vport QoS element when no configuration remains If a VF has been configured and the user later clears all QoS settings, the vport element remains in the firmware QoS tree. This leads to inconsistent behavior compared to VFs that were never configured, since the FW assumes that unconfigured VFs are outside the QoS hierarchy. As a result, the bandwidth share across VFs may differ, even though none of them appear to have any configuration. Align the driver behavior with the FW expectation by destroying the vport QoS element when all configurations are removed. Fixes: c9497c98901c ("net/mlx5: Add support for setting VF min rate") Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 57 ++++++++++++++++--- 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 4ed5968f1638..452a948a3e6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1127,6 +1127,19 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t return err; } +static void mlx5_esw_qos_vport_disable_locked(struct mlx5_vport *vport) +{ + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw_assert_qos_lock_held(esw); + if (!vport->qos.sched_node) + return; + + esw_qos_vport_disable(vport, NULL); + mlx5_esw_qos_vport_qos_free(vport); + esw_qos_put(esw); +} + void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; @@ -1140,9 +1153,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) parent = vport->qos.sched_node->parent; WARN(parent, "Disabling QoS on port before detaching it from node"); - esw_qos_vport_disable(vport, NULL); - mlx5_esw_qos_vport_qos_free(vport); - esw_qos_put(esw); + mlx5_esw_qos_vport_disable_locked(vport); unlock: esw_qos_unlock(esw); } @@ -1642,6 +1653,21 @@ static bool esw_qos_tc_bw_disabled(u32 *tc_bw) return true; } +static void esw_vport_qos_prune_empty(struct mlx5_vport *vport) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + if (!vport_node) + return; + + if (vport_node->parent || vport_node->max_rate || + vport_node->min_rate || !esw_qos_tc_bw_disabled(vport_node->tc_bw)) + return; + + mlx5_esw_qos_vport_disable_locked(vport); +} + int mlx5_esw_qos_init(struct mlx5_eswitch *esw) { if (esw->qos.domain) @@ -1675,6 +1701,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1696,6 +1726,10 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void * esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1733,6 +1767,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, vport_node->parent, extack); + esw_vport_qos_prune_empty(vport); goto unlock; } @@ -1893,14 +1928,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, void *priv, void *parent_priv, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *node; + struct mlx5_esw_sched_node *node = parent ? parent_priv : NULL; struct mlx5_vport *vport = priv; + int err; - if (!parent) - return mlx5_esw_qos_vport_update_parent(vport, NULL, extack); + err = mlx5_esw_qos_vport_update_parent(vport, node, extack); + if (!err) { + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; - node = parent_priv; - return mlx5_esw_qos_vport_update_parent(vport, node, extack); + esw_qos_lock(esw); + esw_vport_qos_prune_empty(vport); + esw_qos_unlock(esw); + } + + return err; } static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node) From 3c114fb2afe493066df5b9e560ef37216b153c90 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:06 +0300 Subject: [PATCH 242/246] net/mlx5: Fix QoS reference leak in vport enable error path Add missing esw_qos_put() call when __esw_qos_alloc_node() fails in mlx5_esw_qos_vport_enable(). Fixes: be034baba83e ("net/mlx5: Make vport QoS enablement more flexible for future extensions") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 452a948a3e6d..41aec07bb6c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1109,8 +1109,10 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t parent_esw = parent ? parent->esw : esw; sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); - if (!sched_node) + if (!sched_node) { + esw_qos_put(esw); return -ENOMEM; + } if (!parent) list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); From 51b17c98e3dbb2093a81b0490050a0eaa919ebee Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:07 +0300 Subject: [PATCH 243/246] net/mlx5: Restore missing scheduling node cleanup on vport enable failure Restore the __esw_qos_free_node() call removed by the offending commit. Fixes: 97733d1e00a0 ("net/mlx5: Add traffic class scheduling support for vport QoS") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 41aec07bb6c2..8b4977650183 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1122,6 +1122,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t vport->qos.sched_node = sched_node; err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { + __esw_qos_free_node(sched_node); esw_qos_put(esw); vport->qos.sched_node = NULL; } From 451d2849ea66659040b59ae3cb7e50cc97404733 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Wed, 20 Aug 2025 16:32:08 +0300 Subject: [PATCH 244/246] net/mlx5e: Query FW for buffer ownership The SW currently saves local buffer ownership when setting the buffer. This means that the SW assumes it has ownership of the buffer after the command is set. If setting the buffer fails and we remain in FW ownership, the local buffer ownership state incorrectly remains as SW-owned. This leads to incorrect behavior in subsequent PFC commands, causing failures. Instead of saving local buffer ownership in SW, query the FW for buffer ownership when setting the buffer. This ensures that the buffer ownership state is accurately reflected, avoiding the issues caused by incorrect ownership states. Fixes: ecdf2dadee8e ("net/mlx5e: Receive buffer support for DCBX") Signed-off-by: Alexei Lazar Reviewed-by: Shahar Shitrit Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 - .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 12 ++++++++--- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ .../net/ethernet/mellanox/mlx5/core/port.c | 20 +++++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index b59aee75de94..2c98a5299df3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -26,7 +26,6 @@ struct mlx5e_dcbx { u8 cap; /* Buffer configuration */ - bool manual_buffer; u32 cable_len; u32 xoff; u16 port_buff_cell_sz; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 5fe016e477b3..d166c0d5189e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -362,6 +362,7 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { + u8 buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; u32 old_cable_len = priv->dcbx.cable_len; @@ -389,7 +390,14 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, if (MLX5_BUFFER_SUPPORTED(mdev)) { pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en; - if (priv->dcbx.manual_buffer) + ret = mlx5_query_port_buffer_ownership(mdev, + &buffer_ownership); + if (ret) + netdev_err(dev, + "%s, Failed to get buffer ownership: %d\n", + __func__, ret); + + if (buffer_ownership == MLX5_BUF_OWNERSHIP_SW_OWNED) ret = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, &pfc_new, NULL, NULL); @@ -982,7 +990,6 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (!changed) return 0; - priv->dcbx.manual_buffer = true; err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, buffer_size, prio2buffer); return err; @@ -1252,7 +1259,6 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); - priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; mlx5e_ets_init(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b6d53db27cd5..9d3504f5abfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -367,6 +367,8 @@ int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership); int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 549f1066d2a5..2d7adf7444ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -968,6 +968,26 @@ int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) return err; } +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)] = {}; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, buffer_ownership)) { + *buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; + return 0; + } + + err = mlx5_query_pfcc_reg(mdev, out, sizeof(out)); + if (err) + return err; + + *buffer_ownership = MLX5_GET(pfcc_reg, out, buf_ownership); + + return 0; +} + int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) { int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); From 8b0587a885fdb34fd6090a3f8625cb7ac1444826 Mon Sep 17 00:00:00 2001 From: Armen Ratner Date: Wed, 20 Aug 2025 16:32:09 +0300 Subject: [PATCH 245/246] net/mlx5e: Preserve shared buffer capacity during headroom updates When port buffer headroom changes, port_update_shared_buffer() recalculates the shared buffer size and splits it in a 3:1 ratio (lossy:lossless) - Currently, the calculation is: lossless = shared / 4; lossy = (shared / 4) * 3; Meaning, the calculation dropped the remainder of shared % 4 due to integer division, unintentionally reducing the total shared buffer by up to three cells on each update. Over time, this could shrink the buffer below usable size. Fix it by changing the calculation to: lossless = shared / 4; lossy = shared - lossless; This retains all buffer cells while still approximating the intended 3:1 split, preventing capacity loss over time. While at it, perform headroom calculations in units of cells rather than in bytes for more accurate calculations avoiding extra divisions. Fixes: a440030d8946 ("net/mlx5e: Update shared buffer along with device buffer changes") Signed-off-by: Armen Ratner Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Signed-off-by: Alexei Lazar Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-9-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en/port_buffer.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 5ae787656a7c..3efa8bf1d14e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -272,8 +272,8 @@ static int port_update_shared_buffer(struct mlx5_core_dev *mdev, /* Total shared buffer size is split in a ratio of 3:1 between * lossy and lossless pools respectively. */ - lossy_epool_size = (shared_buffer_size / 4) * 3; lossless_ipool_size = shared_buffer_size / 4; + lossy_epool_size = shared_buffer_size - lossless_ipool_size; mlx5e_port_set_sbpr(mdev, 0, MLX5_EGRESS_DIR, MLX5_LOSSY_POOL, 0, lossy_epool_size); @@ -288,14 +288,12 @@ static int port_set_buffer(struct mlx5e_priv *priv, u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - u32 new_headroom_size = 0; - u32 current_headroom_size; + u32 current_headroom_cells = 0; + u32 new_headroom_cells = 0; void *in; int err; int i; - current_headroom_size = port_buffer->headroom_size; - in = kzalloc(sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -306,12 +304,14 @@ static int port_set_buffer(struct mlx5e_priv *priv, for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + current_headroom_cells += MLX5_GET(bufferx_reg, buffer, size); + u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; u64 xon = port_buffer->buffer[i].xon; - new_headroom_size += size; do_div(size, port_buff_cell_sz); + new_headroom_cells += size; do_div(xoff, port_buff_cell_sz); do_div(xon, port_buff_cell_sz); MLX5_SET(bufferx_reg, buffer, size, size); @@ -320,10 +320,8 @@ static int port_set_buffer(struct mlx5e_priv *priv, MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } - new_headroom_size /= port_buff_cell_sz; - current_headroom_size /= port_buff_cell_sz; - err = port_update_shared_buffer(priv->mdev, current_headroom_size, - new_headroom_size); + err = port_update_shared_buffer(priv->mdev, current_headroom_cells, + new_headroom_cells); if (err) goto out; From 91a79b792204313153e1bdbbe5acbfc28903b3a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Aug 2025 14:37:07 +0200 Subject: [PATCH 246/246] netfilter: nf_reject: don't leak dst refcount for loopback packets recent patches to add a WARN() when replacing skb dst entry found an old bug: WARNING: include/linux/skbuff.h:1165 skb_dst_check_unset include/linux/skbuff.h:1164 [inline] WARNING: include/linux/skbuff.h:1165 skb_dst_set include/linux/skbuff.h:1210 [inline] WARNING: include/linux/skbuff.h:1165 nf_reject_fill_skb_dst+0x2a4/0x330 net/ipv4/netfilter/nf_reject_ipv4.c:234 [..] Call Trace: nf_send_unreach+0x17b/0x6e0 net/ipv4/netfilter/nf_reject_ipv4.c:325 nft_reject_inet_eval+0x4bc/0x690 net/netfilter/nft_reject_inet.c:27 expr_call_ops_eval net/netfilter/nf_tables_core.c:237 [inline] .. This is because blamed commit forgot about loopback packets. Such packets already have a dst_entry attached, even at PRE_ROUTING stage. Instead of checking hook just check if the skb already has a route attached to it. Fixes: f53b9b0bdc59 ("netfilter: introduce support for reject at prerouting stage") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20250820123707.10671-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/nf_reject_ipv4.c | 6 ++---- net/ipv6/netfilter/nf_reject_ipv6.c | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 87fd945a0d27..0d3cb2ba6fc8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -247,8 +247,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (!oth) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(oldskb) < 0) + if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -321,8 +320,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 838295fa32e3..cb2d38e80de9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -293,7 +293,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { + if (!skb_dst(oldskb)) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -397,8 +397,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);