From 46f21952c492243b138281dc4cb755ab63b637c4 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Sun, 16 Nov 2025 10:18:16 +0800
Subject: [PATCH 1/4] md/raid0: fix NULL pointer dereference in create_strip_zones() for dm-raid

Commit 2107457e31fa ("md/raid0: Move queue limit setup before r0conf
initialization") dereferences mddev->gendisk unconditionally, which is
NULL for dm-raid.

Fix this problem by reverting to the old code for dm-raid.

Link: https://lore.kernel.org/linux-raid/20251116021816.107648-1-yukuai@fnnas.com
Fixes: 2107457e31fa ("md/raid0: Move queue limit setup before r0conf initialization")
Reported-and-tested-by: Changhui Zhong
Closes: https://lore.kernel.org/all/CAGVVp+VqVnvGeneUoTbYvBv2cw6GwQRrR3B-iQ-_9rVfyumoKA@mail.gmail.com/
Signed-off-by: Yu Kuai
Reviewed-by: Xiao Ni
Reviewed-by: Li Nan
Reviewed-by: Paul Menzel
---
 drivers/md/raid0.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 47aee1b1d4d1..985c377356eb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -68,7 +68,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	struct strip_zone *zone;
 	int cnt;
 	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-	unsigned int blksize = queue_logical_block_size(mddev->gendisk->queue);
+	unsigned int blksize = 512;
+
+	if (!mddev_is_dm(mddev))
+		blksize = queue_logical_block_size(mddev->gendisk->queue);
 
 	*private_conf = ERR_PTR(-ENOMEM);
 	if (!conf)
@@ -84,6 +87,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		sector_div(sectors, mddev->chunk_sectors);
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
+		if (mddev_is_dm(mddev))
+			blksize = max(blksize, queue_logical_block_size(
+					      rdev1->bdev->bd_disk->queue));
+
 		rdev_for_each(rdev2, mddev) {
 			pr_debug("md/raid0:%s: comparing %pg(%llu)"
 				 " with %pg(%llu)\n",
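As an aside, the combined effect of the two hunks above can be read as a
single standalone sketch. The helper name below is hypothetical and only
for illustration; the identifiers it uses (mddev_is_dm(),
queue_logical_block_size(), rdev_for_each()) are the ones appearing in
the diff:

/*
 * Sketch of the blksize selection after this patch: a native md array
 * has a valid gendisk and reads the logical block size from its queue,
 * while dm-raid has no gendisk, so start from the 512-byte minimum and
 * raise blksize to the largest logical block size among member rdevs.
 */
static unsigned int raid0_pick_blksize(struct mddev *mddev)
{
	struct md_rdev *rdev;
	unsigned int blksize = 512;

	if (!mddev_is_dm(mddev))
		return queue_logical_block_size(mddev->gendisk->queue);

	rdev_for_each(rdev, mddev)
		blksize = max(blksize, queue_logical_block_size(
					rdev->bdev->bd_disk->queue));
	return blksize;
}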
From 8c9e376b9d1a222fa02b93b615d2e25be0a91fed Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 17 Nov 2025 16:55:56 +0800
Subject: [PATCH 2/4] md: warn about updating super block failure

Many personalities handle IO errors from their daemon thread (raid1d,
raid10d, raid5d), and the superblock must be clean before such failed
IO can be handled. However, updating the superblock can fail, for
example because the array is broken by IO failures, or because the user
set the array read-only through the array_state sysfs attribute.

Add a warning when updating the superblock fails, so that a subsequent
IO hang can be traced back to it.

Link: https://lore.kernel.org/linux-raid/20251117085557.770572-2-yukuai@fnnas.com
Signed-off-by: Yu Kuai
Reviewed-by: Li Nan
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7b5c5967568f..345b1e623aba 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2788,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	if (!md_is_rdwr(mddev)) {
 		if (force_change)
 			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+		pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
 		return;
 	}
 

From a913d1f6a7f607c110aeef8b58c8988f47a4b24e Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 17 Nov 2025 16:55:57 +0800
Subject: [PATCH 3/4] md/raid5: fix IO hang when array is broken with IO inflight

The following test can cause an IO hang:

mdadm -CvR /dev/md0 -l10 -n4 /dev/sd[abcd] --assume-clean --chunk=64K --bitmap=none
sleep 5
echo 1 > /sys/block/sda/device/delete
echo 1 > /sys/block/sdb/device/delete
echo 1 > /sys/block/sdc/device/delete
echo 1 > /sys/block/sdd/device/delete
dd if=/dev/md0 of=/dev/null bs=8k count=1 iflag=direct

Root cause:
1) All disks are removed, however all rdevs in the array are still in
   sync, so IO is issued normally.
2) IO to sda fails, and setting badblocks fails as well; sda becomes
   faulty and MD_SB_CHANGE_PENDING is set.
3) Error recovery tries to recover this IO from the other disks; IO is
   issued to sdb, sdc and sdd.
4) IO to sdb fails, and setting badblocks fails again; now the array is
   broken and becomes read-only.
5) IO to sdc and sdd fails as well, however the stripe can't be handled
   anymore because MD_SB_CHANGE_PENDING is set:

handle_stripe
 if (test_bit MD_SB_CHANGE_PENDING)
  set_bit STRIPE_HANDLE
  goto finish // skip handling failed stripe
release_stripe
 if (test_bit STRIPE_HANDLE)
  list_add_tail conf->handle_list

6) Later, raid5d can't handle the failed stripe either:

raid5d
 md_check_recovery
  md_update_sb
   if (!md_is_rdwr())
    // can't clear pending bit
    return
 if (test_bit MD_SB_CHANGE_PENDING)
  break; // can't handle failed stripe

Since MD_SB_CHANGE_PENDING can never be cleared for a read-only array,
fix this problem by skipping this check for read-only arrays.

Link: https://lore.kernel.org/linux-raid/20251117085557.770572-3-yukuai@fnnas.com
Fixes: d87f064f5874 ("md: never update metadata when array is read-only.")
Signed-off-by: Yu Kuai
Reviewed-by: Li Nan
---
 drivers/md/raid5.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cdbc7eba5c54..e57ce3295292 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4956,7 +4956,8 @@ static void handle_stripe(struct stripe_head *sh)
 		goto finish;
 
 	if (s.handle_bad_blocks ||
-	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
+	    (md_is_rdwr(conf->mddev) &&
+	     test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		goto finish;
 	}
@@ -6768,7 +6769,8 @@ static void raid5d(struct md_thread *thread)
 		int batch_size, released;
 		unsigned int offset;
 
-		if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
+		if (md_is_rdwr(mddev) &&
+		    test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 			break;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
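The shape of the fix is easier to see as a single predicate. The helper
below is hypothetical, written only to restate the two open-coded checks
above; the patch itself does not add it:

/*
 * A pending superblock update should only defer stripe handling while
 * the array is still read-write: once the array is read-only,
 * md_update_sb() returns early (see PATCH 2/4) and MD_SB_CHANGE_PENDING
 * can never be cleared, so waiting on it would hang forever.
 */
static bool raid5_sb_change_blocks_stripe(struct mddev *mddev)
{
	return md_is_rdwr(mddev) &&
	       test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}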
From fdd0c6a649d24107bbadd249c87feab67b9037c5 Mon Sep 17 00:00:00 2001
From: Tarun Sahu
Date: Fri, 21 Nov 2025 19:14:22 +0000
Subject: [PATCH 4/4] md: remove legacy 1s delay in md_notify_reboot

During system shutdown, the md driver's registered reboot notifier
(md_notify_reboot) imposes a hardcoded one-second delay. This delay was
introduced approximately 23 years ago and was likely only needed by the
hardware generation of that time.

Propose removing the delay, on the assumption that no known devices
still need it.

Link: https://lore.kernel.org/linux-raid/20251121191422.2758555-1-tarunsahu@google.com
Signed-off-by: Tarun Sahu
Reviewed-by: Yu Kuai
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 345b1e623aba..e5922a682953 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -10408,7 +10408,6 @@ static int md_notify_reboot(struct notifier_block *this,
 			    unsigned long code, void *x)
 {
 	struct mddev *mddev;
-	int need_delay = 0;
 
 	spin_lock(&all_mddevs_lock);
 	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
@@ -10422,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this,
 			mddev->safemode = 2;
 			mddev_unlock(mddev);
 		}
-		need_delay = 1;
 		spin_lock(&all_mddevs_lock);
 		mddev_put_locked(mddev);
 	}
 	spin_unlock(&all_mddevs_lock);
 
-	/*
-	 * certain more exotic SCSI devices are known to be
-	 * volatile wrt too early system reboots. While the
-	 * right place to handle this issue is the given
-	 * driver, we do want to have a safe RAID driver ...
-	 */
-	if (need_delay)
-		msleep(1000);
-
 	return NOTIFY_DONE;
 }
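With the delay removed, the notifier reduces to walking the arrays and
marking them safe. A condensed rendering of the resulting function,
reconstructed only from the hunks above (the per-array work between the
two hunks is elided as a comment, so this is a sketch rather than the
full upstream code):

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct mddev *mddev;

	spin_lock(&all_mddevs_lock);
	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
		/*
		 * ... per-array work elided: the hunk above shows it
		 * ends by setting mddev->safemode = 2, dropping the
		 * mddev lock, and retaking all_mddevs_lock ...
		 */
		mddev_put_locked(mddev);
	}
	spin_unlock(&all_mddevs_lock);

	return NOTIFY_DONE;
}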