Skip to content

Commit

Permalink
Revert "md/raid10: improve raid10 discard request"
Browse files Browse the repository at this point in the history
This reverts commit bcc90d2.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <[email protected]>
Cc: Xiao Ni <[email protected]>
Signed-off-by: Song Liu <[email protected]>
  • Loading branch information
liu-song-6 committed Dec 10, 2020
1 parent 82fe9af commit d7cb6be
Showing 1 changed file with 1 addition and 255 deletions.
256 changes: 1 addition & 255 deletions drivers/md/raid10.c
Original file line number Diff line number Diff line change
Expand Up @@ -1516,256 +1516,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
raid10_write_request(mddev, bio, r10_bio);
}

static struct bio *raid10_split_bio(struct r10conf *conf,
struct bio *bio, sector_t sectors, bool want_first)
{
struct bio *split;

split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
bio_chain(split, bio);
allow_barrier(conf);
if (want_first) {
submit_bio_noacct(bio);
bio = split;
} else
submit_bio_noacct(split);
wait_barrier(conf);

return bio;
}

static void raid10_end_discard_request(struct bio *bio)
{
struct r10bio *r10_bio = bio->bi_private;
struct r10conf *conf = r10_bio->mddev->private;
struct md_rdev *rdev = NULL;
int dev;
int slot, repl;

/*
* We don't care the return value of discard bio
*/
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
set_bit(R10BIO_Uptodate, &r10_bio->state);

dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
if (repl)
rdev = conf->mirrors[dev].replacement;
if (!rdev) {
/* raid10_remove_disk uses smp_mb to make sure rdev is set to
* replacement before setting replacement to NULL. It can read
* rdev first without barrier protect even replacment is NULL
*/
smp_rmb();
rdev = conf->mirrors[dev].rdev;
}

if (atomic_dec_and_test(&r10_bio->remaining)) {
md_write_end(r10_bio->mddev);
raid_end_bio_io(r10_bio);
}

rdev_dec_pending(rdev, conf->mddev);
}

/* There are some limitations to handle discard bio
* 1st, the discard size is bigger than stripe_size*2.
* 2st, if the discard bio spans reshape progress, we use the old way to
* handle discard bio
*/
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
struct geom *geo = &conf->geo;
struct r10bio *r10_bio;

int disk;
sector_t chunk;
unsigned int stripe_size;
sector_t split_size;

sector_t bio_start, bio_end;
sector_t first_stripe_index, last_stripe_index;
sector_t start_disk_offset;
unsigned int start_disk_index;
sector_t end_disk_offset;
unsigned int end_disk_index;
unsigned int remainder;

if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return -EAGAIN;

wait_barrier(conf);

/* Check reshape again to avoid reshape happens after checking
* MD_RECOVERY_RESHAPE and before wait_barrier
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
goto out;

stripe_size = geo->raid_disks << geo->chunk_shift;
bio_start = bio->bi_iter.bi_sector;
bio_end = bio_end_sector(bio);

/* Maybe one discard bio is smaller than strip size or across one stripe
* and discard region is larger than one stripe size. For far offset layout,
* if the discard region is not aligned with stripe size, there is hole
* when we submit discard bio to member disk. For simplicity, we only
* handle discard bio which discard region is bigger than stripe_size*2
*/
if (bio_sectors(bio) < stripe_size*2)
goto out;

/* For far offset layout, if bio is not aligned with stripe size, it splits
* the part that is not aligned with strip size.
*/
div_u64_rem(bio_start, stripe_size, &remainder);
if (geo->far_offset && remainder) {
split_size = stripe_size - remainder;
bio = raid10_split_bio(conf, bio, split_size, false);
}
div_u64_rem(bio_end, stripe_size, &remainder);
if (geo->far_offset && remainder) {
split_size = bio_sectors(bio) - remainder;
bio = raid10_split_bio(conf, bio, split_size, true);
}

r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
r10_bio->mddev = mddev;
r10_bio->state = 0;
r10_bio->sectors = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);

wait_blocked_dev(mddev, r10_bio);

r10_bio->master_bio = bio;

bio_start = bio->bi_iter.bi_sector;
bio_end = bio_end_sector(bio);

/* raid10 uses chunk as the unit to store data. It's similar like raid0.
* One stripe contains the chunks from all member disk (one chunk from
* one disk at the same HBA address). For layout detail, see 'man md 4'
*/
chunk = bio_start >> geo->chunk_shift;
chunk *= geo->near_copies;
first_stripe_index = chunk;
start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
if (geo->far_offset)
first_stripe_index *= geo->far_copies;
start_disk_offset = (bio_start & geo->chunk_mask) +
(first_stripe_index << geo->chunk_shift);

chunk = bio_end >> geo->chunk_shift;
chunk *= geo->near_copies;
last_stripe_index = chunk;
end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
if (geo->far_offset)
last_stripe_index *= geo->far_copies;
end_disk_offset = (bio_end & geo->chunk_mask) +
(last_stripe_index << geo->chunk_shift);

rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[disk].replacement);

r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL;

if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
rrdev = NULL;
if (!rdev && !rrdev)
continue;

if (rdev) {
r10_bio->devs[disk].bio = bio;
atomic_inc(&rdev->nr_pending);
}
if (rrdev) {
r10_bio->devs[disk].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
}
}
rcu_read_unlock();

atomic_set(&r10_bio->remaining, 1);
for (disk = 0; disk < geo->raid_disks; disk++) {
sector_t dev_start, dev_end;
struct bio *mbio, *rbio = NULL;
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[disk].replacement);

/*
* Now start to calculate the start and end address for each disk.
* The space between dev_start and dev_end is the discard region.
*
* For dev_start, it needs to consider three conditions:
* 1st, the disk is before start_disk, you can imagine the disk in
* the next stripe. So the dev_start is the start address of next
* stripe.
* 2st, the disk is after start_disk, it means the disk is at the
* same stripe of first disk
* 3st, the first disk itself, we can use start_disk_offset directly
*/
if (disk < start_disk_index)
dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
else if (disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors;
else
dev_start = start_disk_offset;

if (disk < end_disk_index)
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
else if (disk > end_disk_index)
dev_end = last_stripe_index * mddev->chunk_sectors;
else
dev_end = end_disk_offset;

/* It only handles discard bio which size is >= stripe size, so
* dev_end > dev_start all the time
*/
if (r10_bio->devs[disk].bio) {
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
mbio->bi_end_io = raid10_end_discard_request;
mbio->bi_private = r10_bio;
r10_bio->devs[disk].bio = mbio;
r10_bio->devs[disk].devnum = disk;
atomic_inc(&r10_bio->remaining);
md_submit_discard_bio(mddev, rdev, mbio,
dev_start + choose_data_offset(r10_bio, rdev),
dev_end - dev_start);
bio_endio(mbio);
}
if (r10_bio->devs[disk].repl_bio) {
rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
rbio->bi_end_io = raid10_end_discard_request;
rbio->bi_private = r10_bio;
r10_bio->devs[disk].repl_bio = rbio;
r10_bio->devs[disk].devnum = disk;
atomic_inc(&r10_bio->remaining);
md_submit_discard_bio(mddev, rrdev, rbio,
dev_start + choose_data_offset(r10_bio, rrdev),
dev_end - dev_start);
bio_endio(rbio);
}
}

if (atomic_dec_and_test(&r10_bio->remaining)) {
md_write_end(r10_bio->mddev);
raid_end_bio_io(r10_bio);
}

return 0;
out:
allow_barrier(conf);
return -EAGAIN;
}

static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
Expand All @@ -1780,10 +1530,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
if (!md_write_start(mddev, bio))
return false;

if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
if (!raid10_handle_discard(mddev, bio))
return true;

/*
* If this request crosses a chunk boundary, we need to split
* it.
Expand Down Expand Up @@ -4023,7 +3769,7 @@ static int raid10_run(struct mddev *mddev)

if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
UINT_MAX);
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
Expand Down

0 comments on commit d7cb6be

Please sign in to comment.