Hi all!

This is my suggestion on how to do on-the-fly repair for corrupted raid
setups. Currently, btrfs copes with a hardware failure by trying to find
another mirror and ... that's it. The bad mirror always stays bad, and your
data is lost when the last copy vanishes.

Here is how I went about changing this: I built upon the retry code
originally used for data (inode.c), moved it to a more central place
(extent_io.c) and made it repair errors when possible. Those two steps are
currently included in patch 4, because what I actually did was somewhat more
iterative. If it helps reviewing, I can try to split that up into a
move-commit and a change-commit - just tell me you'd like this.

To test this, I made some bad sectors with hdparm (data and metadata) and
had them corrected while reading the affected data.

Anyway, this patch touches critical parts and can potentially screw up your
data, in case I have an error in the determination of the destination for
corrective writes. You have been warned! But please, try it anyway :-)

One remark concerning scrub: My latest scrub patches include a change that
triggers a regular page read to correct some kinds of errors. This code is
meant to end up exactly in the error correction routines added here, too.
There are some special cases (nodatasum and a certain state of page cache)
where scrub comes across an error that it reports as incorrectable, which it
isn't. I have a patch for that as well, but as it is only relevant when you
combine those two patch series, I did not include it.

-Jan

Jan Schmidt (4):
  btrfs: btrfs_multi_bio replaced with btrfs_bio
  btrfs: Do not use bio->bi_bdev after submission
  btrfs: Put mirror_num in bi_bdev
  btrfs: Moved repair code from inode.c to extent_io.c

 fs/btrfs/extent-tree.c |   10 +-
 fs/btrfs/extent_io.c   |  386 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent_io.h   |   11 ++-
 fs/btrfs/inode.c       |  155 +------------------
 fs/btrfs/scrub.c       |   20 ++-
 fs/btrfs/volumes.c     |  130 +++++++++--------
 fs/btrfs/volumes.h     |   10 +-
 7 files changed, 485 insertions(+), 237 deletions(-)

-- 
1.7.3.4
Jan Schmidt
2011-Jul-22 14:58 UTC
[RFC PATCH 1/4] btrfs: btrfs_multi_bio replaced with btrfs_bio
btrfs_bio is a bio abstraction able to split and not complete after the last
bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio
tracks the mirror_num used to read the data, which can be used for error
correction purposes.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent-tree.c |   10 ++--
 fs/btrfs/scrub.c       |   20 ++++----
 fs/btrfs/volumes.c     |  128 +++++++++++++++++++++++++----------------------
 fs/btrfs/volumes.h     |   10 +++-
 4 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456..351efb3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1772,18 +1772,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 {
 	int ret;
 	u64 discarded_bytes = 0;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 
 
 	/* Tell the block device(s) that the sectors can be discarded */
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-			      bytenr, &num_bytes, &multi, 0);
+			      bytenr, &num_bytes, &bbio, 0);
 	if (!ret) {
-		struct btrfs_bio_stripe *stripe = multi->stripes;
+		struct btrfs_bio_stripe *stripe = bbio->stripes;
 		int i;
 
-		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 			ret = btrfs_issue_discard(stripe->dev->bdev,
 						  stripe->physical,
 						  stripe->length);
@@ -1792,7 +1792,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 			else if (ret != -EOPNOTSUPP)
 				break;
 		}
-		kfree(multi);
+		kfree(bbio);
 	}
 	if (discarded_bytes && ret == -EOPNOTSUPP)
 		ret = 0;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5..c04775e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -250,7 +250,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	struct scrub_dev *sdev = sbio->sdev;
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 	u64 logical = sbio->logical + ix * PAGE_SIZE;
 	u64 length;
 	int i;
@@ -269,8 +269,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
 	length = PAGE_SIZE;
 	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-			      &multi, 0);
-	if (ret || !multi || length < PAGE_SIZE) {
+			      &bbio, 0);
+	if (ret || !bbio || length < PAGE_SIZE) {
 		printk(KERN_ERR
 		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
 		       (unsigned long long)logical);
@@ -278,19 +278,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		return;
 	}
 
-	if (multi->num_stripes == 1)
+	if (bbio->num_stripes == 1)
 		/* there aren't any replicas */
 		goto uncorrectable;
 
 	/*
 	 * first find a good copy
 	 */
-	for (i = 0; i < multi->num_stripes; ++i) {
+	for (i = 0; i < bbio->num_stripes; ++i) {
 		if (i == sbio->spag[ix].mirror_num)
 			continue;
 
-		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
-				   multi->stripes[i].physical >> 9,
+		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+				   bbio->stripes[i].physical >> 9,
 				   sbio->bio->bi_io_vec[ix].bv_page)) {
 			/* I/O-error, this is not a good copy */
 			continue;
@@ -299,7 +299,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		if (scrub_fixup_check(sbio, ix) == 0)
 			break;
 	}
-	if (i == multi->num_stripes)
+	if (i == bbio->num_stripes)
 		goto uncorrectable;
 
 	if (!sdev->readonly) {
@@ -314,7 +314,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		}
 	}
 
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.corrected_errors;
 	spin_unlock(&sdev->stat_lock);
@@ -325,7 +325,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	return;
 
 uncorrectable:
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.uncorrectable_errors;
 	spin_unlock(&sdev->stat_lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc..e839b72 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2801,7 +2801,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
-			     struct btrfs_multi_bio **multi_ret,
+			     struct btrfs_bio **bbio_ret,
 			     int mirror_num)
 {
 	struct extent_map *em;
@@ -2819,18 +2819,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int i;
 	int num_stripes;
 	int max_errors = 0;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 
-	if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
+	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
 		stripes_allocated = 1;
 again:
-	if (multi_ret) {
-		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+	if (bbio_ret) {
+		bbio = kzalloc(btrfs_bio_size(stripes_allocated),
 				GFP_NOFS);
-		if (!multi)
+		if (!bbio)
 			return -ENOMEM;
 
-		atomic_set(&multi->error, 0);
+		atomic_set(&bbio->error, 0);
 	}
 
 	read_lock(&em_tree->lock);
@@ -2851,7 +2851,7 @@ again:
 	if (mirror_num > map->num_stripes)
 		mirror_num = 0;
 
-	/* if our multi bio struct is too small, back off and try again */
+	/* if our btrfs_bio struct is too small, back off and try again */
 	if (rw & REQ_WRITE) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2870,11 +2870,11 @@ again:
 			stripes_required = map->num_stripes;
 		}
 	}
-	if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
-		kfree(multi);
+		kfree(bbio);
 		goto again;
 	}
 	stripe_nr = offset;
@@ -2903,7 +2903,7 @@ again:
 		*length = em->len - offset;
 	}
 
-	if (!multi_ret)
+	if (!bbio_ret)
 		goto out;
 
 	num_stripes = 1;
@@ -2928,13 +2928,17 @@ again:
 			stripe_index = find_live_mirror(map, 0,
 					    map->num_stripes,
 					    current->pid % map->num_stripes);
+			mirror_num = stripe_index + 1;
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (REQ_WRITE | REQ_DISCARD))
+		if (rw & (REQ_WRITE | REQ_DISCARD)) {
 			num_stripes = map->num_stripes;
-		else if (mirror_num)
+		} else if (mirror_num) {
 			stripe_index = mirror_num - 1;
+		} else {
+			mirror_num = 1;
+		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
@@ -2954,6 +2958,7 @@ again:
 			stripe_index = find_live_mirror(map, stripe_index,
 					      map->sub_stripes, stripe_index +
 					      current->pid % map->sub_stripes);
+			mirror_num = stripe_index + 1;
 		}
 	} else {
 		/*
@@ -2962,15 +2967,16 @@ again:
 		 * stripe_index is the number of our device in the stripe array
 		 */
 		stripe_index = do_div(stripe_nr, map->num_stripes);
+		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
 	if (rw & REQ_DISCARD) {
 		for (i = 0; i < num_stripes; i++) {
-			multi->stripes[i].physical =
+			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset + stripe_nr * map->stripe_len;
-			multi->stripes[i].dev = map->stripes[stripe_index].dev;
+			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
 			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 				u64 stripes;
@@ -2991,16 +2997,16 @@ again:
 				}
 				stripes = stripe_nr_end - 1 - j;
 				do_div(stripes, map->num_stripes);
-				multi->stripes[i].length = map->stripe_len *
+				bbio->stripes[i].length = map->stripe_len *
 					(stripes - stripe_nr + 1);
 
 				if (i == 0) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_offset;
 					stripe_offset = 0;
 				}
 				if (stripe_index == last_stripe)
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_end_offset;
 			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 				u64 stripes;
@@ -3025,11 +3031,11 @@ again:
 				}
 				stripes = stripe_nr_end - 1 - j;
 				do_div(stripes, factor);
-				multi->stripes[i].length = map->stripe_len *
+				bbio->stripes[i].length = map->stripe_len *
 					(stripes - stripe_nr + 1);
 
 				if (i < map->sub_stripes) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_offset;
 					if (i == map->sub_stripes - 1)
 						stripe_offset = 0;
				}
@@ -3037,11 +3043,11 @@ again:
 				if (stripe_index >= last_stripe &&
 				    stripe_index <= (last_stripe +
 						     map->sub_stripes - 1)) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_end_offset;
 				}
 			} else
-				multi->stripes[i].length = *length;
+				bbio->stripes[i].length = *length;
 
 			stripe_index++;
 			if (stripe_index == map->num_stripes) {
@@ -3052,19 +3058,20 @@ again:
 		}
 	} else {
 		for (i = 0; i < num_stripes; i++) {
-			multi->stripes[i].physical =
+			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset +
 				stripe_nr * map->stripe_len;
-			multi->stripes[i].dev =
+			bbio->stripes[i].dev =
 				map->stripes[stripe_index].dev;
 			stripe_index++;
 		}
 	}
-	if (multi_ret) {
-		*multi_ret = multi;
-		multi->num_stripes = num_stripes;
-		multi->max_errors = max_errors;
+	if (bbio_ret) {
+		*bbio_ret = bbio;
+		bbio->num_stripes = num_stripes;
+		bbio->max_errors = max_errors;
+		bbio->mirror_num = mirror_num;
 	}
 out:
 	free_extent_map(em);
@@ -3073,9 +3080,9 @@ out:
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num)
+		    struct btrfs_bio **bbio_ret, int mirror_num)
 {
-	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
 				 mirror_num);
 }
 
@@ -3144,28 +3151,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-static void end_bio_multi_stripe(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err)
 {
-	struct btrfs_multi_bio *multi = bio->bi_private;
+	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
 	if (err)
-		atomic_inc(&multi->error);
+		atomic_inc(&bbio->error);
 
-	if (bio == multi->orig_bio)
+	if (bio == bbio->orig_bio)
 		is_orig_bio = 1;
 
-	if (atomic_dec_and_test(&multi->stripes_pending)) {
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		if (!is_orig_bio) {
 			bio_put(bio);
-			bio = multi->orig_bio;
+			bio = bbio->orig_bio;
 		}
-		bio->bi_private = multi->private;
-		bio->bi_end_io = multi->end_io;
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-		if (atomic_read(&multi->error) > multi->max_errors) {
+		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
 		} else if (err) {
 			/*
@@ -3175,7 +3182,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
 			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			err = 0;
 		}
-		kfree(multi);
+		kfree(bbio);
 
 		bio_endio(bio, err);
 	} else if (!is_orig_bio) {
@@ -3255,20 +3262,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct btrfs_multi_bio *multi = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
+	struct btrfs_bio *bbio = NULL;
 
 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
 			      mirror_num);
 	BUG_ON(ret);
 
-	total_devs = multi->num_stripes;
+	total_devs = bbio->num_stripes;
 	if (map_length < length) {
 		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -3276,25 +3283,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		       (unsigned long long)map_length);
 		BUG();
 	}
-	multi->end_io = first_bio->bi_end_io;
-	multi->private = first_bio->bi_private;
-	multi->orig_bio = first_bio;
-	atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	while (dev_nr < total_devs) {
-		if (total_devs > 1) {
-			if (dev_nr < total_devs - 1) {
-				bio = bio_clone(first_bio, GFP_NOFS);
-				BUG_ON(!bio);
-			} else {
-				bio = first_bio;
-			}
-			bio->bi_private = multi;
-			bio->bi_end_io = end_bio_multi_stripe;
+		if (dev_nr < total_devs - 1) {
+			bio = bio_clone(first_bio, GFP_NOFS);
+			BUG_ON(!bio);
+		} else {
+			bio = first_bio;
 		}
-		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
-		dev = multi->stripes[dev_nr].dev;
+		bio->bi_private = bbio;
+		bio->bi_end_io = btrfs_end_bio;
+		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
+		dev = bbio->stripes[dev_nr].dev;
 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+			pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+				 "(%s id %llu), size=%u\n", rw,
+				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+				 dev->name, dev->devid, bio->bi_size);
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
@@ -3307,8 +3317,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		dev_nr++;
 	}
-	if (total_devs == 1)
-		kfree(multi);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61..f5ce92b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -134,7 +134,10 @@ struct btrfs_bio_stripe {
 	u64 length;		/* only used for discard mappings */
 };
 
-struct btrfs_multi_bio {
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+struct btrfs_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
 	struct bio *orig_bio;
@@ -142,6 +145,7 @@ struct btrfs_multi_bio {
 	atomic_t error;
 	int max_errors;
 	int num_stripes;
+	int mirror_num;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -169,7 +173,7 @@ struct map_lookup {
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 				   u64 end, u64 *length);
 
-#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -178,7 +182,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   u64 chunk_offset, u64 start, u64 num_bytes);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num);
+		    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
-- 
1.7.3.4
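The mirror numbering this patch introduces is 1-based: mirror_num == 0 means
"no preference", and mirror n corresponds to stripes[n - 1] of the btrfs_bio.
A tiny userspace sketch of that indexing (illustrative only; the struct is a
stand-in for the kernel one, and the values are made up):

```c
/* Illustrative userspace sketch of the 1-based mirror_num convention
 * from patch 1: mirror n is served by stripes[n - 1]. */
#include <stdio.h>

struct demo_stripe {
	unsigned long long physical;	/* byte offset on the device */
	int devid;			/* which device holds this copy */
};

int main(void)
{
	/* a RAID1 chunk: two copies of the same logical range */
	struct demo_stripe stripes[] = {
		{ 12582912, 1 },
		{ 29360128, 2 },
	};
	int num_stripes = 2;
	int mirror_num;

	for (mirror_num = 1; mirror_num <= num_stripes; mirror_num++)
		printf("mirror_num %d -> stripes[%d]: devid %d, physical %llu\n",
		       mirror_num, mirror_num - 1,
		       stripes[mirror_num - 1].devid,
		       stripes[mirror_num - 1].physical);
	return 0;
}
```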
Jan Schmidt
2011-Jul-22 14:58 UTC
[RFC PATCH 2/4] btrfs: Do not use bio->bi_bdev after submission
The block layer modifies bio->bi_bdev and bio->bi_sector while working on the
bio; they do _not_ come back unmodified in the completion callback. To call
add_page, we need at least some bi_bdev set, which is why the code was
working, previously. With this patch, we use the latest_bdev from fsinfo
instead of the leftover in the bio. This gives us the possibility to use the
bi_bdev field for another purpose.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/inode.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4a13730..6ec7a93 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1916,7 +1916,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_private = state;
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = failed_bio->bi_bdev;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_size = 0;
 
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-- 
1.7.3.4
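The pitfall this patch works around can be pictured outside the kernel: a
lower layer rewrites the request's routing fields while it owns the request,
so the completion path must resubmit against a value saved up front
(latest_bdev plays that role in the patch). A hedged userspace analogy, with
every name hypothetical:

```c
/* Userspace analogy (hypothetical, just to illustrate the pitfall):
 * the "block layer" rewrites the request's target while processing it,
 * so the completion side must not reuse those fields for resubmission. */
#include <stdio.h>

struct request {
	int target_dev;		/* like bio->bi_bdev */
	long sector;		/* like bio->bi_sector */
};

static void block_layer_submit(struct request *rq)
{
	/* e.g. partition remapping: the request comes back modified */
	rq->sector += 2048;
	rq->target_dev = 259;
}

int main(void)
{
	struct request rq = { .target_dev = 8, .sector = 100 };
	int known_good_dev = rq.target_dev;	/* saved up front, like latest_bdev */

	block_layer_submit(&rq);
	printf("after completion: dev=%d sector=%ld (leftovers, don't reuse)\n",
	       rq.target_dev, rq.sector);
	printf("resubmit against saved dev=%d instead\n", known_good_dev);
	return 0;
}
```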
Jan Schmidt
2011-Jul-22 14:58 UTC
[RFC PATCH 3/4] btrfs: Put mirror_num in bi_bdev
The error correction code wants to make sure that only the bad mirror is
rewritten. Thus, we need to know which mirror is the bad one. I did not find
a more appropriate field than bi_bdev. But I think using this is fine,
because it is modified by the block layer anyway and should not be read
after the bio returned.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/volumes.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e839b72..55fbd4d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3169,6 +3169,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		}
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
+		bio->bi_bdev = (struct block_device *)
+					(unsigned long)bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-- 
1.7.3.4
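The trick here is just an integer riding in a pointer-sized field; since
mirror numbers are tiny, the round-trip through (struct block_device *) and
back is lossless. A runnable userspace sketch of that round-trip (names are
illustrative, not the kernel types):

```c
/* Runnable sketch of the smuggling trick in patch 3: an integer mirror
 * number rides in a pointer-sized field and survives the casts back. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int mirror_num = 2;

	/* encode, as btrfs_end_bio stores it into bio->bi_bdev */
	void *bi_bdev = (void *)(unsigned long)mirror_num;

	/* decode, as end_bio_extent_readpage reads it back in patch 4 */
	unsigned long failed_mirror = (unsigned long)bi_bdev;

	assert(failed_mirror == (unsigned long)mirror_num);
	printf("failed_mirror = %lu\n", failed_mirror);
	return 0;
}
```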
Jan Schmidt
2011-Jul-22 14:58 UTC
[RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and makes the
raid-retry code a raid-repair code.

Repair works that way: Whenever a read error occurs and we have more
mirrors to try, note the failed mirror, and retry another. If we find a
good one, check if we did note a failure earlier and if so, do not allow
the read to complete until after the bad sector was written with the good
data we just fetched. As we have the extent locked while reading, no one
can change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.c |  386 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent_io.h |   11 ++-
 fs/btrfs/inode.c     |  155 +--------------------
 3 files changed, 393 insertions(+), 159 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b181a94..7fca3ed 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -15,6 +15,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1642,6 +1643,367 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+				int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start-page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+		"sector %llu)\n", page->mapping->host->i_ino, start,
+		dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+						failrec->len);
+		if (num_copies > 1) {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this
+ * as needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+				u64 start, u64 end, int failed_mirror,
+				struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			&BTRFS_I(inode)->root->fs_info->mapping_tree,
+			failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 * a) deliver good data to the caller
+	 * b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+					failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1740,6 +2102,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	struct extent_state *cached = NULL;
 	struct extent_state *state;
 
+	pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+		 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+		 (long int)bio->bi_bdev);
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1770,11 +2135,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+							 failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1854,6 +2227,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2966,7 +3340,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2991,7 +3365,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92e..e29d54d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,7 @@
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_DAMAGED (1 << 13)
 
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -67,7 +68,7 @@ struct extent_io_ops {
 			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end,
+				       u64 start, u64 end, u64 failed_mirror,
 				       struct extent_state *state);
 	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
 					u64 start, u64 end,
@@ -243,6 +244,8 @@ void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
+unsigned long num_extent_pages(u64 start, u64 len);
+struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -297,4 +300,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+
+struct btrfs_mapping_tree;
+
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6ec7a93..86195d4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
-#include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
+#include "volumes.h"
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
@@ -1820,153 +1820,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 }
 
 /*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
-			 struct page *page, u64 start, u64 end,
-			 struct extent_state *state)
-{
-	struct io_failure_record *failrec = NULL;
-	u64 private;
-	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
-	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	int num_copies;
-	int ret;
-	int rw;
-	u64 logical;
-
-	ret = get_state_private(failure_tree, start, &private);
-	if (ret) {
-		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
-		if (!failrec)
-			return -ENOMEM;
-		failrec->start = start;
-		failrec->len = end - start + 1;
-		failrec->last_mirror = 0;
-		failrec->bio_flags = 0;
-
-		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, failrec->len);
-		if (em->start > start || em->start + em->len < start) {
-			free_extent_map(em);
-			em = NULL;
-		}
-		read_unlock(&em_tree->lock);
-
-		if (IS_ERR_OR_NULL(em)) {
-			kfree(failrec);
-			return -EIO;
-		}
-		logical = start - em->start;
-		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			logical = em->block_start;
-			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-			extent_set_compress_type(&failrec->bio_flags,
-						 em->compress_type);
-		}
-		failrec->logical = logical;
-		free_extent_map(em);
-		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
-				EXTENT_DIRTY, GFP_NOFS);
-		set_state_private(failure_tree, start,
-				 (u64)(unsigned long)failrec);
-	} else {
-		failrec = (struct io_failure_record *)(unsigned long)private;
-	}
-	num_copies = btrfs_num_copies(
-			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
-			      failrec->logical, failrec->len);
-	failrec->last_mirror++;
-	if (!state) {
-		spin_lock(&BTRFS_I(inode)->io_tree.lock);
-		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
-						    failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
-	}
-	if (!state || failrec->last_mirror > num_copies) {
-		set_state_private(failure_tree, failrec->start, 0);
-		clear_extent_bits(failure_tree, failrec->start,
-				  failrec->start + failrec->len - 1,
-				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
-		kfree(failrec);
-		return -EIO;
-	}
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_private = state;
-	bio->bi_end_io = failed_bio->bi_end_io;
-	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-	bio->bi_size = 0;
-
-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & REQ_WRITE)
-		rw = WRITE;
-	else
-		rw = READ;
-
-	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror,
-						      failrec->bio_flags, 0);
-	return ret;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
-	u64 private;
-	u64 private_failure;
-	struct io_failure_record *failure;
-	int ret;
-
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				   private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
-	return 0;
-}
-
-/*
  * when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish. If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 				      struct extent_state *state)
@@ -2012,10 +1868,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	kunmap_atomic(kaddr, KM_USER0);
 good:
-	/* if the io failure tree for this inode is non-empty,
-	 * check to see if we've recovered from a failed IO
-	 */
-	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -7384,7 +7236,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
-- 
1.7.3.4
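The heart of the retry policy in bio_readpage_error() is the this_mirror
bookkeeping: start at 0, advance past the mirror that failed, and give up once
all copies have been tried. A runnable userspace rendering of just that loop,
with made-up values, showing the order in which mirrors would be tried if
every retry also failed:

```c
/* Runnable sketch of the retry walk in bio_readpage_error(): advance
 * this_mirror, skip the mirror that failed, stop after num_copies tries. */
#include <stdio.h>

int main(void)
{
	int num_copies = 3;	/* e.g. three copies of the block */
	int failed_mirror = 2;	/* the read that brought us here */
	int this_mirror = 0;

	for (;;) {
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;
		if (this_mirror > num_copies) {
			printf("all mirrors tried, giving up (-EIO)\n");
			break;
		}
		printf("retrying read from mirror %d\n", this_mirror);
	}
	return 0;
}
```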
Andi Kleen
2011-Jul-24 16:24 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
Jan Schmidt <list.btrfs@jan-o-sch.net> writes:
>
> Repair works that way: Whenever a read error occurs and we have more
> mirrors to try, note the failed mirror, and retry another. If we find a
> good one, check if we did note a failure earlier and if so, do not allow
> the read to complete until after the bad sector was written with the good
> data we just fetched. As we have the extent locked while reading, no one
> can change the data in between.

This has the potential for error loops: when the write fails too
you get another error in the log and can flood the log etc.
I assume this could get really noisy if that disk completely
went away.

Perhaps it needs a threshold to see if there aren't too many errors
on the mirror and then stop retrying at some point.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only
Jan Schmidt
2011-Jul-24 17:28 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
On 24.07.2011 18:24, Andi Kleen wrote:
> Jan Schmidt <list.btrfs@jan-o-sch.net> writes:
>>
>> Repair works that way: Whenever a read error occurs and we have more
>> mirrors to try, note the failed mirror, and retry another. If we find a
>> good one, check if we did note a failure earlier and if so, do not allow
>> the read to complete until after the bad sector was written with the good
>> data we just fetched. As we have the extent locked while reading, no one
>> can change the data in between.
>
> This has the potential for error loops: when the write fails too
> you get another error in the log and can flood the log etc.
> I assume this could get really noisy if that disk completely
> went away.

I wasn't clear enough on that: We only track read errors, here. And
error correction can only happen on the read path. So if the write
attempt fails, we can't go into a loop.

> Perhaps it needs a threshold to see if there aren't too many errors
> on the mirror and then stop retrying at some point.

This might make sense for completely broken disks that did not go away,
yet. However, for the future I'd like to see some intelligence in btrfs
monitoring disk errors and automatically replacing a disk after a
certain (maybe configurable) number of errors. For the mean time, I'd
accept a completely broken disk to flood the log.

Anyway, I've got some sata error injectors and will test my patches
with those in the following days. Maybe some obvious point turns up
where we could throttle things.

-Jan
Andi Kleen
2011-Jul-24 23:01 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
> I wasn't clear enough on that: We only track read errors, here. And
> error correction can only happen on the read path. So if the write
> attempt fails, we can't go into a loop.

Not in a loop, but you trigger more IO errors, which can be nasty
if the IO error logging triggers more IO (pretty common because
syslogd calls fsync). And then your code does even more IO, floods
more etc. etc. And the user will be unhappy if their console gets
flooded.

We've had similar problems in the past with readahead causing error
flooding.

Any time where an error can cause more IO you have to be extremely
careful. Right now this seems rather risky to me.

-Andi
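One way to bound the noise being discussed, sketched here as a suggestion
rather than anything from the series, would be to rate-limit the correction
message that patch 4 prints; printk_ratelimited() already existed in kernels
of this era. A hedged fragment (kernel context assumed, not standalone):

```c
/* Hypothetical mitigation sketch (not from the thread): bound the rate
 * of the "read error corrected" message added by patch 4. */
printk_ratelimited(KERN_INFO
	"btrfs read error corrected: ino %lu off %llu (dev %s sector %llu)\n",
	page->mapping->host->i_ino, start, dev->name, sector);
```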
Ian Kent
2011-Jul-25 03:58 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
On Fri, 2011-07-22 at 16:58 +0200, Jan Schmidt wrote:
> The raid-retry code in inode.c can be generalized so that it works for
> metadata as well. Thus, this patch moves it to extent_io.c and makes the
> raid-retry code a raid-repair code.
>
> Repair works that way: Whenever a read error occurs and we have more
> mirrors to try, note the failed mirror, and retry another. If we find a
> good one, check if we did note a failure earlier and if so, do not allow
> the read to complete until after the bad sector was written with the good
> data we just fetched. As we have the extent locked while reading, no one
> can change the data in between.
>
> Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>

[...]

> +	ret = get_state_private(failure_tree, start, &private);
> +	if (ret) {
> +		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
> +		if (!failrec)
> +			return -ENOMEM;

[...]

> +		read_lock(&em_tree->lock);
> +		em = lookup_extent_mapping(em_tree, start, failrec->len);
> +		if (!em) {

Looks like a missing "read_unlock(&em_tree->lock);" here to me?

> +			kfree(failrec);
> +			return -EIO;
> +		}
> +
> +		if (em->start > start || em->start + em->len < start) {
> +			free_extent_map(em);
> +			em = NULL;
> +		}
> +		read_unlock(&em_tree->lock);

[...]
" > + "state=%p, num_copies=%d, next_mirror %d, " > + "failed_mirror %d\n", state, num_copies, > + failrec->this_mirror, failed_mirror); > + free_io_failure(inode, failrec, 0); > + return -EIO; > + } > + > + if (!state) { > + spin_lock(&tree->lock); > + state = find_first_extent_bit_state(tree, failrec->start, > + EXTENT_LOCKED); > + if (state && state->start != failrec->start) > + state = NULL; > + spin_unlock(&tree->lock); > + } > + > + /* > + * there are two premises: > + * a) deliver good data to the caller > + * b) correct the bad sectors on disk > + */ > + if (failed_bio->bi_vcnt > 1) { > + /* > + * to fulfill b), we need to know the exact failing sectors, as > + * we don''t want to rewrite any more than the failed ones. thus, > + * we need separate read requests for the failed bio > + * > + * if the following BUG_ON triggers, our validation request got > + * merged. we need separate requests for our algorithm to work. > + */ > + BUG_ON(failrec->in_validation); > + failrec->in_validation = 1; > + failrec->this_mirror = failed_mirror; > + read_mode = READ_SYNC | REQ_FAILFAST_DEV; > + } else { > + /* > + * we''re ready to fulfill a) and b) alongside. get a good copy > + * of the failed sector and if we succeed, we have setup > + * everything for repair_io_failure to do the rest for us. > + */ > + if (failrec->in_validation) { > + BUG_ON(failrec->this_mirror != failed_mirror); > + failrec->in_validation = 0; > + failrec->this_mirror = 0; > + } > + failrec->failed_mirror = failed_mirror; > + failrec->this_mirror++; > + if (failrec->this_mirror == failed_mirror) > + failrec->this_mirror++; > + read_mode = READ_SYNC; > + } > + > + if (!state || failrec->this_mirror > num_copies) { > + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " > + "next_mirror %d, failed_mirror %d\n", state, > + num_copies, failrec->this_mirror, failed_mirror); > + free_io_failure(inode, failrec, 0); > + return -EIO; > + } > + > + bio = bio_alloc(GFP_NOFS, 1); > + bio->bi_private = state; > + bio->bi_end_io = failed_bio->bi_end_io; > + bio->bi_sector = failrec->logical >> 9; > + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; > + bio->bi_size = 0; > + > + bio_add_page(bio, page, failrec->len, start - page_offset(page)); > + > + pr_debug("bio_readpage_error: submitting new read[%#x] to " > + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, > + failrec->this_mirror, num_copies, failrec->in_validation); > + > + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, > + failrec->bio_flags, 0); > + return 0; > +} > + > /* lots and lots of room for performance fixes in the end_bio funcs */ > > /* > @@ -1740,6 +2102,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) > struct extent_state *cached = NULL; > struct extent_state *state; > > + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " > + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, > + (long int)bio->bi_bdev); > tree = &BTRFS_I(page->mapping->host)->io_tree; > > start = ((u64)page->index << PAGE_CACHE_SHIFT) + > @@ -1770,11 +2135,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err) > state); > if (ret) > uptodate = 0; > + else > + clean_io_failure(start, page); > } > - if (!uptodate && tree->ops && > - tree->ops->readpage_io_failed_hook) { > - ret = tree->ops->readpage_io_failed_hook(bio, page, > - start, end, NULL); > + if (!uptodate) { > + u64 failed_mirror; > + failed_mirror = (u64)bio->bi_bdev; > + if (tree->ops && 
tree->ops->readpage_io_failed_hook) > + ret = tree->ops->readpage_io_failed_hook( > + bio, page, start, end, > + failed_mirror, NULL); > + else > + ret = bio_readpage_error(bio, page, start, end, > + failed_mirror, NULL); > if (ret == 0) { > uptodate > test_bit(BIO_UPTODATE, &bio->bi_flags); > @@ -1854,6 +2227,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, > mirror_num, bio_flags, start); > else > submit_bio(rw, bio); > + > if (bio_flagged(bio, BIO_EOPNOTSUPP)) > ret = -EOPNOTSUPP; > bio_put(bio); > @@ -2966,7 +3340,7 @@ out: > return ret; > } > > -static inline struct page *extent_buffer_page(struct extent_buffer *eb, > +inline struct page *extent_buffer_page(struct extent_buffer *eb, > unsigned long i) > { > struct page *p; > @@ -2991,7 +3365,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, > return p; > } > > -static inline unsigned long num_extent_pages(u64 start, u64 len) > +inline unsigned long num_extent_pages(u64 start, u64 len) > { > return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - > (start >> PAGE_CACHE_SHIFT); > diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h > index a11a92e..e29d54d 100644 > --- a/fs/btrfs/extent_io.h > +++ b/fs/btrfs/extent_io.h > @@ -17,6 +17,7 @@ > #define EXTENT_NODATASUM (1 << 10) > #define EXTENT_DO_ACCOUNTING (1 << 11) > #define EXTENT_FIRST_DELALLOC (1 << 12) > +#define EXTENT_DAMAGED (1 << 13) > #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) > #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) > > @@ -67,7 +68,7 @@ struct extent_io_ops { > unsigned long bio_flags); > int (*readpage_io_hook)(struct page *page, u64 start, u64 end); > int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, > - u64 start, u64 end, > + u64 start, u64 end, u64 failed_mirror, > struct extent_state *state); > int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, > u64 start, u64 end, > @@ -243,6 +244,8 @@ void free_extent_buffer(struct extent_buffer *eb); > int read_extent_buffer_pages(struct extent_io_tree *tree, > struct extent_buffer *eb, u64 start, int wait, > get_extent_t *get_extent, int mirror_num); > +unsigned long num_extent_pages(u64 start, u64 len); > +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); > > static inline void extent_buffer_get(struct extent_buffer *eb) > { > @@ -297,4 +300,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, > struct bio * > btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, > gfp_t gfp_flags); > + > +struct btrfs_mapping_tree; > + > +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, > + u64 length, u64 logical, struct page *page, > + int mirror_num); > #endif > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 6ec7a93..86195d4 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -45,10 +45,10 @@ > #include "btrfs_inode.h" > #include "ioctl.h" > #include "print-tree.h" > -#include "volumes.h" > #include "ordered-data.h" > #include "xattr.h" > #include "tree-log.h" > +#include "volumes.h" > #include "compression.h" > #include "locking.h" > #include "free-space-cache.h" > @@ -1820,153 +1820,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, > } > > /* > - * When IO fails, either with EIO or csum verification fails, we > - * try other mirrors that might have a good copy of the data. This > - * io_failure_record is used to record state as we go through all the > - * mirrors. 
If another mirror has good data, the page is set up to date > - * and things continue. If a good mirror can''t be found, the original > - * bio end_io callback is called to indicate things have failed. > - */ > -struct io_failure_record { > - struct page *page; > - u64 start; > - u64 len; > - u64 logical; > - unsigned long bio_flags; > - int last_mirror; > -}; > - > -static int btrfs_io_failed_hook(struct bio *failed_bio, > - struct page *page, u64 start, u64 end, > - struct extent_state *state) > -{ > - struct io_failure_record *failrec = NULL; > - u64 private; > - struct extent_map *em; > - struct inode *inode = page->mapping->host; > - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; > - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; > - struct bio *bio; > - int num_copies; > - int ret; > - int rw; > - u64 logical; > - > - ret = get_state_private(failure_tree, start, &private); > - if (ret) { > - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); > - if (!failrec) > - return -ENOMEM; > - failrec->start = start; > - failrec->len = end - start + 1; > - failrec->last_mirror = 0; > - failrec->bio_flags = 0; > - > - read_lock(&em_tree->lock); > - em = lookup_extent_mapping(em_tree, start, failrec->len); > - if (em->start > start || em->start + em->len < start) { > - free_extent_map(em); > - em = NULL; > - } > - read_unlock(&em_tree->lock); > - > - if (IS_ERR_OR_NULL(em)) { > - kfree(failrec); > - return -EIO; > - } > - logical = start - em->start; > - logical = em->block_start + logical; > - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { > - logical = em->block_start; > - failrec->bio_flags = EXTENT_BIO_COMPRESSED; > - extent_set_compress_type(&failrec->bio_flags, > - em->compress_type); > - } > - failrec->logical = logical; > - free_extent_map(em); > - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | > - EXTENT_DIRTY, GFP_NOFS); > - set_state_private(failure_tree, start, > - (u64)(unsigned long)failrec); > - } else { > - failrec = (struct io_failure_record *)(unsigned long)private; > - } > - num_copies = btrfs_num_copies( > - &BTRFS_I(inode)->root->fs_info->mapping_tree, > - failrec->logical, failrec->len); > - failrec->last_mirror++; > - if (!state) { > - spin_lock(&BTRFS_I(inode)->io_tree.lock); > - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, > - failrec->start, > - EXTENT_LOCKED); > - if (state && state->start != failrec->start) > - state = NULL; > - spin_unlock(&BTRFS_I(inode)->io_tree.lock); > - } > - if (!state || failrec->last_mirror > num_copies) { > - set_state_private(failure_tree, failrec->start, 0); > - clear_extent_bits(failure_tree, failrec->start, > - failrec->start + failrec->len - 1, > - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); > - kfree(failrec); > - return -EIO; > - } > - bio = bio_alloc(GFP_NOFS, 1); > - bio->bi_private = state; > - bio->bi_end_io = failed_bio->bi_end_io; > - bio->bi_sector = failrec->logical >> 9; > - bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; > - bio->bi_size = 0; > - > - bio_add_page(bio, page, failrec->len, start - page_offset(page)); > - if (failed_bio->bi_rw & REQ_WRITE) > - rw = WRITE; > - else > - rw = READ; > - > - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, > - failrec->last_mirror, > - failrec->bio_flags, 0); > - return ret; > -} > - > -/* > - * each time an IO finishes, we do a fast check in the IO failure tree > - * to see if we need to process or clean up an io_failure_record > - */ > -static int 
btrfs_clean_io_failures(struct inode *inode, u64 start) > -{ > - u64 private; > - u64 private_failure; > - struct io_failure_record *failure; > - int ret; > - > - private = 0; > - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, > - (u64)-1, 1, EXTENT_DIRTY, 0)) { > - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, > - start, &private_failure); > - if (ret == 0) { > - failure = (struct io_failure_record *)(unsigned long) > - private_failure; > - set_state_private(&BTRFS_I(inode)->io_failure_tree, > - failure->start, 0); > - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, > - failure->start, > - failure->start + failure->len - 1, > - EXTENT_DIRTY | EXTENT_LOCKED, > - GFP_NOFS); > - kfree(failure); > - } > - } > - return 0; > -} > - > -/* > * when reads are done, we need to check csums to verify the data is correct > - * if there''s a match, we allow the bio to finish. If not, we go through > - * the io_failure_record routines to find good copies > + * if there''s a match, we allow the bio to finish. If not, the code in > + * extent_io.c will try to find good copies for us. > */ > static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, > struct extent_state *state) > @@ -2012,10 +1868,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, > > kunmap_atomic(kaddr, KM_USER0); > good: > - /* if the io failure tree for this inode is non-empty, > - * check to see if we''ve recovered from a failed IO > - */ > - btrfs_clean_io_failures(inode, start); > return 0; > > zeroit: > @@ -7384,7 +7236,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { > .readpage_end_io_hook = btrfs_readpage_end_io_hook, > .writepage_end_io_hook = btrfs_writepage_end_io_hook, > .writepage_start_hook = btrfs_writepage_start_hook, > - .readpage_io_failed_hook = btrfs_io_failed_hook, > .set_bit_hook = btrfs_set_bit_hook, > .clear_bit_hook = btrfs_clear_bit_hook, > .merge_extent_hook = btrfs_merge_extent_hook,-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
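For readers following the control flow above, a minimal standalone sketch of the mirror rotation that bio_readpage_error() performs; the helper name next_mirror is hypothetical, in the patch this logic is inlined:

/*
 * Illustration only: starting from this_mirror == 0, each call returns
 * the next mirror to try (mirrors are numbered 1..num_copies), skipping
 * the one that already failed; 0 means all copies are exhausted.
 */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
        this_mirror++;
        if (this_mirror == failed_mirror)
                this_mirror++;
        if (this_mirror > num_copies)
                return 0;       /* give up: the caller returns -EIO */
        return this_mirror;
}

With num_copies == 2 and failed_mirror == 1, successive calls yield 2 and then 0, i.e. exactly one retry before the hard -EIO.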
Jan Schmidt
2011-Jul-25 08:52 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
On 25.07.2011 01:01, Andi Kleen wrote:
>> I wasn't clear enough on that: We only track read errors, here. And
>> error correction can only happen on the read path. So if the write
>> attempt fails, we can't go into a loop.
>
> Not in a loop, but you trigger more IO errors, which can be nasty
> if the IO error logging triggers more IO (pretty common because
> syslogd calls fsync). And then your code does even more IO, floods
> more, etc. And the user will be unhappy if their
> console gets flooded.

Okay, I see your point now. Thanks for pointing that out.

> We've had similar problems in the past with readahead causing
> error flooding.
>
> Any time where an error can cause more IO you have to be extremely
> careful.
>
> Right now this seems rather risky to me.

Hum. This brings up a lot of questions. Would you consider throttling
an appropriate solution to prevent error flooding? What would you use
as a base? A per-device counter (which might be misleading if there
are more layers below)? A per-filesystem counter (which might need
configurability)? Should those "counters" regenerate over time? Any
other approaches?

-Jan
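To make the regenerating-counter idea concrete, a minimal sketch along the lines of the kernel's generic ratelimit helpers (include/linux/ratelimit.h); the struct, field and call sites here are hypothetical, nothing in this patch set adds them:

#include <linux/ratelimit.h>

/*
 * Sketch only: a per-filesystem cap on corrective I/O, allowing at most
 * 10 repair attempts per 5 seconds. The budget "regenerates" as the
 * interval elapses, so sporadic errors are always repaired while a
 * flood is cut off.
 */
struct repair_throttle {
        struct ratelimit_state repair_rs;
};

static void repair_throttle_init(struct repair_throttle *t)
{
        ratelimit_state_init(&t->repair_rs, 5 * HZ, 10);
}

static int repair_allowed(struct repair_throttle *t)
{
        /* __ratelimit() returns nonzero while the budget lasts */
        return __ratelimit(&t->repair_rs);
}

A per-device variant would embed the same state per device instead; either way, when the budget is spent the caller would skip the corrective write but still deliver the good copy to the reader.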
Jan Schmidt
2011-Jul-25 08:59 UTC
Re: [RFC PATCH 4/4] btrfs: Moved repair code from inode.c to extent_io.c
On 25.07.2011 05:58, Ian Kent wrote:
> On Fri, 2011-07-22 at 16:58 +0200, Jan Schmidt wrote:
>> +static int bio_readpage_error(struct bio *failed_bio, struct page *page,
>> +                              u64 start, u64 end, int failed_mirror,
>> +                              struct extent_state *state)
>> +{
>> +        struct io_failure_record *failrec = NULL;
>> +        u64 private;
>> +        struct extent_map *em;
>> +        struct inode *inode = page->mapping->host;
>> +        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
>> +        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
>> +        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
>> +        struct bio *bio;
>> +        int num_copies;
>> +        int ret;
>> +        int read_mode;
>> +        u64 logical;
>> +
>> +        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
>> +
>> +        ret = get_state_private(failure_tree, start, &private);
>> +        if (ret) {
>> +                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
>> +                if (!failrec)
>> +                        return -ENOMEM;
>> +                failrec->start = start;
>> +                failrec->len = end - start + 1;
>> +                failrec->this_mirror = 0;
>> +                failrec->bio_flags = 0;
>> +                failrec->in_validation = 0;
>> +
>> +                read_lock(&em_tree->lock);
>> +                em = lookup_extent_mapping(em_tree, start, failrec->len);
>> +                if (!em) {
>
> Looks like a missing "read_unlock(&em_tree->lock);" here to me?

Thanks, will be fixed in next version.

-Jan
" >> + "state=%p, num_copies=%d, next_mirror %d, " >> + "failed_mirror %d\n", state, num_copies, >> + failrec->this_mirror, failed_mirror); >> + free_io_failure(inode, failrec, 0); >> + return -EIO; >> + } >> + >> + if (!state) { >> + spin_lock(&tree->lock); >> + state = find_first_extent_bit_state(tree, failrec->start, >> + EXTENT_LOCKED); >> + if (state && state->start != failrec->start) >> + state = NULL; >> + spin_unlock(&tree->lock); >> + } >> + >> + /* >> + * there are two premises: >> + * a) deliver good data to the caller >> + * b) correct the bad sectors on disk >> + */ >> + if (failed_bio->bi_vcnt > 1) { >> + /* >> + * to fulfill b), we need to know the exact failing sectors, as >> + * we don''t want to rewrite any more than the failed ones. thus, >> + * we need separate read requests for the failed bio >> + * >> + * if the following BUG_ON triggers, our validation request got >> + * merged. we need separate requests for our algorithm to work. >> + */ >> + BUG_ON(failrec->in_validation); >> + failrec->in_validation = 1; >> + failrec->this_mirror = failed_mirror; >> + read_mode = READ_SYNC | REQ_FAILFAST_DEV; >> + } else { >> + /* >> + * we''re ready to fulfill a) and b) alongside. get a good copy >> + * of the failed sector and if we succeed, we have setup >> + * everything for repair_io_failure to do the rest for us. >> + */ >> + if (failrec->in_validation) { >> + BUG_ON(failrec->this_mirror != failed_mirror); >> + failrec->in_validation = 0; >> + failrec->this_mirror = 0; >> + } >> + failrec->failed_mirror = failed_mirror; >> + failrec->this_mirror++; >> + if (failrec->this_mirror == failed_mirror) >> + failrec->this_mirror++; >> + read_mode = READ_SYNC; >> + } >> + >> + if (!state || failrec->this_mirror > num_copies) { >> + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " >> + "next_mirror %d, failed_mirror %d\n", state, >> + num_copies, failrec->this_mirror, failed_mirror); >> + free_io_failure(inode, failrec, 0); >> + return -EIO; >> + } >> + >> + bio = bio_alloc(GFP_NOFS, 1); >> + bio->bi_private = state; >> + bio->bi_end_io = failed_bio->bi_end_io; >> + bio->bi_sector = failrec->logical >> 9; >> + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; >> + bio->bi_size = 0; >> + >> + bio_add_page(bio, page, failrec->len, start - page_offset(page)); >> + >> + pr_debug("bio_readpage_error: submitting new read[%#x] to " >> + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, >> + failrec->this_mirror, num_copies, failrec->in_validation); >> + >> + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, >> + failrec->bio_flags, 0); >> + return 0; >> +}-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html