The goal is to detect when drives start to show an increased error rate, i.e. when they should be replaced soon. Therefore, statistics counters are added that count IO errors (read, write and flush). Additionally, software-detected errors such as checksum errors and corrupted blocks are counted. An ioctl interface is added to get the device statistics counters. A second ioctl is added to atomically get and reset these counters. The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistics for each involved device are read from the device tree and used to initialize the counters. A corresponding patch for btrfs-progs will also be sent. The patches are based on v3.1-161-gf4a8e65 (btrfs pull request from 12/1/2011). Stefan Behrens (3): Btrfs: add device counters for detected IO and checksum errors Btrfs: add ioctl to get and reset the device stats Btrfs: read device stats on mount, write modified ones during commit fs/btrfs/ctree.h | 51 ++++++++ fs/btrfs/disk-io.c | 25 +++- fs/btrfs/extent_io.c | 27 ++++- fs/btrfs/ioctl.c | 26 ++++ fs/btrfs/ioctl.h | 27 ++++ fs/btrfs/print-tree.c | 3 + fs/btrfs/scrub.c | 52 ++++++-- fs/btrfs/transaction.c | 4 + fs/btrfs/volumes.c | 335 +++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 43 ++++++ 10 files changed, 575 insertions(+), 18 deletions(-) -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-09 16:40 UTC
[PATCH 1/3] Btrfs: add device counters for detected IO and checksum errors
The goal is to detect when drives start to get an increased error rate, when drives should be replaced soon. Therefore statistic counters are added that count IO errors (read, write and flush). Additionally, the software detected errors like checksum errors and corrupted blocks are counted. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/disk-io.c | 18 +++++++++++--- fs/btrfs/extent_io.c | 27 ++++++++++++++++++++- fs/btrfs/scrub.c | 52 +++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 21 +++++++++++++++++ 5 files changed, 161 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89094ee..b0f2a37 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2511,18 +2511,24 @@ recovery_tree_root: static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { + struct btrfs_device *device = (struct btrfs_device *) + (((uintptr_t) bh->b_private) & ~((uintptr_t) 1)); + unsigned int with_flush = ((uintptr_t) bh->b_private) & 1; + printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); + "I/O error on %s\n", device->name); /* note, we dont'' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_device_stat_inc(&device->cnt_write_io_errs); + if (with_flush) + btrfs_device_stat_inc(&device->cnt_flush_io_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(device); } unlock_buffer(bh); put_bh(bh); @@ -2637,6 +2643,7 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } /* @@ -2695,6 +2702,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait) } if 
(!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; + btrfs_device_stat_inc(&device->cnt_flush_io_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(device); } /* drop the reference from the wait == 0 run */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7609d28..566d262 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1894,6 +1894,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { /* try to remap that extent elsewhere? */ bio_put(bio); + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); return -EIO; } @@ -2280,10 +2283,30 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state); - if (ret) + if (ret) { + /* no IO indicated but software detected errors + * in the block, either checksum errros or + * issues with the contents */ + int failed_mirror = (int)(uintptr_t) + bio->bi_bdev; + struct btrfs_root *root = + BTRFS_I(page->mapping->host)->root; + struct btrfs_device *device; + uptodate = 0; - else + device = btrfs_find_device_for_logical( + root, start, + (int)failed_mirror); + if (device) { + btrfs_device_stat_inc( + &device->cnt_corruption_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error( + device); + } + } else { clean_io_failure(start, page); + } } if (!uptodate) { int failed_mirror; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f5d10b3..78454623 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -55,7 +55,7 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); static int scrub_fixup_check(struct scrub_bio *sbio, int ix); static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct
block_device *bdev, sector_t sector, +static int scrub_fixup_io(int rw, struct btrfs_device *dev, sector_t sector, struct page *page); static void scrub_fixup(struct scrub_bio *sbio, int ix); @@ -562,7 +562,7 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) DEFAULT_RATELIMIT_BURST); if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, + if (scrub_fixup_io(READ, sbio->sdev->dev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) return 0; @@ -676,7 +676,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + if (scrub_fixup_io(READ, bbio->stripes[i].dev, bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ @@ -693,7 +693,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) /* * bi_io_vec[ix].bv_page now contains good data, write it back */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, + if (scrub_fixup_io(WRITE, sdev->dev, (sbio->physical + ix * PAGE_SIZE) >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, writeback failed, give up */ @@ -720,7 +720,7 @@ uncorrectable: "logical %llu\n", (unsigned long long)logical); } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, +static int scrub_fixup_io(int rw, struct btrfs_device *dev, sector_t sector, struct page *page) { struct bio *bio = NULL; @@ -728,7 +728,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(complete); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; + bio->bi_bdev = dev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; @@ -739,6 +739,16 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, wait_for_completion(&complete); ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (ret) 
{ + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } bio_put(bio); return ret; } @@ -749,6 +759,18 @@ static void scrub_bio_end_io(struct bio *bio, int err) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + if (-EIO == err || -EREMOTEIO == err) { + struct btrfs_device *dev = sdev->dev; + + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } sbio->err = err; sbio->bio = bio; @@ -847,8 +869,12 @@ static int scrub_checksum_data(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); ++sdev->stat.data_extents_scrubbed; sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) + if (fail) { ++sdev->stat.csum_errors; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } spin_unlock(&sdev->stat_lock); return fail; @@ -895,8 +921,12 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); ++sdev->stat.tree_extents_scrubbed; sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) + if (crc_fail) { ++sdev->stat.csum_errors; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } if (fail) ++sdev->stat.verify_errors; spin_unlock(&sdev->stat_lock); @@ -930,8 +960,12 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, PAGE_SIZE - 
BTRFS_CSUM_SIZE); btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + if (memcmp(csum, s->csum, sdev->csum_size)) { ++fail; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } if (fail) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 01991a3..cc21e14 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/ratelimit.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -3236,11 +3237,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_bio *bbio = (struct btrfs_bio *) + (((uintptr_t) bio->bi_private) & ~((uintptr_t) 3)); + unsigned int dev_nr = ((uintptr_t) bio->bi_private) & 3; int is_orig_bio = 0; - if (err) + if (err) { atomic_inc(&bbio->error); + if (-EIO == err || -EREMOTEIO == err) { + struct btrfs_device *dev; + + BUG_ON(dev_nr >= bbio->num_stripes); + dev = bbio->stripes[dev_nr].dev; + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } + } if (bio == bbio->orig_bio) is_orig_bio = 1; @@ -3381,7 +3399,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } else { bio = first_bio; } - bio->bi_private = bbio; + BUG_ON(0 != (((uintptr_t) bbio) & 3)); + BUG_ON(dev_nr > 3); + bio->bi_private = (void *) (((uintptr_t) bbio) | dev_nr); bio->bi_end_io = btrfs_end_bio; bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; @@ -3729,6 +3749,28 @@ int btrfs_read_sys_array(struct 
btrfs_root *root) return ret; } +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int ret; + u64 map_length = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *device; + + BUG_ON(0 == mirror_num); + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, + mirror_num); + if (ret) { + BUG_ON(NULL != bbio); + return NULL; + } + BUG_ON(mirror_num != bbio->mirror_num); + device = bbio->stripes[mirror_num - 1].dev; + kfree(bbio); + return device; +} + int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; @@ -3797,3 +3839,16 @@ error: btrfs_free_path(path); return ret; } + +void btrfs_device_stat_print_on_error(struct btrfs_device *device) +{ + printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u," + " flush %u, corrupt %u, gen %u\n", + device->name, + btrfs_device_stat_read(&device->cnt_write_io_errs), + btrfs_device_stat_read(&device->cnt_read_io_errs), + btrfs_device_stat_read(&device->cnt_flush_io_errs), + btrfs_device_stat_read(&device->cnt_corruption_errs), + btrfs_device_stat_read( + &device->cnt_generation_errs)); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d..51ad850 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -106,6 +106,14 @@ struct btrfs_device { struct completion flush_wait; int nobarriers; + /* disk I/O failure stats. 
For detailed description refer to + * struct btrfs_device_stats_item in ctree.h */ + int device_stats_dirty; /* counters need to be written to disk */ + atomic_t cnt_write_io_errs; + atomic_t cnt_read_io_errs; + atomic_t cnt_flush_io_errs; + atomic_t cnt_corruption_errs; + atomic_t cnt_generation_errs; }; struct btrfs_fs_devices { @@ -233,4 +241,17 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num); +void btrfs_device_stat_print_on_error(struct btrfs_device *device); + +static inline void btrfs_device_stat_inc(atomic_t *cnt) +{ + atomic_inc(cnt); +} + +static inline int btrfs_device_stat_read(atomic_t *cnt) +{ + return atomic_read(cnt); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-09 16:40 UTC
[PATCH 2/3] Btrfs: add ioctl to get and reset the device stats
An ioctl interface is added to get the device statistic counters. A second ioctl is added to atomically get and reset these counters. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/ioctl.c | 26 +++++++++++++++++++ fs/btrfs/ioctl.h | 27 ++++++++++++++++++++ fs/btrfs/volumes.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 ++++++++++ 4 files changed, 135 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 72d4616..bce3f92 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2891,6 +2891,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_get_device_stats(struct btrfs_root *root, + void __user *arg, int reset_after_read) +{ + struct btrfs_ioctl_get_device_stats *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_get_device_stats(root, sa, reset_after_read); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3108,6 +3130,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_GET_DEVICE_STATS: + return btrfs_ioctl_get_device_stats(root, argp, 0); + case BTRFS_IOC_GET_AND_RESET_DEVICE_STATS: + return btrfs_ioctl_get_device_stats(root, argp, 1); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae99..b9ffd0b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -217,6 +217,29 @@ struct btrfs_ioctl_logical_ino_args { __u64 inodes; }; +#define BTRFS_IOCTL_GET_DEVICE_STATS_MAX_NR_ITEMS 5 +struct btrfs_ioctl_get_device_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + + /* out values: */ + + /* disk 
I/O failure stats */ + __u64 cnt_write_io_errs; /* EIO or EREMOTEIO from lower layers */ + __u64 cnt_read_io_errs; /* EIO or EREMOTEIO from lower layers */ + __u64 cnt_flush_io_errs; /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + __u64 cnt_corruption_errs; /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + __u64 cnt_generation_errs; /* an indication that blocks have not + * been written */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -276,5 +299,9 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEVICE_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_device_stats) +#define BTRFS_IOC_GET_AND_RESET_DEVICE_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_get_device_stats) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc21e14..99dfd00 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3852,3 +3852,72 @@ void btrfs_device_stat_print_on_error(struct btrfs_device *device) btrfs_device_stat_read( &device->cnt_generation_errs)); } + +int btrfs_get_device_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_device_stats *stats, + int reset_after_read) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + printk(KERN_WARNING + "btrfs: get device_stats failed, device not found\n"); + return -ENODEV; + } else if (reset_after_read) { + if (stats->nr_items >= 
1) + stats->cnt_write_io_errs = + btrfs_device_stat_read_and_reset( + &dev->cnt_write_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_write_io_errs); + if (stats->nr_items >= 2) + stats->cnt_read_io_errs = + btrfs_device_stat_read_and_reset( + &dev->cnt_read_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_read_io_errs); + if (stats->nr_items >= 3) + stats->cnt_flush_io_errs = + btrfs_device_stat_read_and_reset( + &dev->cnt_flush_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_flush_io_errs); + if (stats->nr_items >= 4) + stats->cnt_corruption_errs = + btrfs_device_stat_read_and_reset( + &dev->cnt_corruption_errs); + else + btrfs_device_stat_reset(&dev->cnt_corruption_errs); + if (stats->nr_items >= 5) + stats->cnt_generation_errs = + btrfs_device_stat_read_and_reset( + &dev->cnt_generation_errs); + else + btrfs_device_stat_reset(&dev->cnt_generation_errs); + dev->device_stats_dirty = 1; + } else { + if (stats->nr_items >= 1) + stats->cnt_write_io_errs = btrfs_device_stat_read( + &dev->cnt_write_io_errs); + if (stats->nr_items >= 2) + stats->cnt_read_io_errs = btrfs_device_stat_read( + &dev->cnt_read_io_errs); + if (stats->nr_items >= 3) + stats->cnt_flush_io_errs = btrfs_device_stat_read( + &dev->cnt_flush_io_errs); + if (stats->nr_items >= 4) + stats->cnt_corruption_errs = btrfs_device_stat_read( + &dev->cnt_corruption_errs); + if (stats->nr_items >= 5) + stats->cnt_generation_errs = btrfs_device_stat_read( + &dev->cnt_generation_errs); + } + if (stats->nr_items > 5) + stats->nr_items = 5; + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 51ad850..ad8c3bc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -244,6 +244,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); void btrfs_device_stat_print_on_error(struct btrfs_device *device); +int btrfs_get_device_stats(struct btrfs_root *root, + struct
btrfs_ioctl_get_device_stats *stats, + int reset_after_read); static inline void btrfs_device_stat_inc(atomic_t *cnt) { @@ -254,4 +257,14 @@ static inline int btrfs_device_stat_read(atomic_t *cnt) { return atomic_read(cnt); } + +static inline int btrfs_device_stat_read_and_reset(atomic_t *cnt) +{ + return atomic_xchg(cnt, 0); +} + +static inline void btrfs_device_stat_reset(atomic_t *cnt) +{ + atomic_set(cnt, 0); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-09 16:40 UTC
[PATCH 3/3] Btrfs: read device stats on mount, write modified ones during commit
The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistic for each involved device are read from the device tree and used to initialize the counters. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/ctree.h | 51 ++++++++++++ fs/btrfs/disk-io.c | 7 ++ fs/btrfs/print-tree.c | 3 + fs/btrfs/transaction.c | 4 + fs/btrfs/volumes.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 9 ++ 6 files changed, 279 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 89fab53..f5e2429 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -750,6 +750,26 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_device_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 cnt_write_io_errs; /* EIO or EREMOTEIO from lower layers */ + __le64 cnt_read_io_errs; /* EIO or EREMOTEIO from lower layers */ + __le64 cnt_flush_io_errs; /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + __le64 cnt_corruption_errs; /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + __le64 cnt_generation_errs; /* an indication that blocks have not + * been written */ +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1 << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) @@ -1388,6 +1408,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_CHUNK_ITEM_KEY 228 /* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEVICE_STATS_KEY, devid). 
+ */ +#define BTRFS_DEVICE_STATS_KEY 248 + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -2202,6 +2228,31 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_device_stats_item */ +BTRFS_SETGET_FUNCS(device_stats_cnt_write_io_errs, + struct btrfs_device_stats_item, cnt_write_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_read_io_errs, + struct btrfs_device_stats_item, cnt_read_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_flush_io_errs, + struct btrfs_device_stats_item, cnt_flush_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_corruption_errs, + struct btrfs_device_stats_item, cnt_corruption_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_generation_errs, + struct btrfs_device_stats_item, cnt_generation_errs, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_write_io_errs, + struct btrfs_device_stats_item, cnt_write_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_read_io_errs, + struct btrfs_device_stats_item, cnt_read_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_flush_io_errs, + struct btrfs_device_stats_item, cnt_flush_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_corruption_errs, + struct btrfs_device_stats_item, cnt_corruption_errs, + 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_generation_errs, + struct btrfs_device_stats_item, cnt_generation_errs, + 64); + static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b0f2a37..cac8f51 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2321,6 +2321,13 @@ retry_root_backup: fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_init_device_stats(fs_info); + if (ret) { + printk(KERN_ERR "btrfs: failed to init device_stats: %d\n", + ret); + goto 
fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452..a9e45e4 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + case BTRFS_DEVICE_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; }; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d9..9cf3095 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "locking.h" #include "tree-log.h" #include "inode-map.h" +#include "volumes.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -724,6 +725,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + ret = btrfs_run_device_stats(trans, root->fs_info); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 99dfd00..accd9e4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -39,6 +39,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_device_stats(struct btrfs_device *dev); +static void btrfs_device_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -357,6 +359,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->device_stats_valid = 0; device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, 
BTRFS_UUID_SIZE); @@ -3840,8 +3843,194 @@ error: return ret; } +static void __btrfs_reset_device_stats(struct btrfs_device *device) +{ + btrfs_device_stat_reset(&device->cnt_write_io_errs); + btrfs_device_stat_reset(&device->cnt_read_io_errs); + btrfs_device_stat_reset(&device->cnt_flush_io_errs); + btrfs_device_stat_reset(&device->cnt_corruption_errs); + btrfs_device_stat_reset(&device->cnt_generation_errs); +} + +int btrfs_init_device_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_device_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEVICE_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + printk(KERN_WARNING "btrfs: no device_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + device->name, device->devid); + __btrfs_reset_device_stats(device); + device->device_stats_valid = 1; + device->device_stats_dirty = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_device_stats_item); + + if (item_size >= 1 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_write_io_errs, + btrfs_device_stats_cnt_write_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_write_io_errs); + if (item_size >= 2 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_read_io_errs, + 
btrfs_device_stats_cnt_read_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_read_io_errs); + if (item_size >= 3 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_flush_io_errs, + btrfs_device_stats_cnt_flush_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_flush_io_errs); + if (item_size >= 4 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_corruption_errs, + btrfs_device_stats_cnt_corruption_errs(eb, + ptr)); + else + btrfs_device_stat_reset(&device->cnt_corruption_errs); + if (item_size >= 5 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_generation_errs, + btrfs_device_stats_cnt_generation_errs(eb, + ptr)); + else + btrfs_device_stat_reset(&device->cnt_generation_errs); + + btrfs_device_stat_print_on_load(device); + device->device_stats_valid = 1; + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_device_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_device_stats_item *ptr; + int ret; + + key.objectid = 0; + key.type = BTRFS_DEVICE_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, 0, 1); + if (ret < 0) { + printk(KERN_WARNING "btrfs: error %d while searching for device_stats item for device %s!\n", + ret, device->name); + goto out; + } + + if (0 == ret && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (0 != ret) { + printk(KERN_WARNING "btrfs: delete too small device_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + ret = 1; + } + + if (1 == ret) { + /* need to insert a new item */ + 
btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk(KERN_WARNING "btrfs: insert device_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_device_stats_item); + btrfs_set_device_stats_cnt_write_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_write_io_errs)); + btrfs_set_device_stats_cnt_read_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_read_io_errs)); + btrfs_set_device_stats_cnt_flush_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_flush_io_errs)); + btrfs_set_device_stats_cnt_corruption_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_corruption_errs)); + btrfs_set_device_stats_cnt_generation_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_generation_errs)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. 
+ */ +int btrfs_run_device_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->device_stats_valid || !device->device_stats_dirty) + continue; + + ret = update_device_stat_item(trans, dev_root, device); + if (!ret) + device->device_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + void btrfs_device_stat_print_on_error(struct btrfs_device *device) { + if (!device->device_stats_valid) + return; printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u," " flush %u, corrupt %u, gen %u\n", device->name, @@ -3853,6 +4042,18 @@ void btrfs_device_stat_print_on_error(struct btrfs_device *device) &device->cnt_generation_errs)); } +static void btrfs_device_stat_print_on_load(struct btrfs_device *device) +{ + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u," + " corrupt %u, gen %u\n", + device->name, + btrfs_device_stat_read(&device->cnt_write_io_errs), + btrfs_device_stat_read(&device->cnt_read_io_errs), + btrfs_device_stat_read(&device->cnt_flush_io_errs), + btrfs_device_stat_read(&device->cnt_corruption_errs), + btrfs_device_stat_read(&device->cnt_generation_errs)); +} + int btrfs_get_device_stats(struct btrfs_root *root, struct btrfs_ioctl_get_device_stats *stats, int reset_after_read) @@ -3868,6 +4069,10 @@ int btrfs_get_device_stats(struct btrfs_root *root, printk(KERN_WARNING "btrfs: get device_stats failed, device not found\n"); return -ENODEV; + } else if (!dev->device_stats_valid) { + printk(KERN_WARNING + "btrfs: get device_stats failed, not yet valid\n"); + return -ENODEV; } else if (reset_after_read) { if (stats->nr_items >= 1) stats->cnt_write_io_errs diff --git a/fs/btrfs/volumes.h 
b/fs/btrfs/volumes.h index ad8c3bc..7811347 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -108,6 +108,7 @@ struct btrfs_device { /* disk I/O failure stats. For detailed description refer to * struct btrfs_device_stats_item in ctree.h */ + int device_stats_valid; int device_stats_dirty; /* counters need to be written to disk */ atomic_t cnt_write_io_errs; atomic_t cnt_read_io_errs; @@ -243,6 +244,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 *start, u64 *max_avail); struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); +int btrfs_init_device_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_device_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); void btrfs_device_stat_print_on_error(struct btrfs_device *device); int btrfs_get_device_stats(struct btrfs_root *root, struct btrfs_ioctl_get_device_stats *stats, @@ -267,4 +271,9 @@ static inline void btrfs_device_stat_reset(atomic_t *cnt) { atomic_set(cnt, 0); } + +static inline void btrfs_device_stat_set(atomic_t *cnt, unsigned long val) +{ + atomic_set(cnt, val); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Goffredo Baroncelli
2011-Dec-11 10:18 UTC
Re: [PATCH 0/3] Btrfs: add IO error device stats
On Friday, 09 December, 2011 17:40:25 Stefan Behrens wrote: > The goal is to detect when drives start to get an increased error rate, > when drives should be replaced soon. Therefore statistic counters are > added that count IO errors (read, write and flush). Additionally, the > software detected errors like checksum errors and corrupted blocks are > counted. > > An ioctl interface is added to get the device statistic counters. > A second ioctl is added to atomically get and reset these counters. Hi Stefan, what about exporting this information under /sys? Something like: /sys/fs/btrfs/devices/<uuid>/error/cnt_write_io_errs /sys/fs/btrfs/devices/<uuid>/error/cnt_read_io_errs /sys/fs/btrfs/devices/<uuid>/error/cnt_flush_io_errs /sys/fs/btrfs/devices/<uuid>/error/cnt_corruption_errs /sys/fs/btrfs/devices/<uuid>/error/cnt_generation_errs /sys/fs/btrfs/devices/<uuid>/error/cnt_write_io_errs_0 /sys/fs/btrfs/devices/<uuid>/error/cnt_read_io_errs_0 /sys/fs/btrfs/devices/<uuid>/error/cnt_flush_io_errs_0 /sys/fs/btrfs/devices/<uuid>/error/cnt_corruption_errs_0 /sys/fs/btrfs/devices/<uuid>/error/cnt_generation_errs_0 where the '_0' family is for "read and reset" behaviour. This would be very flexible when another counter is added... > The device statistics are written into the device tree with each > transaction commit. Only modified statistics are written. > When a filesystem is mounted, the device statistic for each involved > device are read from the device tree and used to initialize the > counters. > > A patch for the btrfs-progs world will also be sent. > > The patches are based on v3.1-161-gf4a8e65 (btrfs pull request from > 12/1/2011). 
> > Stefan Behrens (3): > Btrfs: add device counters for detected IO and checksum errors > Btrfs: add ioctl to get and reset the device stats > Btrfs: read device stats on mount, write modified ones during commit > > fs/btrfs/ctree.h | 51 ++++++++ > fs/btrfs/disk-io.c | 25 +++- > fs/btrfs/extent_io.c | 27 ++++- > fs/btrfs/ioctl.c | 26 ++++ > fs/btrfs/ioctl.h | 27 ++++ > fs/btrfs/print-tree.c | 3 + > fs/btrfs/scrub.c | 52 ++++++-- > fs/btrfs/transaction.c | 4 + > fs/btrfs/volumes.c | 335 > +++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | > 43 ++++++ > 10 files changed, 575 insertions(+), 18 deletions(-)-- gpg key@ keyserver.linux.it: Goffredo Baroncelli (ghigo) <kreijack@inwind.it> Key fingerprint = 4769 7E51 5293 D36C 814E C054 BF04 F161 3DC5 0512 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Goffredo Baroncelli
2011-Dec-11 10:24 UTC
Re: [PATCH 2/3] Btrfs: add ioctl to get and reset the device stats
On Friday, 09 December, 2011 17:40:27 Stefan Behrens wrote: > An ioctl interface is added to get the device statistic counters. > A second ioctl is added to atomically get and reset these counters. [...] > > +static long btrfs_ioctl_get_device_stats(struct btrfs_root *root, > + void __user *arg, int reset_after_read) > +{ > + struct btrfs_ioctl_get_device_stats *sa; > + int ret; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + I agree that the BTRFS_IOC_GET_AND_RESET_DEVICE_STATS should be a privileged operation. But I think that the BTRFS_IOC_GET_DEVICE_STATS should be allowed to everyone. Think about a daemon which looks at the errors, and then sends an email to warn about a possible defect. Allowing BTRFS_IOC_GET_DEVICE_STATS to everyone avoids having to run the daemon with root privileges. BR G.Baroncelli [...] -- gpg key@ keyserver.linux.it: Goffredo Baroncelli (ghigo) <kreijack@inwind.it> Key fingerprint = 4769 7E51 5293 D36C 814E C054 BF04 F161 3DC5 0512 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-12 13:34 UTC
Re: [PATCH 2/3] Btrfs: add ioctl to get and reset the device stats
On 12/11/2011 11:24 AM, Goffredo Baroncelli wrote: > On Friday, 09 December, 2011 17:40:27 Stefan Behrens wrote: >> An ioctl interface is added to get the device statistic counters. >> A second ioctl is added to atomically get and reset these counters. > > [...] > >> >> +static long btrfs_ioctl_get_device_stats(struct btrfs_root *root, >> + void __user *arg, int reset_after_read) >> +{ >> + struct btrfs_ioctl_get_device_stats *sa; >> + int ret; >> + >> + if (!capable(CAP_SYS_ADMIN)) >> + return -EPERM; >> + > > I agree that the BTRFS_IOC_GET_AND_RESET_DEVICE_STATS should be a privileged > operation. But I think that the BTRFS_IOC_GET_DEVICE_STATS should be allowed > to everyone. Think about a daemon which look at the error, and then send an > email to warn about possible defect. Allowing BTRFS_IOC_GET_DEVICE_STATS to > everyone, prevent to run the deamon with root privileges. I fully agree with your comment and will change it in a v2. Thanks! -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Changes v1-v2: - Remove restriction that BTRFS_IOC_GET_DEVICE_STATS is a privileged operation - Cast u64 to unsigned long long for printf() The goal is to detect when drives start to get an increased error rate, when drives should be replaced soon. Therefore statistic counters are added that count IO errors (read, write and flush). Additionally, the software detected errors like checksum errors and corrupted blocks are counted. An ioctl interface is added to get the device statistic counters. A second ioctl is added to atomically get and reset these counters. The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistics for each involved device are read from the device tree and used to initialize the counters. A patch for the btrfs-progs world will also be sent. The patches are based on v3.1-182-gd85c8a6. Stefan Behrens (3): Btrfs: add device counters for detected IO and checksum errors Btrfs: add ioctl to get and reset the device stats Btrfs: read device stats on mount, write modified ones during commit fs/btrfs/ctree.h | 51 ++++++++ fs/btrfs/disk-io.c | 25 +++- fs/btrfs/extent_io.c | 27 ++++- fs/btrfs/ioctl.c | 26 ++++ fs/btrfs/ioctl.h | 27 ++++ fs/btrfs/print-tree.c | 3 + fs/btrfs/scrub.c | 52 ++++++-- fs/btrfs/transaction.c | 4 + fs/btrfs/volumes.c | 335 +++++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.h | 43 ++++++ 10 files changed, 575 insertions(+), 18 deletions(-) -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-21 16:05 UTC
[PATCH v2 1/3] Btrfs: add device counters for detected IO and checksum errors
The goal is to detect when drives start to get an increased error rate, when drives should be replaced soon. Therefore statistic counters are added that count IO errors (read, write and flush). Additionally, the software detected errors like checksum errors and corrupted blocks are counted. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/disk-io.c | 18 +++++++++++--- fs/btrfs/extent_io.c | 27 ++++++++++++++++++++- fs/btrfs/scrub.c | 52 +++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/volumes.h | 21 +++++++++++++++++ 5 files changed, 161 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f9d555..905f1fa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2502,18 +2502,24 @@ recovery_tree_root: static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { + struct btrfs_device *device = (struct btrfs_device *) + (((uintptr_t)bh->b_private) & ~((uintptr_t)1)); + unsigned int with_flush = ((uintptr_t)bh->b_private) & 1; + printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); + "I/O error on %s\n", device->name); /* note, we dont'' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); + btrfs_device_stat_inc(&device->cnt_write_io_errs); + if (with_flush) + btrfs_device_stat_inc(&device->cnt_flush_io_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(device); } unlock_buffer(bh); put_bh(bh); @@ -2628,6 +2634,7 @@ static int write_dev_supers(struct btrfs_device *device, set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; } /* @@ -2686,6 +2693,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait) } if 
(!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; + btrfs_device_stat_inc(&device->cnt_flush_io_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(device); } /* drop the reference from the wait == 0 run */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 49f3c9d..e6bf7ee 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1901,6 +1901,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { /* try to remap that extent elsewhere? */ bio_put(bio); + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); return -EIO; } @@ -2287,10 +2290,30 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, state); - if (ret) + if (ret) { + /* no IO indicated but software detected errors + * in the block, either checksum errros or + * issues with the contents */ + int failed_mirror = (int)(uintptr_t) + bio->bi_bdev; + struct btrfs_root *root + BTRFS_I(page->mapping->host)->root; + struct btrfs_device *device; + uptodate = 0; - else + device = btrfs_find_device_for_logical( + root, start, + (int)failed_mirror); + if (device) { + btrfs_device_stat_inc( + &device->cnt_corruption_errs); + device->device_stats_dirty = 1; + btrfs_device_stat_print_on_error( + device); + } + } else { clean_io_failure(start, page); + } } if (!uptodate) { int failed_mirror; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ddf2c90..07fbbef 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -54,7 +54,7 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer); static int scrub_fixup_check(struct scrub_bio *sbio, int ix); static void scrub_fixup_end_io(struct bio *bio, int err); -static int scrub_fixup_io(int rw, struct 
block_device *bdev, sector_t sector, +static int scrub_fixup_io(int rw, struct btrfs_device *dev, sector_t sector, struct page *page); static void scrub_fixup(struct scrub_bio *sbio, int ix); @@ -561,7 +561,7 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix) DEFAULT_RATELIMIT_BURST); if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, + if (scrub_fixup_io(READ, sbio->sdev->dev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) return 0; @@ -675,7 +675,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + if (scrub_fixup_io(READ, bbio->stripes[i].dev, bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ @@ -692,7 +692,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) /* * bi_io_vec[ix].bv_page now contains good data, write it back */ - if (scrub_fixup_io(WRITE, sdev->dev->bdev, + if (scrub_fixup_io(WRITE, sdev->dev, (sbio->physical + ix * PAGE_SIZE) >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, writeback failed, give up */ @@ -719,7 +719,7 @@ uncorrectable: "logical %llu\n", (unsigned long long)logical); } -static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, +static int scrub_fixup_io(int rw, struct btrfs_device *dev, sector_t sector, struct page *page) { struct bio *bio = NULL; @@ -727,7 +727,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(complete); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = bdev; + bio->bi_bdev = dev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = scrub_fixup_end_io; @@ -738,6 +738,16 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, wait_for_completion(&complete); ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); + if (ret) 
{ + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } bio_put(bio); return ret; } @@ -748,6 +758,18 @@ static void scrub_bio_end_io(struct bio *bio, int err) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; + if (-EIO == err || -EREMOTEIO == err) { + struct btrfs_device *dev = sdev->dev; + + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } sbio->err = err; sbio->bio = bio; @@ -846,8 +868,12 @@ static int scrub_checksum_data(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); ++sdev->stat.data_extents_scrubbed; sdev->stat.data_bytes_scrubbed += PAGE_SIZE; - if (fail) + if (fail) { ++sdev->stat.csum_errors; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } spin_unlock(&sdev->stat_lock); return fail; @@ -894,8 +920,12 @@ static int scrub_checksum_tree_block(struct scrub_dev *sdev, spin_lock(&sdev->stat_lock); ++sdev->stat.tree_extents_scrubbed; sdev->stat.tree_bytes_scrubbed += PAGE_SIZE; - if (crc_fail) + if (crc_fail) { ++sdev->stat.csum_errors; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } if (fail) ++sdev->stat.verify_errors; spin_unlock(&sdev->stat_lock); @@ -929,8 +959,12 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc, PAGE_SIZE - 
BTRFS_CSUM_SIZE); btrfs_csum_final(crc, csum); - if (memcmp(csum, s->csum, sbio->sdev->csum_size)) + if (memcmp(csum, s->csum, sdev->csum_size)) { ++fail; + btrfs_device_stat_inc(&sdev->dev->cnt_corruption_errs); + sdev->dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(sdev->dev); + } if (fail) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4b839f..7681477 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -23,6 +23,7 @@ #include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> +#include <linux/ratelimit.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -3241,11 +3242,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_bio *bbio = (struct btrfs_bio *) + (((uintptr_t)bio->bi_private) & ~((uintptr_t)3)); + unsigned int dev_nr = ((uintptr_t)bio->bi_private) & 3; int is_orig_bio = 0; - if (err) + if (err) { atomic_inc(&bbio->error); + if (-EIO == err || -EREMOTEIO == err) { + struct btrfs_device *dev; + + BUG_ON(dev_nr >= bbio->num_stripes); + dev = bbio->stripes[dev_nr].dev; + if (bio->bi_rw & WRITE) + btrfs_device_stat_inc(&dev->cnt_write_io_errs); + else + btrfs_device_stat_inc(&dev->cnt_read_io_errs); + if (WRITE_FLUSH == (bio->bi_rw & WRITE_FLUSH)) + btrfs_device_stat_inc(&dev->cnt_flush_io_errs); + dev->device_stats_dirty = 1; + btrfs_device_stat_print_on_error(dev); + } + } if (bio == bbio->orig_bio) is_orig_bio = 1; @@ -3386,7 +3404,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } else { bio = first_bio; } - bio->bi_private = bbio; + BUG_ON(0 != (((uintptr_t)bbio) & 3)); + BUG_ON(dev_nr > 3); + bio->bi_private = (void *)(((uintptr_t)bbio) | dev_nr); bio->bi_end_io = btrfs_end_bio; bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; @@ -3734,6 +3754,28 @@ int btrfs_read_sys_array(struct 
btrfs_root *root) return ret; } +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int ret; + u64 map_length = 0; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *device; + + BUG_ON(0 == mirror_num); + ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio, + mirror_num); + if (ret) { + BUG_ON(NULL != bbio); + return NULL; + } + BUG_ON(mirror_num != bbio->mirror_num); + device = bbio->stripes[mirror_num - 1].dev; + kfree(bbio); + return device; +} + int btrfs_read_chunk_tree(struct btrfs_root *root) { struct btrfs_path *path; @@ -3802,3 +3844,16 @@ error: btrfs_free_path(path); return ret; } + +void btrfs_device_stat_print_on_error(struct btrfs_device *device) +{ + printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u," + " flush %u, corrupt %u, gen %u\n", + device->name, + btrfs_device_stat_read(&device->cnt_write_io_errs), + btrfs_device_stat_read(&device->cnt_read_io_errs), + btrfs_device_stat_read(&device->cnt_flush_io_errs), + btrfs_device_stat_read(&device->cnt_corruption_errs), + btrfs_device_stat_read( + &device->cnt_generation_errs)); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 78f2d4d..51ad850 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -106,6 +106,14 @@ struct btrfs_device { struct completion flush_wait; int nobarriers; + /* disk I/O failure stats. 
For detailed description refer to + * struct btrfs_device_stats_item in ctree.h */ + int device_stats_dirty; /* counters need to be written to disk */ + atomic_t cnt_write_io_errs; + atomic_t cnt_read_io_errs; + atomic_t cnt_flush_io_errs; + atomic_t cnt_corruption_errs; + atomic_t cnt_generation_errs; }; struct btrfs_fs_devices { @@ -233,4 +241,17 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, + u64 logical, int mirror_num); +void btrfs_device_stat_print_on_error(struct btrfs_device *device); + +static inline void btrfs_device_stat_inc(atomic_t *cnt) +{ + atomic_inc(cnt); +} + +static inline int btrfs_device_stat_read(atomic_t *cnt) +{ + return atomic_read(cnt); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-21 16:05 UTC
[PATCH v2 2/3] Btrfs: add ioctl to get and reset the device stats
An ioctl interface is added to get the device statistic counters. A second ioctl is added to atomically get and reset these counters. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/ioctl.c | 26 +++++++++++++++++++ fs/btrfs/ioctl.h | 27 ++++++++++++++++++++ fs/btrfs/volumes.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 13 ++++++++++ 4 files changed, 135 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c04f02c..5bb31b0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2893,6 +2893,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_get_device_stats(struct btrfs_root *root, + void __user *arg, int reset_after_read) +{ + struct btrfs_ioctl_get_device_stats *sa; + int ret; + + if (reset_after_read && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_get_device_stats(root, sa, reset_after_read); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; @@ -3110,6 +3132,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_scrub_cancel(root, argp); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(root, argp); + case BTRFS_IOC_GET_DEVICE_STATS: + return btrfs_ioctl_get_device_stats(root, argp, 0); + case BTRFS_IOC_GET_AND_RESET_DEVICE_STATS: + return btrfs_ioctl_get_device_stats(root, argp, 1); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 252ae99..b9ffd0b 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -217,6 +217,29 @@ struct btrfs_ioctl_logical_ino_args { __u64 inodes; }; +#define BTRFS_IOCTL_GET_DEVICE_STATS_MAX_NR_ITEMS 5 +struct btrfs_ioctl_get_device_stats { + __u64 devid; /* in */ + __u64 nr_items; /* in/out */ + + /* out 
values: */ + + /* disk I/O failure stats */ + __u64 cnt_write_io_errs; /* EIO or EREMOTEIO from lower layers */ + __u64 cnt_read_io_errs; /* EIO or EREMOTEIO from lower layers */ + __u64 cnt_flush_io_errs; /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + __u64 cnt_corruption_errs; /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + __u64 cnt_generation_errs; /* an indication that blocks have not + * been written */ +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -276,5 +299,9 @@ struct btrfs_ioctl_logical_ino_args { struct btrfs_ioctl_ino_path_args) #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_GET_DEVICE_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ + struct btrfs_ioctl_get_device_stats) +#define BTRFS_IOC_GET_AND_RESET_DEVICE_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \ + struct btrfs_ioctl_get_device_stats) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7681477..9ee33a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3857,3 +3857,72 @@ void btrfs_device_stat_print_on_error(struct btrfs_device *device) btrfs_device_stat_read( &device->cnt_generation_errs)); } + +int btrfs_get_device_stats(struct btrfs_root *root, + struct btrfs_ioctl_get_device_stats *stats, + int reset_after_read) +{ + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, stats->devid, NULL, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + printk(KERN_WARNING + "btrfs: get device_stats failed, device not found\n"); + return -ENODEV; + } else if (reset_after_read) { + 
if (stats->nr_items >= 1) + stats->cnt_write_io_errs + btrfs_device_stat_read_and_reset( + &dev->cnt_write_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_write_io_errs); + if (stats->nr_items >= 2) + stats->cnt_read_io_errs + btrfs_device_stat_read_and_reset( + &dev->cnt_read_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_read_io_errs); + if (stats->nr_items >= 3) + stats->cnt_flush_io_errs + btrfs_device_stat_read_and_reset( + &dev->cnt_flush_io_errs); + else + btrfs_device_stat_reset(&dev->cnt_flush_io_errs); + if (stats->nr_items >= 4) + stats->cnt_corruption_errs + btrfs_device_stat_read_and_reset( + &dev->cnt_corruption_errs); + else + btrfs_device_stat_reset(&dev->cnt_corruption_errs); + if (stats->nr_items >= 5) + stats->cnt_generation_errs + btrfs_device_stat_read_and_reset( + &dev->cnt_generation_errs); + else + btrfs_device_stat_reset(&dev->cnt_generation_errs); + dev->device_stats_dirty = 1; + } else { + if (stats->nr_items >= 1) + stats->cnt_write_io_errs = btrfs_device_stat_read( + &dev->cnt_write_io_errs); + if (stats->nr_items >= 2) + stats->cnt_read_io_errs = btrfs_device_stat_read( + &dev->cnt_read_io_errs); + if (stats->nr_items >= 3) + stats->cnt_flush_io_errs = btrfs_device_stat_read( + &dev->cnt_flush_io_errs); + if (stats->nr_items >= 4) + stats->cnt_corruption_errs = btrfs_device_stat_read( + &dev->cnt_corruption_errs); + if (stats->nr_items >= 5) + stats->cnt_generation_errs = btrfs_device_stat_read( + &dev->cnt_generation_errs); + } + if (stats->nr_items > 5) + stats->nr_items = 5; + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 51ad850..ad8c3bc 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -244,6 +244,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); void btrfs_device_stat_print_on_error(struct btrfs_device *device); +int btrfs_get_device_stats(struct btrfs_root *root, + 
struct btrfs_ioctl_get_device_stats *stats, + int reset_after_read); static inline void btrfs_device_stat_inc(atomic_t *cnt) { @@ -254,4 +257,14 @@ static inline int btrfs_device_stat_read(atomic_t *cnt) { return atomic_read(cnt); } + +static inline int btrfs_device_stat_read_and_reset(atomic_t *cnt) +{ + return atomic_xchg(cnt, 0); +} + +static inline void btrfs_device_stat_reset(atomic_t *cnt) +{ + atomic_set(cnt, 0); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Stefan Behrens
2011-Dec-21 16:05 UTC
[PATCH v2 3/3] Btrfs: read device stats on mount, write modified ones during commit
The device statistics are written into the device tree with each transaction commit. Only modified statistics are written. When a filesystem is mounted, the device statistics for each involved device are read from the device tree and used to initialize the counters. Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de> --- fs/btrfs/ctree.h | 51 ++++++++++++ fs/btrfs/disk-io.c | 7 ++ fs/btrfs/print-tree.c | 3 + fs/btrfs/transaction.c | 4 + fs/btrfs/volumes.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 9 ++ 6 files changed, 279 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6738503..d8cd931 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -750,6 +750,26 @@ struct btrfs_csum_item { u8 csum; } __attribute__ ((__packed__)); +struct btrfs_device_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 cnt_write_io_errs; /* EIO or EREMOTEIO from lower layers */ + __le64 cnt_read_io_errs; /* EIO or EREMOTEIO from lower layers */ + __le64 cnt_flush_io_errs; /* EIO or EREMOTEIO from lower layers */ + + /* stats for indirect indications for I/O failures */ + __le64 cnt_corruption_errs; /* checksum error, bytenr error or + * contents is illegal: this is an + * indication that the block was damaged + * during read or write, or written to + * wrong location or read from wrong + * location */ + __le64 cnt_generation_errs; /* an indication that blocks have not + * been written */ +} __attribute__ ((__packed__)); + /* different types of block groups (and chunks) */ #define BTRFS_BLOCK_GROUP_DATA (1 << 0) #define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) @@ -1384,6 +1404,12 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_CHUNK_ITEM_KEY 228 /* + * Persistantly stores the io stats in the device tree. + * One key for all stats, (0, BTRFS_DEVICE_STATS_KEY, devid). 
+ */ +#define BTRFS_DEVICE_STATS_KEY 248 + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -2196,6 +2222,31 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, return btrfs_item_size(eb, e) - offset; } +/* btrfs_device_stats_item */ +BTRFS_SETGET_FUNCS(device_stats_cnt_write_io_errs, + struct btrfs_device_stats_item, cnt_write_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_read_io_errs, + struct btrfs_device_stats_item, cnt_read_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_flush_io_errs, + struct btrfs_device_stats_item, cnt_flush_io_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_corruption_errs, + struct btrfs_device_stats_item, cnt_corruption_errs, 64); +BTRFS_SETGET_FUNCS(device_stats_cnt_generation_errs, + struct btrfs_device_stats_item, cnt_generation_errs, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_write_io_errs, + struct btrfs_device_stats_item, cnt_write_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_read_io_errs, + struct btrfs_device_stats_item, cnt_read_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_flush_io_errs, + struct btrfs_device_stats_item, cnt_flush_io_errs, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_corruption_errs, + struct btrfs_device_stats_item, cnt_corruption_errs, + 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_stats_cnt_generation_errs, + struct btrfs_device_stats_item, cnt_generation_errs, + 64); + static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 905f1fa..cd9fcba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2325,6 +2325,13 @@ retry_root_backup: fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_init_device_stats(fs_info); + if (ret) { + printk(KERN_ERR "btrfs: failed to init device_stats: %d\n", + ret); + goto 
fail_block_groups; + } + ret = btrfs_init_space_info(fs_info); if (ret) { printk(KERN_ERR "Failed to initial space info: %d\n", ret); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index f38e452..a9e45e4 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_dev_extent_chunk_offset(l, dev_extent), (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); + case BTRFS_DEVICE_STATS_KEY: + printk(KERN_INFO "\t\tdevice stats\n"); + break; }; } } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 81376d9..9cf3095 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "locking.h" #include "tree-log.h" #include "inode-map.h" +#include "volumes.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -724,6 +725,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + ret = btrfs_run_device_stats(trans, root->fs_info); + BUG_ON(ret); + while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9ee33a5..c4c7dd1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,6 +38,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static void __btrfs_reset_device_stats(struct btrfs_device *dev); +static void btrfs_device_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -362,6 +364,7 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } device->devid = devid; + device->device_stats_valid = 0; device->work.func = pending_bios_fn; memcpy(device->uuid, disk_super->dev_item.uuid, 
BTRFS_UUID_SIZE); @@ -3845,8 +3848,194 @@ error: return ret; } +static void __btrfs_reset_device_stats(struct btrfs_device *device) +{ + btrfs_device_stat_reset(&device->cnt_write_io_errs); + btrfs_device_stat_reset(&device->cnt_read_io_errs); + btrfs_device_stat_reset(&device->cnt_flush_io_errs); + btrfs_device_stat_reset(&device->cnt_corruption_errs); + btrfs_device_stat_reset(&device->cnt_generation_errs); +} + +int btrfs_init_device_stats(struct btrfs_fs_info *fs_info) +{ + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct extent_buffer *eb; + int slot; + int ret = 0; + struct btrfs_device *device; + struct btrfs_path *path = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + int item_size; + struct btrfs_device_stats_item *ptr; + + key.objectid = 0; + key.type = BTRFS_DEVICE_STATS_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); + if (ret) { + printk(KERN_WARNING "btrfs: no device_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + device->name, (unsigned long long)device->devid); + __btrfs_reset_device_stats(device); + device->device_stats_valid = 1; + device->device_stats_dirty = 1; + btrfs_release_path(path); + continue; + } + slot = path->slots[0]; + eb = path->nodes[0]; + btrfs_item_key_to_cpu(eb, &found_key, slot); + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, + struct btrfs_device_stats_item); + + if (item_size >= 1 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_write_io_errs, + btrfs_device_stats_cnt_write_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_write_io_errs); + if (item_size >= 2 * sizeof(__le64)) + 
btrfs_device_stat_set(&device->cnt_read_io_errs, + btrfs_device_stats_cnt_read_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_read_io_errs); + if (item_size >= 3 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_flush_io_errs, + btrfs_device_stats_cnt_flush_io_errs(eb, ptr)); + else + btrfs_device_stat_reset(&device->cnt_flush_io_errs); + if (item_size >= 4 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_corruption_errs, + btrfs_device_stats_cnt_corruption_errs(eb, + ptr)); + else + btrfs_device_stat_reset(&device->cnt_corruption_errs); + if (item_size >= 5 * sizeof(__le64)) + btrfs_device_stat_set(&device->cnt_generation_errs, + btrfs_device_stats_cnt_generation_errs(eb, + ptr)); + else + btrfs_device_stat_reset(&device->cnt_generation_errs); + + btrfs_device_stat_print_on_load(device); + device->device_stats_valid = 1; + btrfs_release_path(path); + } + mutex_unlock(&fs_devices->device_list_mutex); + +out: + btrfs_free_path(path); + return ret < 0 ? ret : 0; +} + +static int update_device_stat_item(struct btrfs_trans_handle *trans, + struct btrfs_root *dev_root, + struct btrfs_device *device) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_device_stats_item *ptr; + int ret; + + key.objectid = 0; + key.type = BTRFS_DEVICE_STATS_KEY; + key.offset = device->devid; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, dev_root, &key, path, 0, 1); + if (ret < 0) { + printk(KERN_WARNING "btrfs: error %d while searching for device_stats item for device %s!\n", + ret, device->name); + goto out; + } + + if (0 == ret && + btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { + /* need to delete old one and insert a new one */ + ret = btrfs_del_item(trans, dev_root, path); + if (0 != ret) { + printk(KERN_WARNING "btrfs: delete too small device_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + ret = 1; + } + + if (1 == ret) { + 
/* need to insert a new item */ + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, dev_root, path, + &key, sizeof(*ptr)); + if (ret < 0) { + printk(KERN_WARNING "btrfs: insert device_stats item for device %s failed %d!\n", + device->name, ret); + goto out; + } + } + + eb = path->nodes[0]; + ptr = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_device_stats_item); + btrfs_set_device_stats_cnt_write_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_write_io_errs)); + btrfs_set_device_stats_cnt_read_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_read_io_errs)); + btrfs_set_device_stats_cnt_flush_io_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_flush_io_errs)); + btrfs_set_device_stats_cnt_corruption_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_corruption_errs)); + btrfs_set_device_stats_cnt_generation_errs(eb, ptr, + btrfs_device_stat_read(&device->cnt_generation_errs)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +/* + * called from commit_transaction. Writes all changed device stats to disk. 
+ */ +int btrfs_run_device_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *dev_root = fs_info->dev_root; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->device_stats_valid || !device->device_stats_dirty) + continue; + + ret = update_device_stat_item(trans, dev_root, device); + if (!ret) + device->device_stats_dirty = 0; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + void btrfs_device_stat_print_on_error(struct btrfs_device *device) { + if (!device->device_stats_valid) + return; printk_ratelimited(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u," " flush %u, corrupt %u, gen %u\n", device->name, @@ -3858,6 +4047,18 @@ void btrfs_device_stat_print_on_error(struct btrfs_device *device) &device->cnt_generation_errs)); } +static void btrfs_device_stat_print_on_load(struct btrfs_device *device) +{ + printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u," + " corrupt %u, gen %u\n", + device->name, + btrfs_device_stat_read(&device->cnt_write_io_errs), + btrfs_device_stat_read(&device->cnt_read_io_errs), + btrfs_device_stat_read(&device->cnt_flush_io_errs), + btrfs_device_stat_read(&device->cnt_corruption_errs), + btrfs_device_stat_read(&device->cnt_generation_errs)); +} + int btrfs_get_device_stats(struct btrfs_root *root, struct btrfs_ioctl_get_device_stats *stats, int reset_after_read) @@ -3873,6 +4074,10 @@ int btrfs_get_device_stats(struct btrfs_root *root, printk(KERN_WARNING "btrfs: get device_stats failed, device not found\n"); return -ENODEV; + } else if (!dev->device_stats_valid) { + printk(KERN_WARNING + "btrfs: get device_stats failed, not yet valid\n"); + return -ENODEV; } else if (reset_after_read) { if (stats->nr_items >= 1) stats->cnt_write_io_errs diff --git a/fs/btrfs/volumes.h 
b/fs/btrfs/volumes.h index ad8c3bc..7811347 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -108,6 +108,7 @@ struct btrfs_device { /* disk I/O failure stats. For detailed description refer to * struct btrfs_device_stats_item in ctree.h */ + int device_stats_valid; int device_stats_dirty; /* counters need to be written to disk */ atomic_t cnt_write_io_errs; atomic_t cnt_read_io_errs; @@ -243,6 +244,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, u64 *start, u64 *max_avail); struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root, u64 logical, int mirror_num); +int btrfs_init_device_stats(struct btrfs_fs_info *fs_info); +int btrfs_run_device_stats(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); void btrfs_device_stat_print_on_error(struct btrfs_device *device); int btrfs_get_device_stats(struct btrfs_root *root, struct btrfs_ioctl_get_device_stats *stats, @@ -267,4 +271,9 @@ static inline void btrfs_device_stat_reset(atomic_t *cnt) { atomic_set(cnt, 0); } + +static inline void btrfs_device_stat_set(atomic_t *cnt, unsigned long val) +{ + atomic_set(cnt, val); +} #endif -- 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html