Gui Hecheng
2013-Nov-28 05:32 UTC
[PATCH 1/4] Btrfs-progs: chunk-recover: add a new flag to prepare for recovering ordered data chunks
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>

When reading block groups we search for each group's corresponding chunk; however, at this point some chunks (the data chunks of raid0/raid10/raid56) have not been built yet. Don't BUG_ON() here; we will try to rebuild these chunks later.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
---
 chunk-recover.c | 1 +
 ctree.h         | 1 +
 volumes.c       | 9 ++++++++-
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/chunk-recover.c b/chunk-recover.c
index e880bbc..ae0d318 100644
--- a/chunk-recover.c
+++ b/chunk-recover.c
@@ -1197,6 +1197,7 @@ open_ctree_with_broken_chunk(struct recover_control *rc)
 		fprintf(stderr, "Failed to allocate memory for fs_info\n");
 		return ERR_PTR(-ENOMEM);
 	}
+	fs_info->is_chunk_recover = 1;
 
 	fs_info->fs_devices = rc->fs_devices;
 	ret = btrfs_open_devices(fs_info->fs_devices, O_RDWR);
diff --git a/ctree.h b/ctree.h
index 92a396a..9f68862 100644
--- a/ctree.h
+++ b/ctree.h
@@ -977,6 +977,7 @@ struct btrfs_fs_info {
 	int system_allocs;
 	int readonly;
 	int on_restoring;
+	int is_chunk_recover;
 	int (*free_extent_hook)(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
diff --git a/volumes.c b/volumes.c
index c38da6c..bd01270 100644
--- a/volumes.c
+++ b/volumes.c
@@ -1496,8 +1496,15 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	int readonly = 0;
 	int i;
 
+	/*
+	 * During chunk recovering, we may fail to find block group's
+	 * corresponding chunk, we will rebuild it later
+	 */
 	ce = search_cache_extent(&map_tree->cache_tree, chunk_offset);
-	BUG_ON(!ce);
+	if (!root->fs_info->is_chunk_recover)
+		BUG_ON(!ce);
+	else
+		return 0;
 
 	map = container_of(ce, struct map_lookup, ce);
 	for (i = 0; i < map->num_stripes; i++) {
--
1.8.0.1
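As an aside, the pattern this patch introduces can be shown in a minimal standalone sketch: an assertion that is fatal in normal operation is downgraded to a soft "rebuild later" path while recovery runs. The types and names below are simplified stand-ins, not the real btrfs-progs definitions.

#include <assert.h>
#include <stddef.h>

/* Simplified stand-ins for the btrfs-progs structures. */
struct fs_info {
	int is_chunk_recover;	/* set early, e.g. when opening a broken fs */
};

struct chunk_map;		/* result of the chunk-map lookup, opaque here */

/*
 * Guarded-assert pattern: outside of chunk recovery a missing chunk
 * mapping is a fatal inconsistency; during recovery it only means
 * "this chunk will be rebuilt later", so report "not readonly".
 */
static int chunk_readonly(struct fs_info *info, struct chunk_map *map)
{
	if (!info->is_chunk_recover)
		assert(map != NULL);	/* normal mode: must already exist */
	else if (!map)
		return 0;		/* recovery mode: defer to the rebuild pass */
	/* ... the per-stripe readonly check would go here ... */
	return 0;
}

int main(void)
{
	struct fs_info info = { .is_chunk_recover = 1 };
	return chunk_readonly(&info, NULL);	/* tolerated in recovery mode */
}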
Gui Hecheng
2013-Nov-28 05:32 UTC
[PATCH 2/4] btrfs-progs: skip chunk recovery work when chunks check out successfully
If no chunks need to be recovered, skip the recovery work; this also spares the user the "ask_user" prompt.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
---
 chunk-recover.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/chunk-recover.c b/chunk-recover.c
index ae0d318..45d6eae 100644
--- a/chunk-recover.c
+++ b/chunk-recover.c
@@ -1685,6 +1685,9 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
 		 * droppped from the fs. Don't deal with them now, we will
 		 * check it after the fs is opened.
 		 */
+	} else {
+		fprintf(stderr, "Check chunks successfully with no orphans\n");
+		goto fail_rc;
 	}
 
 	root = open_ctree_with_broken_chunk(&rc);
--
1.8.0.1
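The resulting control flow, reduced to a minimal sketch (check_chunks, ask_user and rebuild_chunks are hypothetical stand-ins for the real routines, not btrfs-progs API):

#include <stdio.h>

/* Hypothetical stand-ins for the real check/prompt/rebuild routines. */
static int check_chunks(void)      { return 0; }	/* 0 = no orphan chunks */
static int ask_user(const char *q) { (void)q; return 1; }
static int rebuild_chunks(void)    { return 0; }

/* Outline of the check-then-recover flow this patch adjusts. */
static int recover_chunk_tree(int yes)
{
	if (!check_chunks()) {
		fprintf(stderr, "Check chunks successfully with no orphans\n");
		return 0;	/* nothing to rebuild, so never reach ask_user() */
	}
	if (!yes && !ask_user("Rebuild the chunk tree?"))
		return 1;
	return rebuild_chunks();
}

int main(void)
{
	return recover_chunk_tree(0);
}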
Gui Hecheng
2013-Nov-28 05:32 UTC
[PATCH 3/4] btrfs-progs: add chunk-recover raid0/5/6 data stripe rebuild routine
Decide the raid0/5/6 data stripes' order using checksums. For each chunk, fetch every 64K logical stripe:
1. search its checksum in the csum tree
2. read the physical stripe data on each device
3. calculate the data checksums
4. if one checksum matches the value from the csum tree, then the logical stripe resides on that device, and the stripe order index can be calculated
5. if more than one checksum matches, use the successive csums in the tree to compare again
6. if equal stripes are encountered, just fetch the next stripe
7. if some devices' order still cannot be decided, then the chunk cannot be recovered

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
---
 chunk-recover.c | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 371 insertions(+)

diff --git a/chunk-recover.c b/chunk-recover.c
index 45d6eae..ac2a437 100644
--- a/chunk-recover.c
+++ b/chunk-recover.c
@@ -1566,6 +1566,371 @@ static int btrfs_rebuild_chunk_stripes(struct recover_control *rc,
 	return ret;
 }
 
+static int next_csum(struct btrfs_root *root,
+		     struct extent_buffer **leaf,
+		     struct btrfs_path *path,
+		     int *slot,
+		     u64 *csum_offset,
+		     u32 *tree_csum,
+		     u64 end,
+		     struct btrfs_key *key)
+{
+	int ret = 0;
+	struct btrfs_root *csum_root = root->fs_info->csum_root;
+	struct btrfs_csum_item *csum_item;
+	u32 blocksize = root->sectorsize;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int csums_in_item = btrfs_item_size_nr(*leaf, *slot) / csum_size;
+
+	if (*csum_offset >= csums_in_item) {
+		++(*slot);
+		*csum_offset = 0;
+		if (*slot >= btrfs_header_nritems(*leaf)) {
+			ret = btrfs_next_leaf(csum_root, path);
+			if (ret < 0)
+				return -1;
+			else if (ret > 0)
+				return 1;
+			*leaf = path->nodes[0];
+			*slot = path->slots[0];
+		}
+		btrfs_item_key_to_cpu(*leaf, key, *slot);
+	}
+
+	if (key->offset + (*csum_offset) * blocksize >= end)
+		return 2;
+	csum_item = btrfs_item_ptr(*leaf, *slot, struct btrfs_csum_item);
+	csum_item = (struct btrfs_csum_item *)((unsigned char *)csum_item
+					       + (*csum_offset) * csum_size);
+	read_extent_buffer(*leaf, tree_csum,
+			   (unsigned long)csum_item, csum_size);
+	return ret;
+}
+
+static u64 calc_data_offset(struct btrfs_key *key,
+			    struct chunk_record *chunk,
+			    u64 dev_offset,
+			    u64 csum_offset,
+			    u32 blocksize)
+{
+	u64 data_offset;
+	int logical_stripe_nr;
+	int dev_stripe_nr;
+	int nr_data_stripes;
+
+	data_offset = key->offset + csum_offset * blocksize - chunk->offset;
+	nr_data_stripes = chunk->num_stripes;
+
+	if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID5)
+		nr_data_stripes -= 1;
+	else if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6)
+		nr_data_stripes -= 2;
+
+	logical_stripe_nr = data_offset / chunk->stripe_len;
+	dev_stripe_nr = logical_stripe_nr / nr_data_stripes;
+
+	data_offset -= logical_stripe_nr * chunk->stripe_len;
+	data_offset += dev_stripe_nr * chunk->stripe_len;
+
+	return dev_offset + data_offset;
+}
+
+static int check_one_csum(int fd, u64 start, u32 len, u32 tree_csum)
+{
+	char *data;
+	int ret = 0;
+	u32 csum_result = ~(u32)0;
+
+	data = malloc(len);
+	if (!data)
+		return -1;
+	ret = pread64(fd, data, len, start);
+	if (ret < 0 || ret != len) {
+		ret = -1;
+		goto out;
+	}
+	ret = 0;
+	csum_result = btrfs_csum_data(NULL, data, csum_result, len);
+	btrfs_csum_final(csum_result, (char *)&csum_result);
+	if (csum_result != tree_csum)
+		ret = 1;
+out:
+	free(data);
+	return ret;
+}
+
+static u64 item_end_offset(struct btrfs_root *root, struct btrfs_key *key,
+			   struct extent_buffer *leaf, int slot)
+{
+	u32 blocksize = root->sectorsize;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
+	u64 offset = btrfs_item_size_nr(leaf, slot);
+	offset /= csum_size;
+	offset *= blocksize;
+	offset += key->offset;
+
+	return offset;
+}
+
+static int insert_stripe(struct list_head *devexts,
+			 struct recover_control *rc,
+			 struct chunk_record *chunk,
+			 int index)
+{
+	struct device_extent_record *devext;
+	struct btrfs_device *dev;
+
+	devext = list_entry(devexts->next, struct device_extent_record,
+			    chunk_list);
+	dev = btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
+					 0);
+	if (!dev)
+		return 1;
+	BUG_ON(btrfs_find_device_by_devid(rc->fs_devices, devext->objectid,
+					  1));
+
+	chunk->stripes[index].devid = devext->objectid;
+	chunk->stripes[index].offset = devext->offset;
+	memcpy(chunk->stripes[index].dev_uuid, dev->uuid, BTRFS_UUID_SIZE);
+
+	list_move(&devext->chunk_list, &chunk->dextents);
+
+	return 0;
+}
+
+#define EQUAL_STRIPE (1 << 0)
+
+static int rebuild_raid_data_chunk_stripes(struct recover_control *rc,
+					   struct btrfs_root *root,
+					   struct chunk_record *chunk,
+					   u8 *flags)
+{
+	int i;
+	int ret = 0;
+	int slot;
+	struct btrfs_path path;
+	struct btrfs_key prev_key;
+	struct btrfs_key key;
+	struct btrfs_root *csum_root;
+	struct extent_buffer *leaf;
+	struct device_extent_record *devext;
+	struct device_extent_record *next;
+	struct btrfs_device *dev;
+	u64 start = chunk->offset;
+	u64 end = start + chunk->stripe_len;
+	u64 chunk_end = chunk->offset + chunk->length;
+	u64 csum_offset = 0;
+	u64 data_offset;
+	u32 blocksize = root->sectorsize;
+	u32 tree_csum;
+	int index = 0;
+	int num_unordered = 0;
+	LIST_HEAD(unordered);
+	LIST_HEAD(candidates);
+
+	csum_root = root->fs_info->csum_root;
+	btrfs_init_path(&path);
+	list_splice_init(&chunk->dextents, &candidates);
+again:
+	if (list_is_last(candidates.next, &candidates))
+		goto out;
+
+	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+	key.offset = start;
+
+	ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
+	if (ret < 0) {
+		fprintf(stderr, "Search csum failed(%d)\n", ret);
+		goto fail_out;
+	}
+	leaf = path.nodes[0];
+	slot = path.slots[0];
+	if (ret > 0) {
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(csum_root, &path);
+			if (ret < 0) {
+				fprintf(stderr,
+					"Walk tree failed(%d)\n", ret);
+				goto fail_out;
+			} else if (ret > 0) {
+				slot = btrfs_header_nritems(leaf) - 1;
+				btrfs_item_key_to_cpu(leaf, &key, slot);
+				if (item_end_offset(root, &key, leaf, slot)
+				    > start) {
+					csum_offset = start - key.offset;
+					csum_offset /= blocksize;
+					goto next_csum;
+				}
+				goto next_stripe;
+			}
+			leaf = path.nodes[0];
+			slot = path.slots[0];
+		}
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		ret = btrfs_previous_item(csum_root, &path, 0,
+					  BTRFS_EXTENT_CSUM_KEY);
+		if (ret < 0)
+			goto fail_out;
+		else if (ret > 0) {
+			if (key.offset >= end)
+				goto next_stripe;
+			else
+				goto next_csum;
+		}
+		leaf = path.nodes[0];
+		slot = path.slots[0];
+
+		btrfs_item_key_to_cpu(leaf, &prev_key, slot);
+		if (item_end_offset(root, &prev_key, leaf, slot) > start) {
+			csum_offset = start - prev_key.offset;
+			csum_offset /= blocksize;
+			btrfs_item_key_to_cpu(leaf, &key, slot);
+		} else {
+			if (key.offset >= end)
+				goto next_stripe;
+		}
+
+		if (key.offset + csum_offset * blocksize > chunk_end)
+			goto out;
+	}
+next_csum:
+	ret = next_csum(root, &leaf, &path, &slot, &csum_offset, &tree_csum,
+			end, &key);
+	if (ret < 0) {
+		fprintf(stderr, "Fetch csum failed\n");
+		goto fail_out;
+	} else if (ret == 1) {
+		list_for_each_entry(devext, &unordered, chunk_list)
+			num_unordered++;
+		if (!(*flags & EQUAL_STRIPE))
+			*flags |= EQUAL_STRIPE;
+		goto out;
+	} else if (ret == 2)
+		goto next_stripe;
+
+	list_for_each_entry_safe(devext, next, &candidates, chunk_list) {
+		data_offset = calc_data_offset(&key, chunk, devext->offset,
+					       csum_offset, blocksize);
+		dev = btrfs_find_device_by_devid(rc->fs_devices,
+						 devext->objectid, 0);
+		if (!dev) {
+			ret = 1;
+			goto fail_out;
+		}
+		BUG_ON(btrfs_find_device_by_devid(rc->fs_devices,
+						  devext->objectid, 1));
+
+		ret = check_one_csum(dev->fd, data_offset, blocksize,
+				     tree_csum);
+		if (ret < 0)
+			goto fail_out;
+		else if (ret > 0)
+			list_move(&devext->chunk_list, &unordered);
+	}
+
+	if (list_empty(&candidates)) {
+		list_for_each_entry(devext, &unordered, chunk_list)
+			num_unordered++;
+		if (chunk->type_flags & BTRFS_BLOCK_GROUP_RAID6
+		    && num_unordered == 2) {
+			list_splice_init(&unordered, &chunk->dextents);
+			btrfs_release_path(&path);
+			return 0;
+		} else
+			ret = 1;
+
+		goto fail_out;
+	}
+
+	if (list_is_last(candidates.next, &candidates)) {
+		index = btrfs_calc_stripe_index(chunk,
+			key.offset + csum_offset * blocksize);
+		if (chunk->stripes[index].devid)
+			goto next_stripe;
+		ret = insert_stripe(&candidates, rc, chunk, index);
+		if (ret)
+			goto fail_out;
+	} else {
+		csum_offset++;
+		goto next_csum;
+	}
+next_stripe:
+	start = btrfs_next_stripe_logical_offset(chunk, start);
+	end = min(start + chunk->stripe_len, chunk_end);
+	list_splice_init(&unordered, &candidates);
+	btrfs_release_path(&path);
+	csum_offset = 0;
+	if (end < chunk_end)
+		goto again;
+out:
+	ret = 0;
+	list_splice_init(&candidates, &unordered);
+	list_for_each_entry(devext, &unordered, chunk_list)
+		num_unordered++;
+	if (num_unordered == 1) {
+		for (i = 0; i < chunk->num_stripes; i++) {
+			if (!chunk->stripes[i].devid) {
+				index = i;
+				break;
+			}
+		}
+		ret = insert_stripe(&unordered, rc, chunk, index);
+		if (ret)
+			goto fail_out;
+	} else {
+		if ((num_unordered == 2 && chunk->type_flags
+			& BTRFS_BLOCK_GROUP_RAID5)
+		    || (num_unordered == 3 && chunk->type_flags
+			& BTRFS_BLOCK_GROUP_RAID6)) {
+			for (i = 0; i < chunk->num_stripes; i++) {
+				if (!chunk->stripes[i].devid) {
+					ret = insert_stripe(&unordered, rc,
+							    chunk, i);
+					if (ret)
+						break;
+				}
+			}
+		}
+	}
+fail_out:
+	ret = !!ret || (list_empty(&unordered) ? 0 : 1);
+	list_splice_init(&candidates, &chunk->dextents);
+	list_splice_init(&unordered, &chunk->dextents);
+	btrfs_release_path(&path);
+
+	return ret;
+}
+
+static int btrfs_rebuild_ordered_data_chunk_stripes(struct recover_control *rc,
+						    struct btrfs_root *root)
+{
+	struct chunk_record *chunk;
+	struct chunk_record *next;
+	int ret = 0;
+	int err;
+	u8 flags;
+
+	list_for_each_entry_safe(chunk, next, &rc->unrepaired_chunks, list) {
+		if ((chunk->type_flags & BTRFS_BLOCK_GROUP_DATA)
+		    && (chunk->type_flags & BTRFS_ORDERED_RAID)) {
+			flags = 0;
+			err = rebuild_raid_data_chunk_stripes(rc, root, chunk,
+							      &flags);
+			if (err) {
+				list_move(&chunk->list, &rc->bad_chunks);
+				if (flags & EQUAL_STRIPE)
+					fprintf(stderr,
+						"Failure: too many equal stripes in chunk[%llu %llu]\n",
+						chunk->offset, chunk->length);
+				if (!ret)
+					ret = err;
+			} else
+				list_move(&chunk->list, &rc->good_chunks);
+		}
+	}
+	return ret;
+}
+
 static int btrfs_recover_chunks(struct recover_control *rc)
 {
 	struct chunk_record *chunk;
@@ -1703,6 +2068,12 @@ int btrfs_recover_chunk_tree(char *path, int verbose, int yes)
 		goto fail_close_ctree;
 	}
 
+	ret = btrfs_rebuild_ordered_data_chunk_stripes(&rc, root);
+	if (ret) {
+		fprintf(stderr, "Failed to rebuild ordered chunk stripes.\n");
+		goto fail_close_ctree;
+	}
+
 	if (!rc.yes) {
 		ret = ask_user("We are going to rebuild the chunk tree on disk, it might destroy the old metadata on the disk, Are you sure?");
 		if (!ret) {
--
1.8.0.1
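The stripe-index arithmetic in calc_data_offset() (step 4 of the ordering algorithm above) is easiest to follow with concrete numbers. Below is a standalone sketch of the same RAID5 arithmetic with one case worked out in the comments; the function name and parameters are illustrative stand-ins, not btrfs-progs API.

#include <stdint.h>
#include <stdio.h>

/*
 * Standalone copy of the calc_data_offset() arithmetic for a RAID5
 * chunk with 3 stripes (2 data + 1 parity) and a 64K stripe_len.
 *
 * Worked example: chunk starts at logical 0, csum covers logical 192K.
 *   off (into chunk)   = 192K
 *   nr_data            = 3 - 1 = 2
 *   lsn                = 192K / 64K = 3   (4th logical stripe)
 *   dsn                = 3 / 2 = 1        (2nd stripe row per device)
 *   result             = 192K - 3*64K + 1*64K = 64K into the device extent
 */
static uint64_t data_offset_raid5(uint64_t logical, uint64_t chunk_start,
				  uint64_t dev_offset, uint64_t stripe_len,
				  int num_stripes)
{
	uint64_t off = logical - chunk_start;
	int nr_data = num_stripes - 1;		/* RAID5: one parity stripe */
	uint64_t lsn = off / stripe_len;	/* logical stripe number */
	uint64_t dsn = lsn / nr_data;		/* stripe row on each device */

	return dev_offset + off - lsn * stripe_len + dsn * stripe_len;
}

int main(void)
{
	/* Prints 65536, i.e. 64K, matching the worked example above. */
	printf("%llu\n", (unsigned long long)
	       data_offset_raid5(192 * 1024, 0, 0, 64 * 1024, 3));
	return 0;
}

The physical offset computed this way is where check_one_csum() reads each candidate device, so a checksum match pins the logical stripe to exactly one device extent.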
Gui Hecheng
2013-Nov-28 05:32 UTC
[PATCH 4/4] btrfs-progs: scan devices in parallel for chunk-recover
Originally, multiple devices were scanned one by one; now one thread per device is used to scan them in parallel.

Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com>
---
 chunk-recover.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 82 insertions(+), 7 deletions(-)

diff --git a/chunk-recover.c b/chunk-recover.c
index ac2a437..d103e17 100644
--- a/chunk-recover.c
+++ b/chunk-recover.c
@@ -26,6 +26,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <uuid/uuid.h>
+#include <pthread.h>
 
 #include "kerncompat.h"
 #include "list.h"
@@ -64,6 +65,7 @@ struct recover_control {
 	struct list_head good_chunks;
 	struct list_head bad_chunks;
 	struct list_head unrepaired_chunks;
+	pthread_mutex_t rc_lock;
 };
 
 struct extent_record {
@@ -75,6 +77,12 @@ struct extent_record {
 	int nmirrors;
 };
 
+struct device_scan {
+	struct recover_control *rc;
+	struct btrfs_device *dev;
+	int fd;
+};
+
 static struct extent_record *btrfs_new_extent_record(struct extent_buffer *eb)
 {
 	struct extent_record *rec;
@@ -202,6 +210,7 @@ static void init_recover_control(struct recover_control *rc, int verbose,
 
 	rc->verbose = verbose;
 	rc->yes = yes;
+	pthread_mutex_init(&rc->rc_lock, NULL);
 }
 
 static void free_recover_control(struct recover_control *rc)
@@ -210,6 +219,7 @@ static void free_recover_control(struct recover_control *rc)
 	free_chunk_cache_tree(&rc->chunk);
 	free_device_extent_tree(&rc->devext);
 	free_extent_record_tree(&rc->eb_cache);
+	pthread_mutex_destroy(&rc->rc_lock);
 }
 
 static int process_block_group_item(struct block_group_tree *bg_cache,
@@ -694,14 +704,20 @@ static int extract_metadata_record(struct recover_control *rc,
 		btrfs_item_key_to_cpu(leaf, &key, i);
 		switch (key.type) {
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
+			pthread_mutex_lock(&rc->rc_lock);
 			ret = process_block_group_item(&rc->bg, leaf, &key, i);
+			pthread_mutex_unlock(&rc->rc_lock);
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
+			pthread_mutex_lock(&rc->rc_lock);
 			ret = process_chunk_item(&rc->chunk, leaf, &key, i);
+			pthread_mutex_unlock(&rc->rc_lock);
 			break;
 		case BTRFS_DEV_EXTENT_KEY:
+			pthread_mutex_lock(&rc->rc_lock);
 			ret = process_device_extent_item(&rc->devext, leaf,
 							 &key, i);
+			pthread_mutex_unlock(&rc->rc_lock);
 			break;
 		}
 		if (ret)
@@ -721,12 +737,19 @@ static inline int is_super_block_address(u64 offset)
 	return 0;
 }
 
-static int scan_one_device(struct recover_control *rc, int fd,
-			   struct btrfs_device *device)
+static int scan_one_device(void *dev_scan_struct)
 {
 	struct extent_buffer *buf;
 	u64 bytenr;
 	int ret = 0;
+	struct device_scan *dev_scan = (struct device_scan *)dev_scan_struct;
+	struct recover_control *rc = dev_scan->rc;
+	struct btrfs_device *device = dev_scan->dev;
+	int fd = dev_scan->fd;
+
+	ret = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+	if (ret)
+		return 1;
 
 	buf = malloc(sizeof(*buf) + rc->leafsize);
 	if (!buf)
@@ -754,7 +777,9 @@ static int scan_one_device(struct recover_control *rc, int fd,
 			continue;
 		}
 
+		pthread_mutex_lock(&rc->rc_lock);
 		ret = process_extent_buffer(&rc->eb_cache, buf, device, bytenr);
+		pthread_mutex_unlock(&rc->rc_lock);
 		if (ret)
 			goto out;
 
@@ -784,6 +809,7 @@ next_node:
 		bytenr += rc->leafsize;
 	}
 out:
+	close(fd);
 	free(buf);
 	return ret;
 }
@@ -793,6 +819,27 @@ static int scan_devices(struct recover_control *rc)
 	int ret = 0;
 	int fd;
 	struct btrfs_device *dev;
+	struct device_scan *dev_scans;
+	pthread_t *t_scans;
+	int *t_rets;
+	int devnr = 0;
+	int devidx = 0;
+	int cancel_from = 0;
+	int cancel_to = 0;
+	int i;
+
+	list_for_each_entry(dev, &rc->fs_devices->devices, dev_list)
+		devnr++;
+	dev_scans = (struct device_scan *)malloc(sizeof(struct device_scan)
+						 * devnr);
+	if (!dev_scans)
+		return -ENOMEM;
+	t_scans = (pthread_t *)malloc(sizeof(pthread_t) * devnr);
+	if (!t_scans)
+		return -ENOMEM;
+	t_rets = (int *)malloc(sizeof(int) * devnr);
+	if (!t_rets)
+		return -ENOMEM;
 
 	list_for_each_entry(dev, &rc->fs_devices->devices, dev_list) {
 		fd = open(dev->name, O_RDONLY);
@@ -801,12 +848,40 @@ static int scan_devices(struct recover_control *rc)
 				dev->name);
 			return -1;
 		}
-		ret = scan_one_device(rc, fd, dev);
-		close(fd);
-		if (ret)
-			return ret;
+		dev_scans[devidx].rc = rc;
+		dev_scans[devidx].dev = dev;
+		dev_scans[devidx].fd = fd;
+		ret = pthread_create(&t_scans[devidx], NULL,
+				     (void *)scan_one_device,
+				     (void *)&dev_scans[devidx]);
+		if (ret) {
+			cancel_from = 0;
+			cancel_to = devidx - 1;
+			goto out;
+		}
+		devidx++;
 	}
-	return ret;
+
+	i = 0;
+	while (i < devidx) {
+		ret = pthread_join(t_scans[i], (void **)&t_rets[i]);
+		if (ret || t_rets[i]) {
+			ret = 1;
+			cancel_from = i + 1;
+			cancel_to = devnr - 1;
+			break;
+		}
+		i++;
+	}
+out:
+	while (cancel_from <= cancel_to) {
+		pthread_cancel(t_scans[cancel_from]);
+		cancel_from++;
+	}
+	free(dev_scans);
+	free(t_scans);
+	free(t_rets);
+	return !!ret;
}
 
 static int build_device_map_by_chunk_record(struct btrfs_root *root,
--
1.8.0.1
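The thread-per-device structure of the new scan_devices() can be seen in isolation in this sketch. It is a simplified model with placeholder work: one worker per device is launched, then every started thread is joined and the first failure is recorded. Unlike the patch, it joins stragglers instead of cancelling them, which is the simpler shutdown path; all names here are local to the example.

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 4

/* Placeholder per-device job: return NULL on success, non-NULL on error. */
static void *scan_worker(void *arg)
{
	int idx = *(int *)arg;
	/* ... scan one device here, taking a shared lock around any
	 * updates to shared caches, as the patch does with rc_lock ... */
	printf("worker %d done\n", idx);
	return NULL;
}

int main(void)
{
	pthread_t tids[NWORKERS];
	int args[NWORKERS];
	int started = 0, failed = 0, i;

	for (i = 0; i < NWORKERS; i++) {
		args[i] = i;
		if (pthread_create(&tids[i], NULL, scan_worker, &args[i])) {
			failed = 1;	/* stop launching, clean up below */
			break;
		}
		started++;
	}

	/* Join every thread that was started; record the first failure. */
	for (i = 0; i < started; i++) {
		void *res;
		if (pthread_join(tids[i], &res) || res)
			failed = 1;
	}

	return failed;
}

Build with -lpthread. The single shared mutex in the patch serializes all cache updates, so the parallelism pays off only for the I/O and checksum work done outside the lock; that is the intended hot path when scanning multiple disks.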