This is a preliminary attempt to add RAID5 and RAID6 support.

So far it doesn't attempt to write or read the parity blocks -- it just
lays the data blocks out as we want them, so it's effectively just a
complex and wasteful kind of RAID0.

The next step is to make btrfs_map_bio() do the right thing:
 - Satisfy read requests for mirrors #2 and #3 by recreating data from
   the RAID5 parity or RAID6 error correction stripe respectively.
 - Write out parity and RAID6 blocks appropriately when data writes
   happen.

The former is relatively easy; the latter is slightly more interesting.
Chris suggests that we can avoid read/modify/write cycles for the parity
blocks by ensuring that the file system always writes a full set of
stripes. So for a RAID5 of 4 disks with 64KiB stripe_len, that would be
a 192KiB minimum write size, for example.

I'm not entirely sure of the best way to do that -- can we set a minimum
allocation size for a chunk, and then maybe have it fall back to RAID1
(or a RAID5 chunk with smaller stripe_len) for smaller allocations if
they'd be too wasteful on the larger RAID5 chunks? And how would we
handle nodatacow?

I think I'm going to do a crappy r/m/w thing for now (in the knowledge
that the error correction stripes won't be powerfail-safe), and then we
can set about trying to render it unnecessary.

(Yes, I know I need to fix up btrfs_discard_extent() for RAID5 too --
it doesn't discard the parity stripes, and I may want to make it avoid
discarding partial stripes for now, until we fix the above.)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a8738..40168d7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -653,6 +653,8 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP      (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
 
 struct btrfs_block_group_item {
     __le64 used;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d829ef3..fadec64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2496,6 +2496,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
     u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
                    BTRFS_BLOCK_GROUP_RAID1 |
+                   BTRFS_BLOCK_GROUP_RAID5 |
+                   BTRFS_BLOCK_GROUP_RAID6 |
                    BTRFS_BLOCK_GROUP_RAID10 |
                    BTRFS_BLOCK_GROUP_DUP);
     if (extra_flags) {
@@ -2524,29 +2526,34 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
     u64 num_devices = root->fs_info->fs_devices->rw_devices;
+    u64 tmp;
 
+    /* First, mask out the RAID levels which aren't possible */
     if (num_devices == 1)
-        flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+        flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID5);
+    if (num_devices < 3)
+        flags &= ~BTRFS_BLOCK_GROUP_RAID6;
     if (num_devices < 4)
        flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-    if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-        (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-              BTRFS_BLOCK_GROUP_RAID10))) {
-        flags &= ~BTRFS_BLOCK_GROUP_DUP;
-    }
+    tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+               BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+    flags &= ~tmp;
 
-    if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-        (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-        flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-    }
+    if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+        tmp = BTRFS_BLOCK_GROUP_RAID6;
+    else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+        tmp = BTRFS_BLOCK_GROUP_RAID5;
+    else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+        tmp = BTRFS_BLOCK_GROUP_RAID10;
+    else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+        tmp = BTRFS_BLOCK_GROUP_RAID1;
+    else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+        tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-    if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-        ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-         (flags & BTRFS_BLOCK_GROUP_RAID10) |
-         (flags & BTRFS_BLOCK_GROUP_DUP)))
-        flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-    return flags;
+    return flags | tmp;
 }
 
 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
@@ -6548,6 +6555,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
     u64 num_devices;
     u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+        BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
         BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
     num_devices = root->fs_info->fs_devices->rw_devices;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1f10ea..3b231ef 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,21 @@ struct map_lookup {
     struct btrfs_bio_stripe stripes[];
 };
 
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+    if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+        return 1;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+        return 2;
+    else
+        return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+    return map->num_stripes - nr_parity_stripes(map);
+}
+
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root,
                 struct btrfs_device *device);
@@ -1140,6 +1155,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         goto out;
     }
 
+    if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+        root->fs_info->fs_devices->rw_devices <= 2) {
+        printk(KERN_ERR "btrfs: unable to go below two "
+               "devices on raid5\n");
+        ret = -EINVAL;
+        goto out;
+    }
+    if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+        root->fs_info->fs_devices->rw_devices <= 3) {
+        printk(KERN_ERR "btrfs: unable to go below three "
+               "devices on raid6\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
     if (strcmp(device_path, "missing") == 0) {
         struct list_head *devices;
         struct btrfs_device *tmp;
@@ -2090,6 +2120,10 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
         return calc_size;
     else if (type & BTRFS_BLOCK_GROUP_RAID10)
         return calc_size * (num_stripes / sub_stripes);
+    else if (type & BTRFS_BLOCK_GROUP_RAID5)
+        return calc_size * (num_stripes - 1);
+    else if (type & BTRFS_BLOCK_GROUP_RAID6)
+        return calc_size * (num_stripes - 2);
     else
         return calc_size * num_stripes;
 }
@@ -2153,6 +2187,18 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         sub_stripes = 2;
         min_stripes = 4;
     }
+    if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
+        num_stripes = fs_devices->rw_devices;
+        if (num_stripes < 2)
+            return -ENOSPC;
+        min_stripes = 2;
+    }
+    if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
+        num_stripes = fs_devices->rw_devices;
+        if (num_stripes < 3)
+            return -ENOSPC;
+        min_stripes = 3;
+    }
 
     if (type & BTRFS_BLOCK_GROUP_DATA) {
         max_chunk_size = 10 * calc_size;
@@ -2539,6 +2585,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
         ret = map->num_stripes;
     else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
         ret = map->sub_stripes;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+        ret = 2;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+        ret = 3;
     else
         ret = 1;
     free_extent_map(em);
@@ -2645,6 +2695,7 @@ again:
     stripe_offset = offset - stripe_offset;
 
     if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+             BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
              BTRFS_BLOCK_GROUP_RAID10 |
              BTRFS_BLOCK_GROUP_DUP)) {
         /* we limit the length of each bio to what fits in a stripe */
@@ -2691,6 +2742,25 @@ again:
                           map->sub_stripes, stripe_index +
                           current->pid % map->sub_stripes);
         }
+
+    } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+                BTRFS_BLOCK_GROUP_RAID6)) {
+        u64 tmp;
+
+        stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+        /*
+         * Mirror #0 or #1 means the original data block.
+         * Mirror #2 is RAID5 parity block.
+         * Mirror #3 is RAID6 Q block.
+         */
+        if (mirror_num > 1)
+            stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+        /* We distribute the parity blocks across stripes */
+        tmp = stripe_nr + stripe_index;
+        stripe_index = do_div(tmp, map->num_stripes);
+
     } else {
         /*
          * after this do_div call, stripe_nr is the number of stripes
@@ -2749,6 +2819,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
     u64 bytenr;
     u64 length;
     u64 stripe_nr;
+    u64 rmap_len;
     int i, j, nr = 0;
 
     spin_lock(&em_tree->lock);
@@ -2759,10 +2830,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
     map = (struct map_lookup *)em->bdev;
 
     length = em->len;
+    rmap_len = map->stripe_len;
+
     if (map->type & BTRFS_BLOCK_GROUP_RAID10)
         do_div(length, map->num_stripes / map->sub_stripes);
     else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
         do_div(length, map->num_stripes);
+    else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+                  BTRFS_BLOCK_GROUP_RAID6)) {
+        do_div(length, nr_data_stripes(map));
+        rmap_len = map->stripe_len * nr_data_stripes(map);
+    }
 
     buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
     BUG_ON(!buf);
@@ -2782,8 +2860,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
             do_div(stripe_nr, map->sub_stripes);
         } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
             stripe_nr = stripe_nr * map->num_stripes + i;
-        }
-        bytenr = chunk_start + stripe_nr * map->stripe_len;
+        } /* else if RAID[56], multiply by nr_data_stripes().
+           * Alternatively, just use rmap_len below instead of
+           * map->stripe_len */
+
+        bytenr = chunk_start + stripe_nr * rmap_len;
         WARN_ON(nr >= map->num_stripes);
         for (j = 0; j < nr; j++) {
             if (buf[j] == bytenr)
@@ -2797,7 +2878,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
     *logical = buf;
     *naddrs = nr;
-    *stripe_len = map->stripe_len;
+    *stripe_len = rmap_len;
 
     free_extent_map(em);
     return 0;

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
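To make the mapping above concrete, here is a minimal stand-alone sketch
of the same arithmetic -- plain userspace C, not btrfs code, and the
four-disk count and 64KiB stripe_len are made-up example values (use
nr_parity = 2 for RAID6):

#include <stdio.h>
#include <stdint.h>

/* Toy model of the RAID[56] branch of __btrfs_map_block():
 * map a logical offset within the chunk to (stripe-set, disk),
 * rotating the parity by one disk per stripe-set. */
int main(void)
{
    int num_stripes = 4;                /* disks in the chunk */
    int nr_parity = 1;                  /* 1 for RAID5, 2 for RAID6 */
    int nr_data = num_stripes - nr_parity;
    uint64_t stripe_len = 64 * 1024;
    uint64_t logical, stripe_nr;
    int stripe_index, disk;

    for (logical = 0; logical < 8 * stripe_len; logical += stripe_len) {
        stripe_nr = logical / stripe_len;
        stripe_index = stripe_nr % nr_data; /* do_div(stripe_nr, nr_data_stripes(map)) */
        stripe_nr /= nr_data;               /* which stripe-set */

        /* parity rotation, as in the final do_div() above */
        disk = (stripe_nr + stripe_index) % num_stripes;
        printf("logical %8llu -> set %llu, data stripe %d, disk %d\n",
               (unsigned long long)logical,
               (unsigned long long)stripe_nr, stripe_index, disk);
    }
    return 0;
}

Running it shows the data stripes walking across the disks with one disk
per set left free for parity, which is exactly the layout the patch
produces before any parity is actually written.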
On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> This is a preliminary attempt to add RAID5 and RAID6 support.

Matching btrfs-progs patch...

diff --git a/ctree.h b/ctree.h
index a9062ea..5b3c690 100644
--- a/ctree.h
+++ b/ctree.h
@@ -640,6 +640,8 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP      (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
 
 struct btrfs_block_group_item {
     __le64 used;
diff --git a/extent-tree.c b/extent-tree.c
index b2f9bb2..77cfcb5 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1775,6 +1775,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
     u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
                    BTRFS_BLOCK_GROUP_RAID1 |
+                   BTRFS_BLOCK_GROUP_RAID5 |
+                   BTRFS_BLOCK_GROUP_RAID6 |
                    BTRFS_BLOCK_GROUP_DUP);
     if (extra_flags) {
         if (flags & BTRFS_BLOCK_GROUP_DATA)
diff --git a/mkfs.c b/mkfs.c
index 2e99b95..aefe1af 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -203,16 +203,22 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
                   u64 metadata_profile)
 {
     u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
-    u64 allowed;
+    u64 allowed = 0;
     int ret;
 
-    if (num_devices == 1)
-        allowed = BTRFS_BLOCK_GROUP_DUP;
-    else if (num_devices >= 4) {
-        allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-            BTRFS_BLOCK_GROUP_RAID10;
-    } else
-        allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1;
+    switch (num_devices) {
+    default:
+    case 4:
+        allowed |= BTRFS_BLOCK_GROUP_RAID10;
+    case 3:
+        allowed |= BTRFS_BLOCK_GROUP_RAID6;
+    case 2:
+        allowed |= BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+            BTRFS_BLOCK_GROUP_RAID5;
+        break;
+    case 1:
+        allowed |= BTRFS_BLOCK_GROUP_DUP;
+    }
 
     if (allowed & metadata_profile) {
         ret = create_one_raid_group(trans, root,
@@ -292,6 +298,10 @@ static u64 parse_profile(char *s)
         return BTRFS_BLOCK_GROUP_RAID0;
     } else if (strcmp(s, "raid1") == 0) {
         return BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP;
+    } else if (strcmp(s, "raid5") == 0) {
+        return BTRFS_BLOCK_GROUP_RAID5;
+    } else if (strcmp(s, "raid6") == 0) {
+        return BTRFS_BLOCK_GROUP_RAID6;
     } else if (strcmp(s, "raid10") == 0) {
         return BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP;
     } else if (strcmp(s, "single") == 0) {
diff --git a/volumes.c b/volumes.c
index 7671855..90090b0 100644
--- a/volumes.c
+++ b/volumes.c
@@ -47,6 +47,21 @@ struct map_lookup {
     struct btrfs_bio_stripe stripes[];
 };
 
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+    if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+        return 1;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+        return 2;
+    else
+        return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+    return map->num_stripes - nr_parity_stripes(map);
+}
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                 (sizeof(struct btrfs_bio_stripe) * (n)))
@@ -623,6 +638,10 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
         return calc_size;
     else if (type & BTRFS_BLOCK_GROUP_RAID10)
         return calc_size * (num_stripes / sub_stripes);
+    else if (type & BTRFS_BLOCK_GROUP_RAID5)
+        return calc_size * (num_stripes - 1);
+    else if (type & BTRFS_BLOCK_GROUP_RAID6)
+        return calc_size * (num_stripes - 2);
     else
         return calc_size * num_stripes;
 }
@@ -664,6 +683,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
     }
 
     if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+            BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
             BTRFS_BLOCK_GROUP_RAID10 |
             BTRFS_BLOCK_GROUP_DUP)) {
         if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@@ -703,6 +723,18 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
         sub_stripes = 2;
         min_stripes = 4;
     }
+    if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
+        num_stripes = btrfs_super_num_devices(&info->super_copy);
+        if (num_stripes < 2)
+            return -ENOSPC;
+        min_stripes = 2;
+    }
+    if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
+        num_stripes = btrfs_super_num_devices(&info->super_copy);
+        if (num_stripes < 3)
+            return -ENOSPC;
+        min_stripes = 3;
+    }
 
     /* we don't want a chunk larger than 10% of the FS */
     percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
@@ -879,6 +911,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
         ret = map->num_stripes;
     else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
         ret = map->sub_stripes;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+        ret = 2;
+    else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+        ret = 3;
     else
         ret = 1;
     return ret;
@@ -894,6 +930,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
     u64 bytenr;
     u64 length;
     u64 stripe_nr;
+    u64 rmap_len;
     int i, j, nr = 0;
 
     ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start);
@@ -901,10 +938,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
     map = container_of(ce, struct map_lookup, ce);
 
     length = ce->size;
+    rmap_len = map->stripe_len;
     if (map->type & BTRFS_BLOCK_GROUP_RAID10)
         length = ce->size / (map->num_stripes / map->sub_stripes);
     else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
         length = ce->size / map->num_stripes;
+    else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+                  BTRFS_BLOCK_GROUP_RAID6)) {
+        length = ce->size / nr_data_stripes(map);
+        rmap_len = map->stripe_len * nr_data_stripes(map);
+    }
 
     buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
@@ -923,8 +966,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
                     map->sub_stripes;
         } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
             stripe_nr = stripe_nr * map->num_stripes + i;
-        }
-        bytenr = ce->start + stripe_nr * map->stripe_len;
+        } /* else if RAID[56], multiply by nr_data_stripes().
+           * Alternatively, just use rmap_len below instead of
+           * map->stripe_len */
+
+        bytenr = ce->start + stripe_nr * rmap_len;
         for (j = 0; j < nr; j++) {
             if (buf[j] == bytenr)
                 break;
@@ -935,7 +981,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
     *logical = buf;
     *naddrs = nr;
-    *stripe_len = map->stripe_len;
+    *stripe_len = rmap_len;
 
     return 0;
 }
@@ -1001,6 +1047,7 @@ again:
     stripe_offset = offset - stripe_offset;
 
     if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+             BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
              BTRFS_BLOCK_GROUP_RAID10 |
              BTRFS_BLOCK_GROUP_DUP)) {
         /* we limit the length of each bio to what fits in a stripe */
@@ -1041,6 +1088,23 @@ again:
             multi->num_stripes = map->num_stripes;
         else if (mirror_num)
             stripe_index = mirror_num - 1;
+    } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+                BTRFS_BLOCK_GROUP_RAID6)) {
+
+        stripe_index = stripe_nr % nr_data_stripes(map);
+        stripe_nr = stripe_nr / nr_data_stripes(map);
+
+        /*
+         * Mirror #0 or #1 means the original data block.
+         * Mirror #2 is RAID5 parity block.
+         * Mirror #3 is RAID6 Q block.
+         */
+        if (mirror_num > 1)
+            stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+        /* We distribute the parity blocks across stripes */
+        stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+
     } else {
         /*
          * after this do_div call, stripe_nr is the number of stripes

(Note: the rotation wants a true modulo, `% map->num_stripes`, not the
bitwise `&` that would only work for power-of-two stripe counts; the
kernel side gets the same effect via do_div().)

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
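As a back-of-envelope check on the chunk_bytes_by_type() arithmetic
above -- a hedged, stand-alone sketch where the 256MiB per-device
calc_size is an arbitrary example value:

#include <stdio.h>
#include <stdint.h>

/* Usable bytes per chunk for the parity profiles: the per-device slice
 * times the number of data stripes, matching the patch's formulas. */
static uint64_t usable_bytes(uint64_t calc_size, int num_stripes, int nr_parity)
{
    return calc_size * (num_stripes - nr_parity);
}

int main(void)
{
    uint64_t calc_size = 256ULL * 1024 * 1024;  /* example slice */

    printf("RAID5 on 4 devices: %llu MiB usable\n",
           (unsigned long long)(usable_bytes(calc_size, 4, 1) >> 20)); /* 768 */
    printf("RAID6 on 4 devices: %llu MiB usable\n",
           (unsigned long long)(usable_bytes(calc_size, 4, 2) >> 20)); /* 512 */
    return 0;
}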
On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> This is a preliminary attempt to add RAID5 and RAID6 support.
>
> So far it doesn't attempt to write or read the parity blocks -- it just
> lays the data blocks out as we want them, so it's effectively just a
> complex and wasteful kind of RAID0.
>
> The next step is to make btrfs_map_bio() do the right thing:
>  - Satisfy read requests for mirrors #2 and #3 by recreating data from
>    RAID5 parity or RAID6 error correction stripe respectively.
>  - Write out parity and RAID6 blocks appropriately when data writes
>    happen.

Actually, the next step is to tweak __btrfs_map_block() a bit more to
let it return information about the whole stripe-set, so that
btrfs_map_bio() _can_ do what we say above...

So rather than just mapping the requested address as if it's RAID0, we
(where appropriate) return information about the _entire_ disk set in
the btrfs_multi_bio, with an auxiliary array giving the _logical_ offset
corresponding to each physical stripe in the referenced set (with
special values for the P and Q stripes). We do this for all writes, and
for reads where mirror_num > 1 (i.e. when we're being asked to rebuild
it from parity, rather than reading the original data blocks).

git://, http://git.infradead.org/users/dwmw2/btrfs-raid56.git

commit ed90c58ba7c60555af4b8c00a104c7d71f6db6d2
Author: David Woodhouse <David.Woodhouse@intel.com>
Date:   Sun Jul 12 11:15:22 2009 +0100

    Btrfs: Let btrfs_map_block() return full stripe information for RAID[56]

    ... in the cases where it's necessary -- which is for a write, or for a
    parity recovery attempt. We'll let btrfs_map_bio() do the rest.

    Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b231ef..55facd3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -62,6 +62,11 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                 struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
+#define RAID5_P_STRIPE ((u64)-1)
+#define RAID6_Q_STRIPE ((u64)-2)
+
+#define is_parity_stripe(x) ( ((x) == RAID5_P_STRIPE) || ((x) == RAID6_Q_STRIPE) )
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                 (sizeof(struct btrfs_bio_stripe) * (n)))
@@ -2614,7 +2619,8 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                  u64 logical, u64 *length,
                  struct btrfs_multi_bio **multi_ret,
-                 int mirror_num, struct page *unplug_page)
+                 int mirror_num, struct page *unplug_page,
+                 u64 **raid_map_ret)
 {
     struct extent_map *em;
     struct map_lookup *map;
@@ -2622,6 +2628,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
     u64 offset;
     u64 stripe_offset;
     u64 stripe_nr;
+    u64 *raid_map = NULL;
     int stripes_allocated = 8;
     int stripes_required = 1;
     int stripe_index;
@@ -2674,9 +2681,24 @@ again:
             max_errors = 1;
         }
     }
-    if (multi_ret && (rw & (1 << BIO_RW)) &&
-        stripes_allocated < stripes_required) {
-        stripes_allocated = map->num_stripes;
+    if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
+        && multi_ret && (rw & (1 << BIO_RW) || mirror_num > 1) && raid_map_ret) {
+        /* RAID[56] write or recovery. Return all stripes */
+        stripes_required = map->num_stripes;
+        max_errors = nr_parity_stripes(map);
+
+        /* Only allocate the map if we've already got a large enough multi_ret */
+        if (stripes_allocated >= stripes_required) {
+            raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+            if (!raid_map) {
+                free_extent_map(em);
+                kfree(multi);
+                return -ENOMEM;
+            }
+        }
+    }
+    if (multi_ret && stripes_allocated < stripes_required) {
+        stripes_allocated = stripes_required;
         free_extent_map(em);
         kfree(multi);
         goto again;
@@ -2749,18 +2771,43 @@ again:
 
         stripe_index = do_div(stripe_nr, nr_data_stripes(map));
 
-        /*
-         * Mirror #0 or #1 means the original data block.
-         * Mirror #2 is RAID5 parity block.
-         * Mirror #3 is RAID6 Q block.
-         */
-        if (mirror_num > 1)
-            stripe_index = nr_data_stripes(map) + mirror_num - 2;
-
-        /* We distribute the parity blocks across stripes */
-        tmp = stripe_nr + stripe_index;
-        stripe_index = do_div(tmp, map->num_stripes);
-
+        if (unplug_page) {
+            stripe_index = 0;
+            num_stripes = map->num_stripes;
+        } else if (raid_map) {
+            int i, rot;
+
+            /* Work out the disk rotation on this stripe-set */
+            tmp = stripe_nr;
+            rot = do_div(tmp, map->num_stripes);
+
+            /* Fill in the logical address of each stripe */
+            tmp = stripe_nr * nr_data_stripes(map);
+            for (i = 0; i < nr_data_stripes(map); i++)
+                raid_map[(i+rot) % map->num_stripes] =
+                    em->start + (tmp + i) * map->stripe_len;
+
+            raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+            if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+                raid_map[(i+rot+1) % map->num_stripes] = RAID6_Q_STRIPE;
+
+            *length = map->stripe_len;
+            stripe_index = 0;
+            stripe_offset = 0;
+            num_stripes = map->num_stripes;
+        } else {
+            /*
+             * Mirror #0 or #1 means the original data block.
+             * Mirror #2 is RAID5 parity block.
+             * Mirror #3 is RAID6 Q block.
+             */
+            if (mirror_num > 1)
+                stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+            /* We distribute the parity blocks across stripes */
+            tmp = stripe_nr + stripe_index;
+            stripe_index = do_div(tmp, map->num_stripes);
+        }
     } else {
         /*
          * after this do_div call, stripe_nr is the number of stripes
@@ -2795,6 +2842,8 @@ again:
         multi->num_stripes = num_stripes;
         multi->max_errors = max_errors;
     }
+    if (raid_map_ret)
+        *raid_map_ret = raid_map;
 out:
     free_extent_map(em);
     return 0;
@@ -2805,7 +2854,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
             struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
     return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
-                 mirror_num, NULL);
+                 mirror_num, NULL, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2889,7 +2938,7 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 {
     u64 length = PAGE_CACHE_SIZE;
     return __btrfs_map_block(map_tree, READ, logical, &length,
-                 NULL, 0, page);
+                 NULL, 0, page, NULL);
 }
 
 static void end_bio_multi_stripe(struct bio *bio, int err)

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
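To visualise what the raid_map fill loop above produces: for each
physical stripe in the set it records the logical address of the data
living there, or one of the two sentinel values for P and Q. A toy
version (the five-disk RAID6 geometry and 64KiB stripe_len are example
values only; -1/-2 mirror RAID5_P_STRIPE/RAID6_Q_STRIPE):

#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-1)
#define RAID6_Q_STRIPE ((uint64_t)-2)

int main(void)
{
    int num_stripes = 5, nr_data = 3;   /* RAID6 across five disks */
    uint64_t stripe_len = 64 * 1024;
    uint64_t raid_map[5];
    uint64_t stripe_nr, tmp;
    int i, rot;

    for (stripe_nr = 0; stripe_nr < 5; stripe_nr++) {
        rot = stripe_nr % num_stripes;  /* disk rotation on this set */
        tmp = stripe_nr * nr_data;
        for (i = 0; i < nr_data; i++)
            raid_map[(i + rot) % num_stripes] = (tmp + i) * stripe_len;
        raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;
        raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE;

        printf("set %llu:", (unsigned long long)stripe_nr);
        for (i = 0; i < num_stripes; i++) {
            if (raid_map[i] == RAID5_P_STRIPE)
                printf("    P");
            else if (raid_map[i] == RAID6_Q_STRIPE)
                printf("    Q");
            else    /* logical stripe number on this disk */
                printf(" %4llu",
                       (unsigned long long)(raid_map[i] / stripe_len));
        }
        printf("\n");
    }
    return 0;
}

Each row shows P and Q stepping one disk to the right per stripe-set,
with the data stripes wrapping around them.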
This is a fairly crap hack. Even if the file system _does_ want to write
a full stripe-set at a time, the merge_bio_hook logic will prevent it
from doing so, and ensure that we always have to read the other stripes
to recreate the parity -- with all the concurrency issues that involves.
The raid_hack_mutex is a brute-force approach which isn't even really
sufficient to fix the RAID5 'write hole' problem, but does protect us
against simultaneous writes to different data stripes in the same set.

This hack serves two purposes:
 - It does actually write parity (and RAID6 syndrome) blocks so that I
   can implement and test the recovery.
 - It will hopefully offend someone more clueful about the higher layers
   of the filesystem, provoking them to help me with the task of
   ensuring that we only ever write full stripe-sets (or at least that
   we can waste the remaining free space in the stripe-set, if we can't
   manage that).

So hopefully most of this code can go away in the end -- although some
of it may be cannibalised to handle rebuilding after a disk replacement.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/btrfs/Kconfig   |    1 +
 fs/btrfs/volumes.c |  321 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 316 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c02..4703325 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,7 @@ config BTRFS_FS
     select LIBCRC32C
     select ZLIB_INFLATE
     select ZLIB_DEFLATE
+    select MD_RAID6_PQ
     help
       Btrfs is a new filesystem with extents, writable snapshotting,
       support for multiple devices and many more features.
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 55facd3..1f509ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/random.h>
 #include <linux/iocontext.h>
+#include <linux/raid/pq.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -61,6 +62,12 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                 struct btrfs_root *root,
                 struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+                 int async, struct btrfs_multi_bio *multi,
+                 u64 *raid_map, u64 stripe_len, int mirror_num);
+static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+                   int async, struct btrfs_multi_bio *multi,
+                   u64 *raid_map, u64 stripe_len);
 
 #define RAID5_P_STRIPE ((u64)-1)
 #define RAID6_Q_STRIPE ((u64)-2)
@@ -3052,6 +3059,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
     u64 logical = (u64)bio->bi_sector << 9;
     u64 length = 0;
     u64 map_length;
+    u64 *raid_map = NULL;
     struct btrfs_multi_bio *multi = NULL;
     int ret;
     int dev_nr = 0;
@@ -3061,10 +3069,25 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
     map_tree = &root->fs_info->mapping_tree;
     map_length = length;
 
-    ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
-                  mirror_num);
+    ret = __btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+                mirror_num, NULL, &raid_map);
     BUG_ON(ret);
 
+    multi->end_io = first_bio->bi_end_io;
+    multi->private = first_bio->bi_private;
+    multi->orig_bio = first_bio;
+    atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+    if (raid_map) {
+        if (rw == READ)
+            return raid56_parity_recover(root, bio, async_submit,
+                             multi, raid_map, map_length,
+                             mirror_num);
+        else
+            return raid56_parity_write(root, bio, async_submit, multi,
+                           raid_map, map_length);
+    }
+
     total_devs = multi->num_stripes;
     if (map_length < length) {
         printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
@@ -3073,10 +3096,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                (unsigned long long)map_length);
         BUG();
     }
-    multi->end_io = first_bio->bi_end_io;
-    multi->private = first_bio->bi_private;
-    multi->orig_bio = first_bio;
-    atomic_set(&multi->stripes_pending, multi->num_stripes);
 
     while (dev_nr < total_devs) {
         if (total_devs > 1) {
@@ -3494,3 +3513,293 @@ error:
     btrfs_free_path(path);
     return ret;
 }
+
+static DEFINE_MUTEX(raid_hack_mutex);
+
+struct btrfs_raid_multi_bio {
+    struct btrfs_root *root;
+    struct btrfs_multi_bio *multi;
+    u64 *raid_map;
+    struct bio *bio[0];
+};
+
+static void raid_write_end_io(struct bio *bio, int err)
+{
+    struct btrfs_raid_multi_bio *rmult = bio->bi_private;
+    int i, j;
+    int nr_pages = rmult->multi->orig_bio->bi_size >> PAGE_SHIFT;
+
+    if (err)
+        atomic_inc(&rmult->multi->error);
+
+    if (!atomic_dec_and_test(&rmult->multi->stripes_pending))
+        return;
+
+    /* OK, we have read all the stripes we need to. */
+    if (atomic_read(&rmult->multi->error)) {
+        bio_endio(rmult->multi->orig_bio, -EIO);
+        goto cleanup;
+    }
+
+    rmult->multi->orig_bio->bi_private = rmult->multi->private;
+    rmult->multi->orig_bio->bi_end_io = rmult->multi->end_io;
+    bio_endio(rmult->multi->orig_bio, 0);
+
+ cleanup:
+    for (i = 0; i < rmult->multi->num_stripes; i++) {
+        if (!rmult->bio[i])
+            continue;
+        for (j = 0; j < nr_pages; j++) {
+            __free_page(rmult->bio[i]->bi_io_vec[j].bv_page);
+        }
+        bio_put(rmult->bio[i]);
+    }
+    kfree(rmult->raid_map);
+    kfree(rmult->multi);
+    kfree(rmult);
+    mutex_unlock(&raid_hack_mutex);
+}
+
+static void raid_read_end_io(struct bio *bio, int err)
+{
+    struct btrfs_raid_multi_bio *rmult = bio->bi_private;
+    int nr_pages = rmult->multi->orig_bio->bi_size >> PAGE_SHIFT;
+    int i, j, k;
+    void **pointers;
+    void *q_ptr = NULL, *p_ptr;
+    int dstripe, pstripe, qstripe;
+
+    if (err)
+        atomic_inc(&rmult->multi->error);
+
+    if (!atomic_dec_and_test(&rmult->multi->stripes_pending))
+        return;
+
+    /* OK, we have read all the stripes we need to. */
+    if (atomic_read(&rmult->multi->error)) {
+        bio_endio(rmult->multi->orig_bio, -EIO);
+        goto cleanup;
+    }
+
+    pointers = kmalloc(rmult->multi->num_stripes * sizeof(void *), GFP_ATOMIC);
+    BUG_ON(!pointers); /* FIXME */
+
+    for (i = 0; i < nr_pages; i++) {
+        p_ptr = q_ptr = NULL;
+        k = 0;
+        for (j = 0; j < rmult->multi->num_stripes; j++) {
+            struct bio *bio = rmult->bio[j];
+            if (!bio)
+                bio = rmult->multi->orig_bio;
+
+            /* Is this always a valid assumption? */
+            BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
+            BUG_ON(bio->bi_io_vec[i].bv_offset);
+
+            /* FIXME: Would be nice to kmap here so that we can allow highmem
+               pages, but since we're in end_io context it would need to be
+               kmap_atomic, and there are an arbitrary number of pages... */
+            if (rmult->raid_map[j] == RAID5_P_STRIPE)
+                p_ptr = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+            else if (rmult->raid_map[j] == RAID6_Q_STRIPE)
+                q_ptr = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+            else
+                pointers[k++] = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+        }
+        if (q_ptr) {
+            pointers[k++] = p_ptr;
+            pointers[k++] = q_ptr;
+            BUG_ON(k != j);
+
+            raid6_call.gen_syndrome(rmult->multi->num_stripes,
+                        PAGE_SIZE, pointers);
+        } else {
+            memset(p_ptr, 0, PAGE_SIZE);
+            for (j = 0; j < PAGE_SIZE; j += sizeof(unsigned long)) {
+                for (k = 0; k < rmult->multi->num_stripes - 1; k++) {
+                    *(unsigned long *)(p_ptr + j) ^=
+                        *(unsigned long *)(pointers[k] + j);
+                }
+            }
+        }
+        /* kunmap pages here */
+    }
+    kfree(pointers);
+
+    /* Now, submit the P and Q blocks as well as the original write */
+    /* Find which data stripe the original write goes to */
+
+    dstripe = pstripe = qstripe = -1;
+    for (i = 0; i < rmult->multi->num_stripes; i++) {
+        if (!rmult->bio[i])
+            dstripe = i;
+        else if (rmult->raid_map[i] == RAID5_P_STRIPE)
+            pstripe = i;
+        else if (rmult->raid_map[i] == RAID6_Q_STRIPE)
+            qstripe = i;
+    }
+
+    atomic_set(&rmult->multi->stripes_pending, (qstripe == -1)?2:3);
+
+    rmult->bio[pstripe]->bi_sector = rmult->multi->stripes[pstripe].physical >> 9;
+    rmult->bio[pstripe]->bi_private = rmult;
+    rmult->bio[pstripe]->bi_end_io = raid_write_end_io;
+    schedule_bio(rmult->root, rmult->multi->stripes[pstripe].dev, WRITE,
+             rmult->bio[pstripe]);
+
+    if (qstripe != -1) {
+        rmult->bio[qstripe]->bi_sector = rmult->multi->stripes[qstripe].physical >> 9;
+        rmult->bio[qstripe]->bi_private = rmult;
+        rmult->bio[qstripe]->bi_end_io = raid_write_end_io;
+        schedule_bio(rmult->root, rmult->multi->stripes[qstripe].dev, WRITE,
+                 rmult->bio[qstripe]);
+    }
+
+    rmult->multi->orig_bio->bi_sector = rmult->multi->stripes[dstripe].physical >> 9;
+    rmult->multi->orig_bio->bi_private = rmult;
+    rmult->multi->orig_bio->bi_end_io = raid_write_end_io;
+    schedule_bio(rmult->root, rmult->multi->stripes[dstripe].dev, WRITE,
+             rmult->multi->orig_bio);
+
+    return;
+
+ cleanup:
+    for (i = 0; i < rmult->multi->num_stripes; i++) {
+        if (!rmult->bio[i])
+            continue;
+        for (j = 0; j < nr_pages; j++) {
+            __free_page(rmult->bio[i]->bi_io_vec[j].bv_page);
+        }
+        bio_put(rmult->bio[i]);
+    }
+    kfree(rmult->raid_map);
+    kfree(rmult->multi);
+    kfree(rmult);
+    mutex_unlock(&raid_hack_mutex);
+}
+
+static struct bio *alloc_raid_stripe_bio(struct btrfs_bio_stripe *stripe,
+                     u64 len)
+{
+    struct bio *bio;
+
+    bio = bio_alloc(GFP_NOFS, len >> PAGE_SHIFT ?: 1);
+    if (!bio)
+        return NULL;
+
+    bio->bi_size = 0;
+    bio->bi_bdev = stripe->dev->bdev;
+    bio->bi_sector = stripe->physical >> 9;
+
+    while (len) {
+        int this = min_t(u64, len, PAGE_SIZE);
+        struct page *page = alloc_page(GFP_NOFS);
+        if (!page || bio_add_page(bio, page, this, 0) < this) {
+            if (page)
+                __free_page(page);
+            /* FIXME free pages attached to it */
+            bio_put(bio);
+            return NULL;
+        }
+        len -= this;
+    }
+    return bio;
+}
+
+static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
+                   int async, struct btrfs_multi_bio *multi,
+                   u64 *raid_map, u64 stripe_len)
+{
+    int i;
+    int start_ofs, end_ofs;
+    int stripes_to_read = 0;
+    u64 logical = (u64)bio->bi_sector << 9;
+    struct btrfs_raid_multi_bio *rmult;
+
+    rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
+            GFP_NOFS);
+    if (!rmult) {
+        kfree(raid_map);
+        kfree(multi);
+        return -ENOMEM;
+    }
+    rmult->multi = multi;
+    rmult->raid_map = raid_map;
+    rmult->root = root;
+
+    /*
+     * FIXME: the merge_bio_hook logic currently ensures that writes only
+     * cover one stripe, meaning we _always_ have to read the other data
+     * stripes to generate the parity (unless we have the minimum number
+     * of disks). We want to avoid that read/modify/write cycle somehow by
+     * ensuring that the file system submits writes that cover a full
+     * stripe-set.
+     *
+     * When we achieve that, we'll want to split the original write bio
+     * into its individual stripes here (which means that the parity
+     * calculation code doesn't have to be _completely_ rewritten.)
+     *
+     * And we can ditch this mutex too:
+     */
+    mutex_lock(&raid_hack_mutex);
+
+    /* What subrange of the stripe are we writing? */
+    start_ofs = do_div(logical, stripe_len);
+    end_ofs = start_ofs + bio->bi_size;
+    BUG_ON(end_ofs > stripe_len);
+
+    /* Allocate bios for reading and for the parity and q-stripe writes */
+    logical = (u64)bio->bi_sector << 9;
+    for (i = 0; i < multi->num_stripes; i++) {
+        if (start_ofs) {
+            if (!is_parity_stripe(raid_map[i]))
+                raid_map[i] += start_ofs;
+            multi->stripes[i].physical += start_ofs;
+        }
+        if (raid_map[i] == logical) {
+            /* Set the correct bdev for the original write bio */
+            bio->bi_bdev = multi->stripes[i].dev->bdev;
+        } else {
+            rmult->bio[i] = alloc_raid_stripe_bio(&multi->stripes[i],
+                                  bio->bi_size);
+            BUG_ON(!rmult->bio[i]); /* FIXME */
+            rmult->bio[i]->bi_private = rmult;
+
+            if (!is_parity_stripe(raid_map[i]))
+                stripes_to_read++;
+        }
+    }
+    if (!stripes_to_read) {
+        /* Nothing to read -- just calculate parity and write it all */
+        atomic_set(&multi->stripes_pending, 1);
+        bio->bi_private = rmult;
+        raid_read_end_io(bio, 0);
+        return 0;
+    }
+
+    atomic_set(&multi->stripes_pending, stripes_to_read);
+    for (i = 0; i < multi->num_stripes; i++) {
+        if (rmult->bio[i] && !is_parity_stripe(raid_map[i])) {
+            rmult->bio[i]->bi_end_io = raid_read_end_io;
+            if (async)
+                schedule_bio(root, multi->stripes[i].dev, READ, rmult->bio[i]);
+            else
+                submit_bio(READ, rmult->bio[i]);
+        }
+    }
+    return 0;
+}
+
+static int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
+                 int async, struct btrfs_multi_bio *multi,
+                 u64 *raid_map, u64 stripe_len, int mirror_num)
+{
+    WARN_ON(1);
+    kfree(multi);
+    kfree(raid_map);
+    bio_endio(bio, -EIO);
+    return 0;
+}
--
1.6.2.2

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
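The parity maths in raid_read_end_io() above is plain XOR across the
data stripes. A self-contained illustration of the same computation --
the tiny stripe size and fill bytes are arbitrary example values:

#include <stdio.h>
#include <string.h>

#define STRIPE 16   /* tiny stripe, purely for illustration */

int main(void)
{
    unsigned char d0[STRIPE], d1[STRIPE], d2[STRIPE], p[STRIPE];
    int i;

    memset(d0, 0xaa, STRIPE);
    memset(d1, 0x55, STRIPE);
    memset(d2, 0x0f, STRIPE);

    /* P = d0 ^ d1 ^ d2, like the memset()+XOR loop in the end_io path */
    memset(p, 0, STRIPE);
    for (i = 0; i < STRIPE; i++)
        p[i] ^= d0[i] ^ d1[i] ^ d2[i];

    printf("P[0] = 0x%02x\n", p[0]);    /* 0xaa ^ 0x55 ^ 0x0f = 0xf0 */
    return 0;
}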
On Mon, 2009-07-13 at 11:05 +0100, David Woodhouse wrote:
>
> This hack serves two purposes:
>  - It does actually write parity (and RAID6 syndrome) blocks so that I
>    can implement and test the recovery.

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1f509ab..a23510b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3792,14 +3792,193 @@ static int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
     return 0;
 }
 
+static void raid_recover_end_io(struct bio *bio, int err)
+{
+    struct btrfs_raid_multi_bio *rmult = bio->bi_private;
+    int nr_pages = rmult->multi->orig_bio->bi_size >> PAGE_SHIFT;
+    int i, j, k;
+    void **pointers;
+    void *q_ptr = NULL, *p_ptr;
+    int faila = -1, failb = -1;
+
+    if (err)
+        atomic_inc(&rmult->multi->error);
+
+    if (!atomic_dec_and_test(&rmult->multi->stripes_pending))
+        return;
+
+    /* OK, we have read all the stripes we need to. */
+    if (atomic_read(&rmult->multi->error) > rmult->multi->max_errors - 1) {
+        bio_endio(rmult->multi->orig_bio, -EIO);
+        goto cleanup;
+    }
+
+    pointers = kmalloc(rmult->multi->num_stripes * sizeof(void *), GFP_ATOMIC);
+    if (!pointers) {
+        bio_endio(rmult->multi->orig_bio, -EIO);
+        goto cleanup;
+    }
+
+    for (i = 0; i < nr_pages; i++) {
+        p_ptr = q_ptr = NULL;
+        k = 0;
+        for (j = 0; j < rmult->multi->num_stripes; j++) {
+            struct bio *bio = rmult->bio[j];
+            if (!bio) {
+                if (rmult->raid_map[j] == RAID6_Q_STRIPE)
+                    continue;
+                bio = rmult->multi->orig_bio;
+                faila = j;
+            } else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                /* We counted the errors. There can be only one */
+                BUG_ON(failb != -1);
+                if (rmult->raid_map[j] == RAID6_Q_STRIPE) {
+                    /* Eep. Can't recover from this. Theoretically if the only
+                       failure is the Q stripe and the original data we're trying
+                       to read, then parity should have recovered it. But we'd
+                       only get here if that was broken _too_ */
+                    bio_endio(rmult->multi->orig_bio, -EIO);
+                    kfree(pointers);
+                    goto cleanup;
+                } else if (rmult->raid_map[j] == RAID5_P_STRIPE) {
+                    failb = -2;
+                } else {
+                    failb = j;
+                }
+            }
+
+            /* Is this always a valid assumption? */
+            BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
+            BUG_ON(bio->bi_io_vec[i].bv_offset);
+
+            /* FIXME: Would be nice to kmap here so that we can allow highmem
+               pages, but since we're in end_io context it would need to be
+               kmap_atomic, and there are an arbitrary number of pages... */
+            if (rmult->raid_map[j] == RAID5_P_STRIPE)
+                p_ptr = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+            else if (rmult->raid_map[j] == RAID6_Q_STRIPE)
+                q_ptr = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+            else
+                pointers[k++] = phys_to_virt(page_to_phys(bio->bi_io_vec[i].bv_page));
+        }
+        pointers[k++] = p_ptr;
+
+        if (q_ptr) {
+            pointers[k++] = q_ptr;
+            BUG_ON(k != j);
+
+            if (failb == -1) {
+                /*
+                 * Eep. We don't _have_ a second failure, so parity really
+                 * _should_ have worked. One of the stripes must be _corrupted_
+                 * rather than unreadable, which is a problem for us -- we have
+                 * no way of knowing which one. Theoretically, we could increase
+                 * the value of btrfs_num_copies() to let the upper layers try
+                 * _all_ possible combinations until it finds one that looks OK?
+                 */
+                failb = -2;
+            }
+            if (failb == -2) {
+                raid6_datap_recov(rmult->multi->num_stripes, PAGE_SIZE, faila, pointers);
+            } else {
+                if (faila > failb) {
+                    int tmp = failb;
+                    failb = faila;
+                    faila = tmp;
+                }
+                raid6_2data_recov(rmult->multi->num_stripes, PAGE_SIZE, faila, failb, pointers);
+            }
+        } else {
+            memcpy(pointers[faila], p_ptr, PAGE_SIZE);
+            for (k = 0; pointers[k] != p_ptr; k++) {
+                if (k == faila)
+                    continue;
+                for (j = 0; j < PAGE_SIZE; j += sizeof(unsigned long)) {
+                    *(unsigned long *)(pointers[faila] + j) ^=
+                        *(unsigned long *)(pointers[k] + j);
+                }
+            }
+        }
+        /* kunmap pages here */
+    }
+    kfree(pointers);
+
+    rmult->multi->orig_bio->bi_size = 0;
+    bio_endio(rmult->multi->orig_bio, 0);
+    return;
+
+ cleanup:
+    for (i = 0; i < rmult->multi->num_stripes; i++) {
+        if (!rmult->bio[i])
+            continue;
+        for (j = 0; j < nr_pages; j++) {
+            __free_page(rmult->bio[i]->bi_io_vec[j].bv_page);
+        }
+        bio_put(rmult->bio[i]);
+    }
+    kfree(rmult->raid_map);
+    kfree(rmult->multi);
+    kfree(rmult);
+}
+
 static int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
                  int async, struct btrfs_multi_bio *multi,
                  u64 *raid_map, u64 stripe_len, int mirror_num)
 {
-    WARN_ON(1);
-    kfree(multi);
-    kfree(raid_map);
-    bio_endio(bio, -EIO);
+    int i;
+    int start_ofs, end_ofs;
+    int stripes_to_read = 0;
+    u64 logical = (u64)bio->bi_sector << 9;
+    struct btrfs_raid_multi_bio *rmult;
+
+    rmult = kzalloc(sizeof(*rmult) + multi->num_stripes * sizeof(void *),
+            GFP_NOFS);
+    if (!rmult) {
+        kfree(raid_map);
+        kfree(multi);
+        return -ENOMEM;
+    }
+    rmult->multi = multi;
+    rmult->raid_map = raid_map;
+    rmult->root = root;
+
+    /* What subrange of the stripe are we reading? */
+    start_ofs = do_div(logical, stripe_len);
+    end_ofs = start_ofs + bio->bi_size;
+    BUG_ON(end_ofs > stripe_len);
+
+    /* Allocate bios for reading all the other stripes */
+    logical = (u64)bio->bi_sector << 9;
+    for (i = 0; i < multi->num_stripes; i++) {
+        if (start_ofs) {
+            if (!is_parity_stripe(raid_map[i]))
+                raid_map[i] += start_ofs;
+            multi->stripes[i].physical += start_ofs;
+        }
+        /* Don't read the original data block, of course. And
+           don't read the Q stripe if we're asked for mirror #2
+           (which means recreate from parity) */
+        if (raid_map[i] != logical &&
+            (raid_map[i] != RAID6_Q_STRIPE || mirror_num == 3)) {
+            rmult->bio[i] = alloc_raid_stripe_bio(&multi->stripes[i],
+                                  bio->bi_size);
+            BUG_ON(!rmult->bio[i]); /* FIXME */
+            rmult->bio[i]->bi_private = rmult;
+            rmult->bio[i]->bi_end_io = raid_recover_end_io;
+            stripes_to_read++;
+        }
+    }
+
+    atomic_set(&multi->stripes_pending, stripes_to_read);
+    for (i = 0; i < multi->num_stripes; i++) {
+        if (rmult->bio[i]) {
+            if (async)
+                schedule_bio(root, multi->stripes[i].dev, READ, rmult->bio[i]);
+            else
+                submit_bio(READ, rmult->bio[i]);
+        }
+    }
     return 0;
 }
--
1.6.2.5

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
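The RAID5 leg of the recovery above (the else branch, with no Q stripe)
reduces to XORing the parity block with the surviving data stripes. A
minimal sketch of that reconstruction, with made-up sizes and contents:

#include <assert.h>
#include <string.h>

#define STRIPE 16

int main(void)
{
    unsigned char d0[STRIPE], d1[STRIPE], p[STRIPE], rebuilt[STRIPE];
    int i;

    memset(d0, 0x12, STRIPE);
    memset(d1, 0x34, STRIPE);
    for (i = 0; i < STRIPE; i++)
        p[i] = d0[i] ^ d1[i];       /* parity as originally written */

    /* pretend d1 is unreadable: memcpy the parity, then XOR in the
     * surviving data stripe, as raid_recover_end_io() does */
    memcpy(rebuilt, p, STRIPE);
    for (i = 0; i < STRIPE; i++)
        rebuilt[i] ^= d0[i];

    assert(memcmp(rebuilt, d1, STRIPE) == 0);
    return 0;
}

The two-failure RAID6 cases go through raid6_datap_recov() and
raid6_2data_recov() from the kernel's raid6 library instead, since they
need the GF(2^8) arithmetic rather than plain XOR.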
On Sat, 2009-07-11 at 15:40 +0100, David Woodhouse wrote:
> On Sat, 2009-07-11 at 15:39 +0100, David Woodhouse wrote:
> > This is a preliminary attempt to add RAID5 and RAID6 support.
>
> Matching btrfs-progs patch...

And this makes it actually write the P and Q stripes... These patches
are at git://, http://git.infradead.org/users/dwmw2/btrfs-progs-raid56.git

I can now make a 4-disk RAID6 file system, copy some stuff to it, then
kick out two of the disks and use it in degraded mode, and everything
seems to work fine.

diff --git a/Makefile b/Makefile
index 8097b5a..2d8d349 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -g -Werror -Os
 objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
       root-tree.o dir-item.o file-item.o inode-item.o \
       inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \
-      volumes.o utils.o
+      volumes.o utils.o raid6.o
 
 #
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/disk-io.c b/disk-io.c
index addebe1..c33c31b 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -138,7 +138,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
     dev_nr = 0;
     length = blocksize;
     ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                  bytenr, &length, &multi, 0);
+                  bytenr, &length, &multi, 0, NULL);
     BUG_ON(ret);
     device = multi->stripes[0].dev;
     device->total_ios++;
@@ -196,7 +196,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
     length = blocksize;
     while (1) {
         ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                      eb->start, &length, &multi, mirror_num);
+                      eb->start, &length, &multi, mirror_num,
+                      NULL);
         BUG_ON(ret);
         device = multi->stripes[0].dev;
         eb->fd = device->fd;
@@ -224,12 +225,93 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
     return NULL;
 }
 
+static int write_raid56_with_parity(struct extent_buffer *eb,
+                    struct btrfs_multi_bio *multi,
+                    u64 stripe_len, u64 *raid_map)
+{
+    struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+    u64 start_ofs, end_ofs;
+    int i, j;
+    int ret;
+
+    start_ofs = eb->start % stripe_len;
+    end_ofs = start_ofs + eb->len;
+    BUG_ON(end_ofs > stripe_len);
+
+    j = 0;
+    for (i = 0; i < multi->num_stripes; i++) {
+        struct extent_buffer *new_eb;
+        if (start_ofs) {
+            multi->stripes[i].physical += start_ofs;
+            if (raid_map[i] != (u64)-1 && raid_map[i] != (u64)-2)
+                raid_map[i] += start_ofs;
+        }
+        if (raid_map[i] == eb->start) {
+            eb->dev_bytenr = multi->stripes[i].physical;
+            eb->fd = multi->stripes[i].dev->fd;
+            multi->stripes[i].dev->total_ios++;
+            ebs[j++] = eb;
+            continue;
+        }
+        new_eb = kmalloc(sizeof(*eb) + eb->len, GFP_NOFS);
+        BUG_ON(!new_eb);
+        new_eb->dev_bytenr = multi->stripes[i].physical;
+        new_eb->fd = multi->stripes[i].dev->fd;
+        multi->stripes[i].dev->total_ios++;
+        new_eb->len = eb->len;
+
+        if (raid_map[i] == (u64)-1) {
+            p_eb = new_eb;
+        } else if (raid_map[i] == (u64)-2) {
+            q_eb = new_eb;
+        } else {
+            ret = read_extent_from_disk(new_eb);
+            BUG_ON(ret);
+            ebs[j++] = new_eb;
+        }
+    }
+    ebs[j++] = p_eb;
+    if (q_eb) {
+        void *pointers[multi->num_stripes];
+
+        ebs[j++] = q_eb;
+
+        for (i = 0; i < multi->num_stripes; i++)
+            pointers[i] = ebs[i]->data;
+
+        raid6_gen_syndrome(multi->num_stripes, eb->len, pointers);
+
+        ret = write_extent_to_disk(q_eb);
+        BUG_ON(ret);
+    } else {
+        memcpy(p_eb->data, ebs[0]->data, eb->len);
+        for (j = 1; j < multi->num_stripes - 1; j++) {
+            for (i = 0; i < eb->len; i += sizeof(unsigned long)) {
+                *(unsigned long *)(p_eb->data + i) ^=
+                    *(unsigned long *)(ebs[j]->data + i);
+            }
+        }
+    }
+
+    ret = write_extent_to_disk(p_eb);
+    BUG_ON(ret);
+
+    ret = write_extent_to_disk(eb);
+    BUG_ON(ret);
+
+    for (i = 0; i < multi->num_stripes; i++)
+        if (ebs[i] != eb)
+            kfree(ebs[i]);
+
+    return 0;
+}
+
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
              struct extent_buffer *eb)
 {
     int ret;
     int dev_nr;
     u64 length;
+    u64 *raid_map = NULL;
     struct btrfs_multi_bio *multi = NULL;
 
     if (check_tree_block(root, eb))
@@ -243,9 +325,12 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
     dev_nr = 0;
     length = eb->len;
     ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
-                  eb->start, &length, &multi, 0);
+                  eb->start, &length, &multi, 0, &raid_map);
 
-    while(dev_nr < multi->num_stripes) {
+    if (raid_map) {
+        ret = write_raid56_with_parity(eb, multi, length, raid_map);
+        BUG_ON(ret);
+    } else while (dev_nr < multi->num_stripes) {
         BUG_ON(ret);
         eb->fd = multi->stripes[dev_nr].dev->fd;
         eb->dev_bytenr = multi->stripes[dev_nr].physical;
diff --git a/disk-io.h b/disk-io.h
index 49e5692..546649f 100644
--- a/disk-io.h
+++ b/disk-io.h
@@ -76,3 +76,6 @@ int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
             int verify);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 #endif
+
+/* raid6.c */
+void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);
diff --git a/raid6.c b/raid6.c
new file mode 100644
index 0000000..2ba9d90
--- /dev/null
+++ b/raid6.c
@@ -0,0 +1,105 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6int1.c
+ *
+ * 1-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file was postprocessed using unroll.pl and then ported to userspace
+ */
+#include <stdint.h>
+#include <unistd.h>
+
+/*
+ * This is the C data type to use
+ */
+
+/* Change this from BITS_PER_LONG if there is something better... */
+#if BITS_PER_LONG == 64
+# define NBYTES(x) ((x) * 0x0101010101010101UL)
+# define NSIZE  8
+# define NSHIFT 3
+typedef uint64_t unative_t;
+#else
+# define NBYTES(x) ((x) * 0x01010101U)
+# define NSIZE  4
+# define NSHIFT 2
+typedef uint32_t unative_t;
+#endif
+
+#ifdef __GNUC__
+#define __attribute_const__ __attribute__((const))
+#else
+#define __attribute_const__
+#endif
+
+/*
+ * These sub-operations are separate inlines since they can sometimes be
+ * specially optimized using architecture-specific hacks.
+ */
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+    unative_t vv;
+
+    vv = (v << 1) & NBYTES(0xfe);
+    return vv;
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+    unative_t vv;
+
+    vv = v & NBYTES(0x80);
+    vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
+    return vv;
+}
+
+void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+    uint8_t **dptr = (uint8_t **)ptrs;
+    uint8_t *p, *q;
+    int d, z, z0;
+
+    unative_t wd0, wq0, wp0, w10, w20;
+
+    z0 = disks - 3;     /* Highest data disk */
+    p = dptr[z0+1];     /* XOR parity */
+    q = dptr[z0+2];     /* RS syndrome */
+
+    for ( d = 0 ; d < bytes ; d += NSIZE*1 ) {
+        wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE];
+        for ( z = z0-1 ; z >= 0 ; z-- ) {
+            wd0 = *(unative_t *)&dptr[z][d+0*NSIZE];
+            wp0 ^= wd0;
+            w20 = MASK(wq0);
+            w10 = SHLBYTE(wq0);
+            w20 &= NBYTES(0x1d);
+            w10 ^= w20;
+            wq0 = w10 ^ wd0;
+        }
+        *(unative_t *)&p[d+NSIZE*0] = wp0;
+        *(unative_t *)&q[d+NSIZE*0] = wq0;
+    }
+}
diff --git a/volumes.c b/volumes.c
index 90090b0..f146750 100644
--- a/volumes.c
+++ b/volumes.c
@@ -62,6 +62,12 @@ static inline int nr_data_stripes(struct map_lookup *map)
     return map->num_stripes - nr_parity_stripes(map);
 }
 
+
+#define RAID5_P_STRIPE ((u64)-1)
+#define RAID6_Q_STRIPE ((u64)-2)
+
+#define is_parity_stripe(x) ( ((x) == RAID5_P_STRIPE) || ((x) == RAID6_Q_STRIPE) )
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                 (sizeof(struct btrfs_bio_stripe) * (n)))
@@ -988,13 +994,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
             u64 logical, u64 *length,
-            struct btrfs_multi_bio **multi_ret, int mirror_num)
+            struct btrfs_multi_bio **multi_ret, int mirror_num,
+            u64 **raid_map_ret)
 {
     struct cache_extent *ce;
     struct map_lookup *map;
     u64 offset;
     u64 stripe_offset;
     u64 stripe_nr;
+    u64 *raid_map = NULL;
     int stripes_allocated = 8;
     int stripes_required = 1;
     int stripe_index;
@@ -1026,10 +1034,24 @@ again:
             stripes_required = map->sub_stripes;
         }
     }
+    if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
+        && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
+        /* RAID[56] write or recovery. Return all stripes */
+        stripes_required = map->num_stripes;
+
+        /* Only allocate the map if we've already got a large enough multi_ret */
+        if (stripes_allocated >= stripes_required) {
+            raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+            if (!raid_map) {
+                kfree(multi);
+                return -ENOMEM;
+            }
+        }
+    }
+
     /* if our multi bio struct is too small, back off and try again */
-    if (multi_ret && rw == WRITE &&
-        stripes_allocated < stripes_required) {
-        stripes_allocated = map->num_stripes;
+    if (multi_ret && stripes_allocated < stripes_required) {
+        stripes_allocated = stripes_required;
         kfree(multi);
         goto again;
     }
@@ -1094,17 +1116,39 @@ again:
         stripe_index = stripe_nr % nr_data_stripes(map);
         stripe_nr = stripe_nr / nr_data_stripes(map);
 
-        /*
-         * Mirror #0 or #1 means the original data block.
-         * Mirror #2 is RAID5 parity block.
-         * Mirror #3 is RAID6 Q block.
-         */
-        if (mirror_num > 1)
-            stripe_index = nr_data_stripes(map) + mirror_num - 2;
+        if (raid_map) {
+            int i, rot;
+            u64 tmp;
+
+            /* Work out the disk rotation on this stripe-set */
+            rot = stripe_nr % map->num_stripes;
+
+            /* Fill in the logical address of each stripe */
+            tmp = stripe_nr * nr_data_stripes(map);
+            for (i = 0; i < nr_data_stripes(map); i++)
+                raid_map[(i+rot) % map->num_stripes] =
+                    ce->start + (tmp + i) * map->stripe_len;
 
-        /* We distribute the parity blocks across stripes */
-        stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+            raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+            if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+                raid_map[(i+rot+1) % map->num_stripes] = RAID6_Q_STRIPE;
+
+            *length = map->stripe_len;
+            stripe_index = 0;
+            stripe_offset = 0;
+            multi->num_stripes = map->num_stripes;
+        } else {
+            /*
+             * Mirror #0 or #1 means the original data block.
+             * Mirror #2 is RAID5 parity block.
+             * Mirror #3 is RAID6 Q block.
+             */
+            if (mirror_num > 1)
+                stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+            /* We distribute the parity blocks across stripes */
+            stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+        }
     } else {
         /*
          * after this do_div call, stripe_nr is the number of stripes
@@ -1124,6 +1168,8 @@ again:
         stripe_index++;
     }
     *multi_ret = multi;
+    if (raid_map_ret)
+        *raid_map_ret = raid_map;
 out:
     return 0;
 }
diff --git a/volumes.h b/volumes.h
index bb78751..1e993db 100644
--- a/volumes.h
+++ b/volumes.h
@@ -98,7 +98,8 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                u64 num_bytes, u64 *start);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
             u64 logical, u64 *length,
-            struct btrfs_multi_bio **multi_ret, int mirror_num);
+            struct btrfs_multi_bio **multi_ret, int mirror_num,
+            u64 **raid_map);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
              u64 chunk_start, u64 physical, u64 devid,
              u64 **logical, int *naddrs, int *stripe_len);

--
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation
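A quick sanity check of raid6_gen_syndrome() above, showing how a unit
test might call it. The disk count and fill bytes are made-up example
values; the ptrs[] array is laid out data stripes first, then P, then Q,
exactly as the function expects:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);

int main(void)
{
    const int disks = 5;        /* 3 data + P + Q, for example */
    const size_t bytes = 4096;  /* must be a multiple of NSIZE */
    void *ptrs[5];
    int i;

    for (i = 0; i < disks; i++) {
        ptrs[i] = malloc(bytes);
        /* fill the data disks with 0x11, 0x22, 0x33; zero P and Q */
        memset(ptrs[i], i < disks - 2 ? 0x11 * (i + 1) : 0, bytes);
    }

    raid6_gen_syndrome(disks, bytes, ptrs);

    /* P is the plain XOR of the data: 0x11 ^ 0x22 ^ 0x33 == 0 here;
     * Q is the GF(2^8) Reed-Solomon syndrome over polynomial 0x1d */
    printf("P[0]=0x%02x Q[0]=0x%02x\n",
           ((unsigned char *)ptrs[disks - 2])[0],
           ((unsigned char *)ptrs[disks - 1])[0]);

    for (i = 0; i < disks; i++)
        free(ptrs[i]);
    return 0;
}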
Hello list,

for an internal demonstration system due in Q1/2011, I am very much
interested in setting up a btrfs fs with RAID5; RAID1 just won't cut it.
As can be read in the btrfs wiki [1], the 'magic' number for RAID5
support in btrfs is 2.6.37, which is just around the corner.

So my question is: Is this still the roadmap for the introduction of
RAID5 and RAID6, or has it been delayed somewhat? If so, what is the
current target kernel release for btrfs RAID5 support?

Thanks for the information!

Best regards
Martin

[1] https://btrfs.wiki.kernel.org/index.php/FAQ#Can_I_use_RAID.5B56.5D_on_my_Btrfs_filesystem.3F