David Woodhouse
2009-Aug-05 20:04 UTC
RAID[56] with arbitrary numbers of "parity" stripes.
We discussed using the top bits of the chunk type field to store a number of redundant disks -- so instead of RAID5, RAID6, etc., we end up with a single 'RAID56' flag, and the amount of redundancy is stored elsewhere. This attempts it, but I hate it and don't really want to do it. The type field is designed as a bitmask, and _used_ as a bitmask in a number of places -- I think it's ugly and fragile to do it this way (and degraded mounts aren't working for some reason I haven't chased down yet). I'd much prefer to stick with the separate bit flags for RAID5 and RAID6 (and RAID7, RAID8, or whatever we want to call the versions with 3, 4, or more redundant blocks). We have a 64-bit bitfield, after all -- we're not exactly short of bits even once we start doing RAID50, RAID60, etc... diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7326707..71dd726 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -655,8 +655,14 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) +#define BTRFS_BLOCK_GROUP_RAID56 (1 << 7) + +#define BTRFS_BLOCK_GROUP_USED_BITS 8 +/* For RAID5/RAID6, the top 8 bits indicate the number of spares + (1 for RAID5, 2 for RAID6, more once we get the arithmetic for it */ +#define BTRFS_BLOCK_GROUP_MASK (((u64)1 << 56) - 1) + +#define BTRFS_RAID56_MAX_SPARES 2 struct btrfs_block_group_item { __le64 used; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0cbf28e..fff73c4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2611,20 +2611,30 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { + u64 *avail = NULL; u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - 
BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID56 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP); if (extra_flags) { if (flags & BTRFS_BLOCK_GROUP_DATA) - fs_info->avail_data_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_METADATA) - fs_info->avail_metadata_alloc_bits |= extra_flags; - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - fs_info->avail_system_alloc_bits |= extra_flags; + avail = &fs_info->avail_data_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + avail = &fs_info->avail_metadata_alloc_bits; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + avail = &fs_info->avail_system_alloc_bits; + else BUG(); + + *avail |= extra_flags & BTRFS_BLOCK_GROUP_MASK; } + if (avail && extra_flags & BTRFS_BLOCK_GROUP_RAID56) { + u64 nr_spares = flags >> 56; + + if (nr_spares > *avail >> 56) + *avail = (*avail & BTRFS_BLOCK_GROUP_MASK) | + nr_spares << 56; + } } static void set_block_group_readonly(struct btrfs_block_group_cache *cache) @@ -2643,27 +2653,27 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache) u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; + u64 num_spares = flags >> 56; u64 tmp; /* First, mask out the RAID levels which aren''t possible */ if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; + BTRFS_BLOCK_GROUP_RAID56); if (num_devices < 4) flags &= ~BTRFS_BLOCK_GROUP_RAID10; tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID56 | + BTRFS_BLOCK_GROUP_RAID10 | ~BTRFS_BLOCK_GROUP_MASK); flags &= ~tmp; - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else 
if (tmp & BTRFS_BLOCK_GROUP_RAID10) + if (tmp & BTRFS_BLOCK_GROUP_RAID56) { + if (num_spares > num_devices - 1) + num_spares = num_devices - 1; + BUG_ON(!num_spares); + tmp = BTRFS_BLOCK_GROUP_RAID56 | (num_spares << 56); + } else if (tmp & BTRFS_BLOCK_GROUP_RAID10) tmp = BTRFS_BLOCK_GROUP_RAID10; else if (tmp & BTRFS_BLOCK_GROUP_RAID1) tmp = BTRFS_BLOCK_GROUP_RAID1; @@ -2691,7 +2701,6 @@ static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) info->metadata_alloc_profile; data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; } - return btrfs_reduce_alloc_profile(root, data); } @@ -3635,7 +3644,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, u64 exclude_start, u64 exclude_nr, - int data) + u64 data) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -6774,8 +6783,7 @@ out: static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { u64 num_devices; - u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; num_devices = root->fs_info->fs_devices->rw_devices; @@ -7284,6 +7292,47 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); + + if (!!(cache->flags & BTRFS_BLOCK_GROUP_DATA) + + !!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) + + !!(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) != 1) { + printk(KERN_ERR "btrfs block group has no storage type (%llx)\n", + cache->flags); + kfree(cache); + ret = -EINVAL; + goto error; + } +#if 1 /* Compat with old progs */ + if (cache->flags & BTRFS_BLOCK_GROUP_RAID56) { + int num_spares = cache->flags >> 56; + if (!num_spares) + cache->flags |= 1ULL<<56; + + } + + if (cache->flags & 
(BTRFS_BLOCK_GROUP_RAID56 << 1)) { + cache->flags &= ~(BTRFS_BLOCK_GROUP_RAID56 << 1); + cache->flags |= BTRFS_BLOCK_GROUP_RAID56 | 2ULL<<56; + } +#endif + if ((cache->flags & BTRFS_BLOCK_GROUP_MASK) >> BTRFS_BLOCK_GROUP_USED_BITS) { + printk(KERN_ERR "btrfs block group has unknown bits (%llx)\n", + cache->flags); + kfree(cache); + ret = -EINVAL; + goto error; + } + if (cache->flags & BTRFS_BLOCK_GROUP_RAID56) { + int num_spares = cache->flags >> 56; + if (!num_spares || num_spares > BTRFS_RAID56_MAX_SPARES) { + printk(KERN_ERR "btrfs RAID5/6 group has %d spares (flags %llx)\n", + num_spares, cache->flags); + kfree(cache); + ret = -EINVAL; + goto error; + } + } + cache->sectorsize = root->sectorsize; remove_sb_from_cache(root, cache); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 95babc1..28291cc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -45,12 +45,7 @@ struct map_lookup { static inline int nr_parity_stripes(struct map_lookup *map) { - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; + return map->type >> 56; } static inline int nr_data_stripes(struct map_lookup *map) @@ -1176,19 +1171,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid5\n"); - ret = -EINVAL; - goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - printk(KERN_ERR "btrfs: unable to go below three " - "devices on raid6\n"); - ret = -EINVAL; - goto out; + if (all_avail & BTRFS_BLOCK_GROUP_RAID56) { + int required_devs = max(root->fs_info->avail_data_alloc_bits >> 56, + max(root->fs_info->avail_system_alloc_bits >> 56, + root->fs_info->avail_metadata_alloc_bits >> 56)); + if (root->fs_info->fs_devices->rw_devices <= required_devs + 1) { + 
printk(KERN_ERR "btrfs: unable to go below %d " + "devices on raid5/raid6\n", required_devs + 1); + ret = -EINVAL; + goto out; + } } if (strcmp(device_path, "missing") == 0) { @@ -2142,10 +2134,8 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size; else if (type & BTRFS_BLOCK_GROUP_RAID10) return calc_size * (num_stripes / sub_stripes); - else if (type & BTRFS_BLOCK_GROUP_RAID5) - return calc_size * (num_stripes - 1); - else if (type & BTRFS_BLOCK_GROUP_RAID6) - return calc_size * (num_stripes - 2); + else if (type & BTRFS_BLOCK_GROUP_RAID56) + return calc_size * (num_stripes - (type >> 56)); else return calc_size * num_stripes; } @@ -2209,17 +2199,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = 2; min_stripes = 4; } - if (type & (BTRFS_BLOCK_GROUP_RAID5)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 2) - return -ENOSPC; - min_stripes = 2; - } - if (type & (BTRFS_BLOCK_GROUP_RAID6)) { + if (type & (BTRFS_BLOCK_GROUP_RAID56)) { num_stripes = fs_devices->rw_devices; - if (num_stripes < 3) + min_stripes = (type >> 56) + 1; + if (num_stripes < min_stripes) return -ENOSPC; - min_stripes = 3; } if (type & BTRFS_BLOCK_GROUP_DATA) { @@ -2609,10 +2593,8 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID5) - ret = 2; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - ret = 3; + else if (map->type & BTRFS_BLOCK_GROUP_RAID56) + ret = nr_parity_stripes(map); else ret = 1; free_extent_map(em); @@ -2734,8 +2716,8 @@ again: max_errors = 1; } } - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) - && multi_ret && (rw & (1 << BIO_RW) || mirror_num > 1) && raid_map_ret) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56 && multi_ret && + (rw & (1 << BIO_RW) || mirror_num > 1) && raid_map_ret) { /* RAID[56] write or 
recovery. Return all stripes */ stripes_required = map->num_stripes; max_errors = nr_parity_stripes(map); @@ -2770,8 +2752,7 @@ again: stripe_offset = offset - stripe_offset; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, @@ -2818,8 +2799,7 @@ again: current->pid % map->sub_stripes); } - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56) { u64 tmp; stripe_index = do_div(stripe_nr, nr_data_stripes(map)); @@ -2841,7 +2821,7 @@ again: em->start + (tmp + i) * map->stripe_len; raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) + if ((map->type >> 56) >= 2) raid_map[(i+rot+1) % map->num_stripes] = RAID6_Q_STRIPE; *length = map->stripe_len; @@ -2940,8 +2920,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + else if (map->type & BTRFS_BLOCK_GROUP_RAID56) { do_div(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> We discussed using the top bits of the chunk type field to store a
> number of redundant disks -- so instead of RAID5, RAID6, etc., we end up
> with a single 'RAID56' flag, and the amount of redundancy is stored
> elsewhere.

Is there any sort of timeline for RAID5/6 support in btrfs? I currently have 8 drives in a zfs-fuse RAIDZ2 (RAID6) configuration, and I'd love to see how btrfs compares to that, once it's ready. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Roy Sigurd Karlsbakk
2009-Aug-22 16:31 UTC
Re: RAID[56] with arbitrary numbers of "parity" stripes.
On 22. aug. 2009, at 17.34, tsuraan wrote:

>> We discussed using the top bits of the chunk type field to store a
>> number of redundant disks -- so instead of RAID5, RAID6, etc., we end up
>> with a single 'RAID56' flag, and the amount of redundancy is stored
>> elsewhere.
>
> Is there any sort of timeline for RAID5/6 support in btrfs? I
> currently have 8 drives in a zfs-fuse RAIDZ2 (RAID6) configuration,
> and I'd love to see how btrfs compares to that, once it's ready.

I think someone started doing RAID[56] (see threads "A start at RAID[56] support" and perhaps "Factor out RAID6 algorithms into lib/"). Seems something is in the works. By the way - how does FUSE ZFS work? Is it stable? Good performance? We're using ZFS natively on Solaris 10 now, perhaps moving the storage to opensolaris soon. roy -- Roy Sigurd Karlsbakk (+47) 97542685 roy@karlsbakk.net http://blogg.karlsbakk.net/ -- I all pedagogikk er det essensielt at pensum presenteres intelligibelt. Det er et elementært imperativ for alle pedagoger å unngå eksessiv anvendelse av idiomer med fremmed opprinnelse. I de fleste tilfeller eksisterer adekvate og relevante synonymer på norsk. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> By the way - how does FUSE ZFS work? Is it stable? Good performance?
> We're using ZFS natively on Solaris 10 now, perhaps moving the storage
> to opensolaris soon.

It's pretty stable; I wouldn't put anything on it that isn't backed up, but I guess that holds for any other filesystem. The speed isn't yet up to par with the ntfs-fuse project, but they're working on it. On my personal machine, I get 100MB/s writes and 250MB/s reads on an 8-drive RAID-Z2, but I don't know how that compares to anything else, since OpenSolaris won't boot on my hardware and I don't have the patience to wait for md to sync 8 TB worth of drives. I'm reasonably happy with zfs-fuse, but I'm definitely looking forward to seeing how it compares with a native filesystem that supports RAID6. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Reasonably Related Threads
- [RFC 03/12 RESEND PATCH] Btrfs: Reorder __btrfs_map_block to make code more efficient.
- [PATCH] Btrfs-progs: change the way mkfs picks raid profiles
- [PATCH] Btrfs: fix deadlock during allocating chunks
- [PATCH] btrfs: return EPERM in btrfs_rm_device()
- [RFC][PATCH 1/2] Btrfs: try to allocate new chunks with degenerated profile