Arne Jansen
2010-Dec-11 20:13 UTC
[PATCH] Fixing the chunk allocator to allow it to better utilize the devices
The chunk allocator had several small bugs that prevented it to use the devices fully in multi-device setups. find_free_dev_extent didn''t return max_avail correctly if the largest hole was at the end of the device. Also it disregarded alloc_start in certain situations. It has also been restructured for better readability. __btrs_alloc_chunk now uses the returned max_avail to determine the maximum chunk available for its again-loop, instead of the total amount of free space of the device, which might not be available in one piece. It also uses the devices in a round robin fashion, which help balancing the devices better. Signed-off-by: Arne Jansen <sensille@gmx.net> --- fs/btrfs/extent-tree.c | 4 +- fs/btrfs/volumes.c | 135 +++++++++++++++++++++++++----------------------- 2 files changed, 73 insertions(+), 66 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ddaf634..00d7240 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8049,7 +8049,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8057,7 +8057,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91851b5..0540a9d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -738,29 +738,33 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_start; + u64 hole_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; int slot = 0; - int start_found; struct extent_buffer *l; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 2; - start_found = 0; /* FIXME use last free of some kind */ /* we don''t want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = max((u64)1024 * 1024, root->fs_info->alloc_start); - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) - search_start = max(root->fs_info->alloc_start, search_start); + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + hole_start = search_start; + hole_end = search_end; key.objectid = device->devid; key.offset = search_start; @@ -772,8 +776,6 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) goto error; - if (ret > 0) - start_found = 1; } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); @@ -786,23 +788,8 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, continue; if (ret < 0) goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -810,43 +797,48 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > hole_start) { + hole_size = key.offset - hole_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size >= num_bytes) { + hole_end = key.offset; + break; } + + if (max_avail && hole_size > *max_avail) + *max_avail = hole_size; } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + hole_start = key.offset + btrfs_dev_extent_length(l, dev_extent); + if (hole_start < search_start) + hole_start = search_start; next: path->slots[0]++; cond_resched(); } -check_pending: /* we have to make sure we didn''t find an extent that has already * been allocated by the map tree or the original allocation */ - BUG_ON(*start < search_start); + BUG_ON(hole_start < search_start); + + hole_size = hole_end - hole_start; - if (*start + num_bytes > search_end) { + if (max_avail && hole_size > *max_avail) + *max_avail = hole_size; + + if (hole_start + num_bytes > search_end) { ret = -ENOSPC; goto error; } + + *start = hole_start; + /* check for pending inserts here */ ret = 0; @@ -2154,6 +2146,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_device *device = NULL; + struct btrfs_device *smallest_dev; struct btrfs_fs_devices *fs_devices = info->fs_devices; struct list_head *cur; struct map_lookup *map = NULL; @@ -2165,7 +2158,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_chunk_size = calc_size; u64 min_free; u64 avail; - u64 max_avail = 0; + u64 min_max_avail = 0; u64 dev_offset; int num_stripes = 1; int min_stripes = 1; @@ -2223,7 +2216,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size); again: - max_avail = 0; if (!map || map->num_stripes != num_stripes) { kfree(map); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2260,15 +2252,9 @@ again: else min_free = calc_size; - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we''ve looped, then we are likely allocating the maximum amount of - * space left already - */ - if (!looped) - min_free += 1024 * 1024; - INIT_LIST_HEAD(&private_devs); + min_max_avail = 0; + smallest_dev = NULL; while (index < num_stripes) { device = list_entry(cur, struct btrfs_device, dev_alloc_list); BUG_ON(!device->writeable); @@ -2278,10 +2264,20 @@ again: avail = 0; cur = cur->next; - if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + if (device->in_fs_metadata) { + u64 max_avail = 0; + ret = find_free_dev_extent(trans, device, min_free, + &dev_offset, &max_avail); + /* + * track the minimum stripe len that is available on all devices, + * but don''t count holes that are too small + */ + if (max_avail >= stripe_len * 4 && + (max_avail < min_max_avail || min_max_avail == 0)) { + min_max_avail = max_avail; + smallest_dev = device; + } + if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); @@ -2295,12 +2291,16 @@ again: index++; } } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } if (cur == &fs_devices->alloc_list) break; } - list_splice(&private_devs, &fs_devices->alloc_list); + /* + * splice the list of used devices to the end. This achieves + * a basic round-robin scheme which help balancing the devices + * better. + */ + list_splice_tail(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { num_stripes = index; @@ -2311,9 +2311,16 @@ again: looped = 1; goto again; } - if (!looped && max_avail > 0) { + if (!looped && min_max_avail > 0) { looped = 1; - calc_size = max_avail; + calc_size = min_max_avail; + /* + * move the smallest device to the head of the list to + * make sure it gets included in the chunk. + */ + BUG_ON(!smallest_dev); + list_move(&smallest_dev->dev_alloc_list, + &fs_devices->alloc_list); goto again; } kfree(map); -- 1.7.2.2 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html