v7 was a real mess -- I made any number of errors in rebasing it onto the 3.0 tree -- so I've skipped directly to v8. Thanks to David Sterba for pointing out all the problems. Changes since v6: rebased to 3.0-rc4. This series can also be pulled from the balance-management-v8 branch of http://git.darksatanic.net/repo/btrfs-kernel.git/ Hugo. Hugo Mills (8): btrfs: Balance progress monitoring btrfs: Cancel filesystem balance btrfs: Factor out enumeration of chunks to a separate function btrfs: Implement filtered balance ioctl btrfs: Balance filter for device ID btrfs: Balance filter for virtual address ranges btrfs: Replication-type information btrfs: Balance filter for physical device address fs/btrfs/ctree.h | 10 ++ fs/btrfs/disk-io.c | 2 + fs/btrfs/ioctl.c | 104 +++++++++++++- fs/btrfs/ioctl.h | 49 ++++++ fs/btrfs/super.c | 16 +-- fs/btrfs/volumes.c | 414 ++++++++++++++++++++++++++++++++++++---------------- fs/btrfs/volumes.h | 23 +++- 7 files changed, 482 insertions(+), 136 deletions(-) -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
This patch introduces a basic form of progress monitoring for balance operations, by counting the number of block groups remaining. The information is exposed to userspace by an ioctl. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ctree.h | 9 ++++++++ fs/btrfs/disk-io.c | 2 + fs/btrfs/ioctl.c | 34 +++++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 7 ++++++ fs/btrfs/volumes.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 106 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3006287..25aa3cf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -873,6 +873,11 @@ struct btrfs_block_group_cache { struct list_head cluster_list; }; +struct btrfs_balance_info { + u32 expected; + u32 completed; +}; + struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; @@ -1115,6 +1120,10 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* Keep track of any rebalance operations on this FS */ + spinlock_t balance_info_lock; + struct btrfs_balance_info *balance_info; }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ac8db5d..38f8fbc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1619,6 +1619,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->balance_info_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); @@ -1648,6 +1649,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->balance_info = NULL; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a3c4751..5ddf816 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2836,6 +2836,38 @@ static long 
btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +/* + * Return the current status of any balance operation + */ +long btrfs_ioctl_balance_progress( + struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_progress __user *user_dest) +{ + int ret = 0; + struct btrfs_ioctl_balance_progress dest; + + spin_lock(&fs_info->balance_info_lock); + if (!fs_info->balance_info) { + ret = -EINVAL; + goto error; + } + + dest.expected = fs_info->balance_info->expected; + dest.completed = fs_info->balance_info->completed; + + spin_unlock(&fs_info->balance_info_lock); + + if (copy_to_user(user_dest, &dest, + sizeof(struct btrfs_ioctl_balance_progress))) + return -EFAULT; + + return 0; + +error: + spin_unlock(&fs_info->balance_info_lock); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2881,6 +2913,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_BALANCE_PROGRESS: + return btrfs_ioctl_balance_progress(root->fs_info, argp); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea78..575b25f 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,11 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_ioctl_balance_progress { + __u32 expected; + __u32 completed; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -248,4 +253,6 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 32, \ + struct btrfs_ioctl_balance_progress) #endif diff --git a/fs/btrfs/volumes.c 
b/fs/btrfs/volumes.c index 1efa56e..4c0a386 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2026,6 +2026,7 @@ int btrfs_balance(struct btrfs_root *dev_root) struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_key found_key; + struct btrfs_balance_info *bal_info; if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; @@ -2036,6 +2037,20 @@ int btrfs_balance(struct btrfs_root *dev_root) mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; + bal_info = kmalloc( + sizeof(struct btrfs_balance_info), + GFP_NOFS); + if (!bal_info) { + ret = -ENOMEM; + goto error_no_status; + } + spin_lock(&dev_root->fs_info->balance_info_lock); + dev_root->fs_info->balance_info = bal_info; + bal_info->expected = -1; /* One less than actually counted, + because chunk 0 is special */ + bal_info->completed = 0; + spin_unlock(&dev_root->fs_info->balance_info_lock); + /* step one make some room on all the devices */ list_for_each_entry(device, devices, dev_list) { old_size = device->total_bytes; @@ -2059,10 +2074,37 @@ int btrfs_balance(struct btrfs_root *dev_root) btrfs_end_transaction(trans, dev_root); } - /* step two, relocate all the chunks */ + /* step two, count the chunks */ path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto error; + } + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret <= 0) { + printk(KERN_ERR "btrfs: Failed to find the last chunk.\n"); + BUG(); + } + + while (1) { + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + spin_lock(&dev_root->fs_info->balance_info_lock); + bal_info->expected++; + spin_unlock(&dev_root->fs_info->balance_info_lock); + } + + btrfs_release_path(path); + /* step three, relocate all the chunks */ key.objectid = 
BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -2100,10 +2142,20 @@ int btrfs_balance(struct btrfs_root *dev_root) found_key.offset); BUG_ON(ret && ret != -ENOSPC); key.offset = found_key.offset - 1; + spin_lock(&dev_root->fs_info->balance_info_lock); + bal_info->completed++; + spin_unlock(&dev_root->fs_info->balance_info_lock); + printk(KERN_INFO "btrfs: balance: %u/%u block groups completed\n", + bal_info->completed, bal_info->expected); } ret = 0; error: btrfs_free_path(path); + spin_lock(&dev_root->fs_info->balance_info_lock); + kfree(dev_root->fs_info->balance_info); + dev_root->fs_info->balance_info = NULL; + spin_unlock(&dev_root->fs_info->balance_info_lock); +error_no_status: mutex_unlock(&dev_root->fs_info->volume_mutex); return ret; } -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
This patch adds an ioctl for cancelling a btrfs balance operation mid-flight. The ioctl simply sets a flag, and the operation terminates after the current block group move has completed. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ctree.h | 1 + fs/btrfs/ioctl.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/ioctl.h | 1 + fs/btrfs/volumes.c | 7 ++++++- 4 files changed, 36 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 25aa3cf..5031085 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -876,6 +876,7 @@ struct btrfs_block_group_cache { struct btrfs_balance_info { u32 expected; u32 completed; + int cancel_pending; }; struct reloc_control; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ddf816..d4458d0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2868,6 +2868,32 @@ error: return ret; } +/* + * Cancel a running balance operation + */ +long btrfs_ioctl_balance_cancel(struct btrfs_fs_info *fs_info) +{ + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + spin_lock(&fs_info->balance_info_lock); + if (!fs_info->balance_info) { + err = -EINVAL; + goto error; + } + if (fs_info->balance_info->cancel_pending) { + err = -ECANCELED; + goto error; + } + fs_info->balance_info->cancel_pending = 1; + +error: + spin_unlock(&fs_info->balance_info_lock); + return err; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2915,6 +2941,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root->fs_info, argp); + case BTRFS_IOC_BALANCE_CANCEL: + return btrfs_ioctl_balance_cancel(root->fs_info); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 575b25f..edcbe61 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -255,4 +255,5 @@ struct 
btrfs_ioctl_balance_progress { struct btrfs_ioctl_fs_info_args) #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_progress) +#define BTRFS_IOC_BALANCE_CANCEL _IO(BTRFS_IOCTL_MAGIC, 33) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c0a386..f38b231 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2049,6 +2049,7 @@ int btrfs_balance(struct btrfs_root *dev_root) bal_info->expected = -1; /* One less than actually counted, because chunk 0 is special */ bal_info->completed = 0; + bal_info->cancel_pending = 0; spin_unlock(&dev_root->fs_info->balance_info_lock); /* step one make some room on all the devices */ @@ -2109,7 +2110,7 @@ int btrfs_balance(struct btrfs_root *dev_root) key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; - while (1) { + while (!bal_info->cancel_pending) { ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) goto error; @@ -2149,6 +2150,10 @@ int btrfs_balance(struct btrfs_root *dev_root) bal_info->completed, bal_info->expected); } ret = 0; + if (bal_info->cancel_pending) { + printk(KERN_INFO "btrfs: balance cancelled\n"); + ret = -EINTR; + } error: btrfs_free_path(path); spin_lock(&dev_root->fs_info->balance_info_lock); -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hugo Mills
2011-Jun-26 20:36 UTC
[PATCH v8 3/8] btrfs: Factor out enumeration of chunks to a separate function
The main balance function has two loops which are functionally identical in their looping mechanism, but which perform a different operation on the chunks they loop over. To avoid repeating code more than necessary, factor this loop out into a separate iterator function which takes a function parameter for the action to be performed. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/volumes.c | 174 +++++++++++++++++++++++++++++---------------------- 1 files changed, 99 insertions(+), 75 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f38b231..a81fd3c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2014,6 +2014,97 @@ static u64 div_factor(u64 num, int factor) return num; } +/* Define a type, and two functions which can be used for the two + * phases of the balance operation: one for counting chunks, and one + * for actually moving them. */ +typedef void (*balance_iterator_function)(struct btrfs_root *, + struct btrfs_balance_info *, + struct btrfs_path *, + struct btrfs_key *); + +static void balance_count_chunks(struct btrfs_root *chunk_root, + struct btrfs_balance_info *bal_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + spin_lock(&chunk_root->fs_info->balance_info_lock); + bal_info->expected++; + spin_unlock(&chunk_root->fs_info->balance_info_lock); +} + +static void balance_move_chunks(struct btrfs_root *chunk_root, + struct btrfs_balance_info *bal_info, + struct btrfs_path *path, + struct btrfs_key *key) +{ + int ret; + + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + key->objectid, + key->offset); + BUG_ON(ret && ret != -ENOSPC); + spin_lock(&chunk_root->fs_info->balance_info_lock); + bal_info->completed++; + spin_unlock(&chunk_root->fs_info->balance_info_lock); + printk(KERN_INFO "btrfs: balance: %u/%u block groups completed\n", + bal_info->completed, bal_info->expected); +} + +/* Iterate through all chunks, performing some function on each one. 
*/ +static int balance_iterate_chunks(struct btrfs_root *chunk_root, + struct btrfs_balance_info *bal_info, + balance_iterator_function iterator_fn) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (!bal_info->cancel_pending) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + break; + /* + * this shouldn''t happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + /* chunk zero is special */ + if (found_key.offset == 0) + break; + + /* Call the function to do the work for this chunk */ + btrfs_release_path(path); + iterator_fn(chunk_root, bal_info, path, &found_key); + + key.offset = found_key.offset - 1; + } + + btrfs_free_path(path); + return ret; +} + int btrfs_balance(struct btrfs_root *dev_root) { int ret; @@ -2021,11 +2112,8 @@ int btrfs_balance(struct btrfs_root *dev_root) struct btrfs_device *device; u64 old_size; u64 size_to_free; - struct btrfs_path *path; - struct btrfs_key key; struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; - struct btrfs_key found_key; struct btrfs_balance_info *bal_info; if (dev_root->fs_info->sb->s_flags & MS_RDONLY) @@ -2046,8 +2134,7 @@ int btrfs_balance(struct btrfs_root *dev_root) } spin_lock(&dev_root->fs_info->balance_info_lock); dev_root->fs_info->balance_info = bal_info; - bal_info->expected = -1; /* One less than actually counted, - because chunk 0 is special */ + bal_info->expected = 0; bal_info->completed = 0; bal_info->cancel_pending = 0; 
spin_unlock(&dev_root->fs_info->balance_info_lock); @@ -2076,86 +2163,23 @@ int btrfs_balance(struct btrfs_root *dev_root) } /* step two, count the chunks */ - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; + ret = balance_iterate_chunks(chunk_root, bal_info, + balance_count_chunks); + if (ret) goto error; - } - - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret <= 0) { - printk(KERN_ERR "btrfs: Failed to find the last chunk.\n"); - BUG(); - } - - while (1) { - ret = btrfs_previous_item(chunk_root, path, 0, - BTRFS_CHUNK_ITEM_KEY); - if (ret) - break; - - spin_lock(&dev_root->fs_info->balance_info_lock); - bal_info->expected++; - spin_unlock(&dev_root->fs_info->balance_info_lock); - } - - btrfs_release_path(path); /* step three, relocate all the chunks */ - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; - key.type = BTRFS_CHUNK_ITEM_KEY; - - while (!bal_info->cancel_pending) { - ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) - goto error; - - /* - * this shouldn''t happen, it means the last relocate - * failed - */ - if (ret == 0) - break; - - ret = btrfs_previous_item(chunk_root, path, 0, - BTRFS_CHUNK_ITEM_KEY); - if (ret) - break; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); - if (found_key.objectid != key.objectid) - break; - - /* chunk zero is special */ - if (found_key.offset == 0) - break; + ret = balance_iterate_chunks(chunk_root, bal_info, + balance_move_chunks); + if (ret) + goto error; - btrfs_release_path(path); - ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, - found_key.objectid, - found_key.offset); - BUG_ON(ret && ret != -ENOSPC); - key.offset = found_key.offset - 1; - spin_lock(&dev_root->fs_info->balance_info_lock); - bal_info->completed++; - spin_unlock(&dev_root->fs_info->balance_info_lock); - 
printk(KERN_INFO "btrfs: balance: %u/%u block groups completed\n", - bal_info->completed, bal_info->expected); - } ret = 0; if (bal_info->cancel_pending) { printk(KERN_INFO "btrfs: balance cancelled\n"); ret = -EINTR; } error: - btrfs_free_path(path); spin_lock(&dev_root->fs_info->balance_info_lock); kfree(dev_root->fs_info->balance_info); dev_root->fs_info->balance_info = NULL; -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
The filtered balance ioctl provides a facility to perform a balance operation on a subset of the chunks in the filesystem. This patch implements the base ioctl for this operation, and one filter type. The filter in this patch selects chunks on the basis of their chunk flags field, and can select any combination of bits set or unset. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ioctl.c | 42 +++++++++++++++++++++++++++++++++- fs/btrfs/ioctl.h | 27 ++++++++++++++++++++++ fs/btrfs/volumes.c | 64 +++++++++++++++++++++++++++++++++++++++++++++------ fs/btrfs/volumes.h | 6 ++++- 4 files changed, 129 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4458d0..3e577b8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2894,6 +2894,44 @@ error: return err; } +long btrfs_ioctl_balance(struct btrfs_root *dev_root, + struct btrfs_ioctl_balance_start __user *user_filters) +{ + int ret = 0; + struct btrfs_ioctl_balance_start *dest; + + dest = kmalloc(sizeof(struct btrfs_ioctl_balance_start), GFP_KERNEL); + if (!dest) + return -ENOMEM; + + if (copy_from_user(dest, user_filters, + sizeof(struct btrfs_ioctl_balance_start))) { + ret = -EFAULT; + goto error; + } + + /* Basic sanity checking: has the user requested anything outside + * the range we know about? 
*/ + if (dest->flags & ~BTRFS_BALANCE_FILTER_MASK) { + ret = -ENOTSUPP; + goto error; + } + + /* Do the balance */ + ret = btrfs_balance(dev_root, dest); + if (ret) + goto error; + + if (copy_to_user(user_filters, dest, + sizeof(struct btrfs_ioctl_balance_start))) { + ret = -EFAULT; + } + +error: + kfree(dest); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2938,11 +2976,13 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: - return btrfs_balance(root->fs_info->dev_root); + return btrfs_ioctl_balance(root->fs_info->dev_root, NULL); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(root->fs_info, argp); case BTRFS_IOC_BALANCE_CANCEL: return btrfs_ioctl_balance_cancel(root->fs_info); + case BTRFS_IOC_BALANCE_FILTERED: + return btrfs_ioctl_balance(root->fs_info->dev_root, argp); case BTRFS_IOC_CLONE: return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index edcbe61..124296e 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -198,6 +198,31 @@ struct btrfs_ioctl_balance_progress { __u32 completed; }; +/* Types of balance filter */ +#define BTRFS_BALANCE_FILTER_COUNT_ONLY (1 << 0) + +#define BTRFS_BALANCE_FILTER_CHUNK_TYPE (1 << 1) +#define BTRFS_BALANCE_FILTER_MASK ((1 << 2) - 1) /* Logical or of all filter + * flags -- effectively versions + * the filtered balance ioctl */ + +/* All the possible options for a filter */ +struct btrfs_ioctl_balance_start { + __u64 flags; /* Bit field indicating which fields of this struct + are filled */ + + /* Output values: chunk counts */ + __u32 examined; + __u32 balanced; + + /* For FILTER_CHUNK_TYPE */ + __u64 chunk_type; /* Flag bits required */ + __u64 chunk_type_mask; /* Mask of bits to examine */ + + __u64 spare[507]; /* Make up the size of the structure to 4088 + * bytes for future expansion */ 
+}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -256,4 +281,6 @@ struct btrfs_ioctl_balance_progress { #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 32, \ struct btrfs_ioctl_balance_progress) #define BTRFS_IOC_BALANCE_CANCEL _IO(BTRFS_IOCTL_MAGIC, 33) +#define BTRFS_IOC_BALANCE_FILTERED _IOWR(BTRFS_IOCTL_MAGIC, 34, \ + struct btrfs_ioctl_balance_start) #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a81fd3c..ea466ab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2014,6 +2014,36 @@ static u64 div_factor(u64 num, int factor) return num; } +int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, + struct btrfs_root *chunk_root, + struct btrfs_path *path, + struct btrfs_key *key) +{ + struct extent_buffer *eb; + struct btrfs_chunk *chunk; + + /* No filter defined, everything matches */ + if (!filter) + return 1; + + /* No flags set, everything matches */ + if (filter->flags == 0) + return 1; + + eb = path->nodes[0]; + chunk = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_chunk); + + if (filter->flags & BTRFS_BALANCE_FILTER_CHUNK_TYPE) { + if ((btrfs_chunk_type(eb, chunk) & filter->chunk_type_mask) + != filter->chunk_type) { + return 0; + } + } + + return 1; +} + /* Define a type, and two functions which can be used for the two * phases of the balance operation: one for counting chunks, and one * for actually moving them. */ @@ -2054,6 +2084,7 @@ static void balance_move_chunks(struct btrfs_root *chunk_root, /* Iterate through all chunks, performing some function on each one. 
*/ static int balance_iterate_chunks(struct btrfs_root *chunk_root, struct btrfs_balance_info *bal_info, + struct btrfs_ioctl_balance_start *filter, balance_iterator_function iterator_fn) { int ret = 0; @@ -2069,6 +2100,9 @@ static int balance_iterate_chunks(struct btrfs_root *chunk_root, key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + filter->examined = 0; + filter->balanced = 0; + while (!bal_info->cancel_pending) { ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) @@ -2095,17 +2129,29 @@ static int balance_iterate_chunks(struct btrfs_root *chunk_root, break; /* Call the function to do the work for this chunk */ - btrfs_release_path(path); - iterator_fn(chunk_root, bal_info, path, &found_key); + filter->examined += 1; + + if (balance_chunk_filter(filter, chunk_root, + path, &found_key)) { + btrfs_release_path(path); + iterator_fn(chunk_root, bal_info, path, &found_key); + filter->balanced += 1; + } else { + btrfs_release_path(path); + } key.offset = found_key.offset - 1; } + printk(KERN_INFO "btrfs: balance: %u chunks considered, %u chunks balanced\n", + filter->examined, filter->balanced); + btrfs_free_path(path); return ret; } -int btrfs_balance(struct btrfs_root *dev_root) +int btrfs_balance(struct btrfs_root *dev_root, + struct btrfs_ioctl_balance_start *filters) { int ret; struct list_head *devices = &dev_root->fs_info->fs_devices->devices; @@ -2164,15 +2210,17 @@ int btrfs_balance(struct btrfs_root *dev_root) /* step two, count the chunks */ ret = balance_iterate_chunks(chunk_root, bal_info, - balance_count_chunks); + filters, balance_count_chunks); if (ret) goto error; /* step three, relocate all the chunks */ - ret = balance_iterate_chunks(chunk_root, bal_info, - balance_move_chunks); - if (ret) - goto error; + if (!(filters->flags & BTRFS_BALANCE_FILTER_COUNT_ONLY)) { + ret = balance_iterate_chunks(chunk_root, bal_info, + filters, balance_move_chunks); + if (ret) + goto error; + } ret = 0; if (bal_info->cancel_pending) { 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61..08ec502 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -22,6 +22,7 @@ #include <linux/bio.h> #include <linux/sort.h> #include "async-thread.h" +#include "ioctl.h" #define BTRFS_STRIPE_LEN (64 * 1024) @@ -208,7 +209,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_balance(struct btrfs_root *dev_root); +int btrfs_balance(struct btrfs_root *dev_root, + struct btrfs_ioctl_balance_start *filters); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Balance filter to take only chunks which have (or had) a stripe on the given device. Useful if a device has been forcibly removed from the filesystem, and the data from that device needs rebuilding. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ioctl.h | 8 ++++++-- fs/btrfs/volumes.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 124296e..21b0e6a 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -202,7 +202,8 @@ struct btrfs_ioctl_balance_progress { #define BTRFS_BALANCE_FILTER_COUNT_ONLY (1 << 0) #define BTRFS_BALANCE_FILTER_CHUNK_TYPE (1 << 1) -#define BTRFS_BALANCE_FILTER_MASK ((1 << 2) - 1) /* Logical or of all filter +#define BTRFS_BALANCE_FILTER_DEVID (1 << 2) +#define BTRFS_BALANCE_FILTER_MASK ((1 << 3) - 1) /* Logical or of all filter * flags -- effectively versions * the filtered balance ioctl */ @@ -219,7 +220,10 @@ struct btrfs_ioctl_balance_start { __u64 chunk_type; /* Flag bits required */ __u64 chunk_type_mask; /* Mask of bits to examine */ - __u64 spare[507]; /* Make up the size of the structure to 4088 + /* For FILTER_DEVID */ + __u64 devid; + + __u64 spare[506]; /* Make up the size of the structure to 4088 * bytes for future expansion */ }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ea466ab..36d9018 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2021,6 +2021,7 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, { struct extent_buffer *eb; struct btrfs_chunk *chunk; + int i; /* No filter defined, everything matches */ if (!filter) @@ -2040,6 +2041,19 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, return 0; } } + if (filter->flags & BTRFS_BALANCE_FILTER_DEVID) { + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int res = 0; + for (i = 0; i < num_stripes; i++) { + struct btrfs_stripe *stripe = btrfs_stripe_nr(chunk, i); + if (btrfs_stripe_devid(eb, stripe) == filter->devid) { + 
res = 1; + break; + } + } + if (!res) + return 0; + } return 1; } -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hugo Mills
2011-Jun-26 20:36 UTC
[PATCH v8 6/8] btrfs: Balance filter for virtual address ranges
Allow the balancing of chunks where some part of the chunk lies within the virtual (i.e. btrfs-internal) address range passed. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ioctl.h | 9 +++++++-- fs/btrfs/volumes.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 21b0e6a..ba09b19 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -203,7 +203,8 @@ struct btrfs_ioctl_balance_progress { #define BTRFS_BALANCE_FILTER_CHUNK_TYPE (1 << 1) #define BTRFS_BALANCE_FILTER_DEVID (1 << 2) -#define BTRFS_BALANCE_FILTER_MASK ((1 << 3) - 1) /* Logical or of all filter +#define BTRFS_BALANCE_FILTER_VIRTUAL_ADDRESS_RANGE (1 << 3) +#define BTRFS_BALANCE_FILTER_MASK ((1 << 4) - 1) /* Logical or of all filter * flags -- effectively versions * the filtered balance ioctl */ @@ -223,7 +224,11 @@ struct btrfs_ioctl_balance_start { /* For FILTER_DEVID */ __u64 devid; - __u64 spare[506]; /* Make up the size of the structure to 4088 + /* For FILTER_VIRTUAL_ADDRESS_RANGE */ + __u64 vrange_start; + __u64 vrange_end; + + __u64 spare[504]; /* Make up the size of the structure to 4088 * bytes for future expansion */ }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 36d9018..828aa34 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2054,6 +2054,12 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, if (!res) return 0; } + if (filter->flags & BTRFS_BALANCE_FILTER_VIRTUAL_ADDRESS_RANGE) { + u64 start = key->offset; + u64 end = start + btrfs_chunk_length(eb, chunk); + if (filter->vrange_start >= end || start >= filter->vrange_end) + return 0; + } return 1; } -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
There are a few places in btrfs where knowledge of the various parameters of a replication type is needed. Factor this out into a single function which can supply all the relevant information. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/super.c | 16 ++--- fs/btrfs/volumes.c | 155 +++++++++++++++++++++++++--------------------------- fs/btrfs/volumes.h | 17 ++++++ 3 files changed, 98 insertions(+), 90 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0bb4ebb..2ea4e01 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -965,12 +965,12 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + struct btrfs_replication_info repl_info; u64 skip_space; u64 type; u64 avail_space; u64 used_space; u64 min_stripe_size; - int min_stripes = 1; int i = 0, nr_devices; int ret; @@ -984,12 +984,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) /* calc min stripe number for data space alloction */ type = btrfs_get_alloc_profile(root, 1); - if (type & BTRFS_BLOCK_GROUP_RAID0) - min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID10) - min_stripes = 4; + btrfs_get_replication_info(&repl_info, type); if (type & BTRFS_BLOCK_GROUP_DUP) min_stripe_size = 2 * BTRFS_STRIPE_LEN; @@ -1057,14 +1052,15 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i = nr_devices - 1; avail_space = 0; - while (nr_devices >= min_stripes) { + while (nr_devices >= repl_info.devs_min) { if (devices_info[i].max_avail >= min_stripe_size) { int j; u64 alloc_size; - avail_space += devices_info[i].max_avail * min_stripes; + avail_space += devices_info[i].max_avail + * repl_info.devs_min; alloc_size = devices_info[i].max_avail; - for (j = i + 1 - min_stripes; j <= i; j++) + for (j = i + 1 - 
repl_info.devs_min; j <= i; j++) devices_info[j].max_avail -= alloc_size; } i--; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 828aa34..fb11550 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -117,6 +117,52 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, pending_bios->tail = tail; } +void btrfs_get_replication_info(struct btrfs_replication_info *info, + u64 type) +{ + info->sub_stripes = 1; + info->dev_stripes = 1; + info->devs_increment = 1; + info->num_copies = 1; + info->devs_max = 0; /* 0 == as many as possible */ + info->devs_min = 1; + + if (type & BTRFS_BLOCK_GROUP_DUP) { + info->dev_stripes = 2; + info->num_copies = 2; + info->devs_max = 1; + } else if (type & BTRFS_BLOCK_GROUP_RAID0) { + info->devs_min = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { + info->devs_increment = 2; + info->num_copies = 2; + info->devs_max = 2; + info->devs_min = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { + info->sub_stripes = 2; + info->devs_increment = 2; + info->num_copies = 2; + info->devs_min = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + info->max_stripe_size = 1024 * 1024 * 1024; + info->min_stripe_size = 64 * 1024 * 1024; + info->max_chunk_size = 10 * info->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + info->max_stripe_size = 256 * 1024 * 1024; + info->min_stripe_size = 32 * 1024 * 1024; + info->max_chunk_size = info->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + info->max_stripe_size = 8 * 1024 * 1024; + info->min_stripe_size = 1 * 1024 * 1024; + info->max_chunk_size = 2 * info->max_stripe_size; + } else { + printk(KERN_ERR "Block group is of an unknown usage type: not data, metadata or system.\n"); + BUG_ON(1); + } +} + /* * we try to collect pending bios for a device so we don''t get a large * number of procs sending bios down to the same device. 
This greatly @@ -1216,6 +1262,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) struct block_device *bdev; struct buffer_head *bh = NULL; struct btrfs_super_block *disk_super; + struct btrfs_replication_info repl_info; struct btrfs_fs_devices *cur_devices; u64 all_avail; u64 devid; @@ -1231,18 +1278,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_system_alloc_bits | root->fs_info->avail_metadata_alloc_bits; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->num_devices <= 4) { - printk(KERN_ERR "btrfs: unable to go below four devices " - "on raid10\n"); - ret = -EINVAL; - goto out; - } + btrfs_get_replication_info(&repl_info, all_avail); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->num_devices <= 2) { - printk(KERN_ERR "btrfs: unable to go below two " - "devices on raid1\n"); + if (root->fs_info->fs_devices->num_devices <= repl_info.devs_min) { + if (all_avail & BTRFS_BLOCK_GROUP_RAID10) { + printk(KERN_ERR "btrfs: unable to go below four " + "devices on raid10\n"); + } else if (all_avail & BTRFS_BLOCK_GROUP_RAID1) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + } ret = -EINVAL; goto out; } @@ -2446,16 +2491,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct extent_map_tree *em_tree; struct extent_map *em; struct btrfs_device_info *devices_info = NULL; + struct btrfs_replication_info repl_info; u64 total_avail; int num_stripes; /* total number of stripes to allocate */ - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ int ret; - u64 max_stripe_size; u64 max_chunk_size; u64 stripe_size; u64 num_bytes; @@ -2472,56 +2511,11 @@ static int __btrfs_alloc_chunk(struct 
btrfs_trans_handle *trans, if (list_empty(&fs_devices->alloc_list)) return -ENOSPC; - sub_stripes = 1; - dev_stripes = 1; - devs_increment = 1; - ncopies = 1; - devs_max = 0; /* 0 == as many as possible */ - devs_min = 1; - - /* - * define the properties of each RAID type. - * FIXME: move this to a global table and use it in all RAID - * calculation code - */ - if (type & (BTRFS_BLOCK_GROUP_DUP)) { - dev_stripes = 2; - ncopies = 2; - devs_max = 1; - } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) { - devs_increment = 2; - ncopies = 2; - devs_max = 2; - devs_min = 2; - } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - sub_stripes = 2; - devs_increment = 2; - ncopies = 2; - devs_min = 4; - } else { - devs_max = 1; - } - - if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; - max_chunk_size = 10 * max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_stripe_size = 256 * 1024 * 1024; - max_chunk_size = max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 8 * 1024 * 1024; - max_chunk_size = 2 * max_stripe_size; - } else { - printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", - type); - BUG_ON(1); - } + btrfs_get_replication_info(&repl_info, type); /* we don''t want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), - max_chunk_size); + repl_info.max_chunk_size); devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, GFP_NOFS); @@ -2563,15 +2557,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ ret = find_free_dev_extent(trans, device, - max_stripe_size * dev_stripes, + repl_info.max_stripe_size * repl_info.dev_stripes, &dev_offset, &max_avail); if (ret && ret != -ENOSPC) goto error; if (ret == 0) - max_avail = max_stripe_size * dev_stripes; + max_avail = repl_info.max_stripe_size * repl_info.dev_stripes; - if (max_avail < 
BTRFS_STRIPE_LEN * dev_stripes) + if (max_avail < BTRFS_STRIPE_LEN * repl_info.dev_stripes) continue; devices_info[ndevs].dev_offset = dev_offset; @@ -2588,28 +2582,29 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_cmp_device_info, NULL); /* round down to number of usable stripes */ - ndevs -= ndevs % devs_increment; + ndevs -= ndevs % repl_info.devs_increment; - if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { + if (ndevs < repl_info.devs_increment * repl_info.sub_stripes + || ndevs < repl_info.devs_min) { ret = -ENOSPC; goto error; } - if (devs_max && ndevs > devs_max) - ndevs = devs_max; + if (repl_info.devs_max && ndevs > repl_info.devs_max) + ndevs = repl_info.devs_max; /* * the primary goal is to maximize the number of stripes, so use as many * devices as possible, even if the stripes are not maximum sized. */ stripe_size = devices_info[ndevs-1].max_avail; - num_stripes = ndevs * dev_stripes; + num_stripes = ndevs * repl_info.dev_stripes; - if (stripe_size * num_stripes > max_chunk_size * ncopies) { - stripe_size = max_chunk_size * ncopies; + if (stripe_size * num_stripes > max_chunk_size * repl_info.num_copies) { + stripe_size = max_chunk_size * repl_info.num_copies; do_div(stripe_size, num_stripes); } - do_div(stripe_size, dev_stripes); + do_div(stripe_size, repl_info.dev_stripes); do_div(stripe_size, BTRFS_STRIPE_LEN); stripe_size *= BTRFS_STRIPE_LEN; @@ -2621,8 +2616,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->num_stripes = num_stripes; for (i = 0; i < ndevs; ++i) { - for (j = 0; j < dev_stripes; ++j) { - int s = i * dev_stripes + j; + for (j = 0; j < repl_info.dev_stripes; ++j) { + int s = i * repl_info.dev_stripes + j; map->stripes[s].dev = devices_info[i].dev; map->stripes[s].physical = devices_info[i].dev_offset + j * stripe_size; @@ -2633,10 +2628,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->io_align = BTRFS_STRIPE_LEN; map->io_width = 
BTRFS_STRIPE_LEN; map->type = type; - map->sub_stripes = sub_stripes; + map->sub_stripes = repl_info.sub_stripes; *map_ret = map; - num_bytes = stripe_size * (num_stripes / ncopies); + num_bytes = stripe_size * (num_stripes / repl_info.num_copies); *stripe_size_out = stripe_size; *num_bytes_out = num_bytes; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 08ec502..4fe9580 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -164,6 +164,22 @@ struct map_lookup { struct btrfs_bio_stripe stripes[]; }; +/* + * Information about a the parameters of a replication strategy (RAID + * level) + */ +struct btrfs_replication_info { + u32 sub_stripes; + u32 dev_stripes; + u32 devs_increment; + u32 num_copies; + u32 devs_max; + u32 devs_min; + u64 max_stripe_size; + u64 min_stripe_size; + u64 max_chunk_size; +}; + #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) @@ -217,4 +233,5 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); +void btrfs_get_replication_info(struct btrfs_replication_info *info, u64 type); #endif -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hugo Mills
2011-Jun-26 20:36 UTC
[PATCH v8 8/8] btrfs: Balance filter for physical device address
Add a filter for balancing which allows the selection of chunks with data in the given byte range on any block device in the filesystem. On its own, this filter is of little use, but when used with the devid filter, it can be used to rebalance all chunks which lie on a part of a specific device. Signed-off-by: Hugo Mills <hugo@carfax.org.uk> --- fs/btrfs/ioctl.h | 9 +++++++-- fs/btrfs/volumes.c | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ba09b19..08fcfed 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -204,7 +204,8 @@ struct btrfs_ioctl_balance_progress { #define BTRFS_BALANCE_FILTER_CHUNK_TYPE (1 << 1) #define BTRFS_BALANCE_FILTER_DEVID (1 << 2) #define BTRFS_BALANCE_FILTER_VIRTUAL_ADDRESS_RANGE (1 << 3) -#define BTRFS_BALANCE_FILTER_MASK ((1 << 4) - 1) /* Logical or of all filter +#define BTRFS_BALANCE_FILTER_DEVICE_ADDRESS_RANGE (1 << 4) +#define BTRFS_BALANCE_FILTER_MASK ((1 << 5) - 1) /* Logical or of all filter * flags -- effectively versions * the filtered balance ioctl */ @@ -228,7 +229,11 @@ struct btrfs_ioctl_balance_start { __u64 vrange_start; __u64 vrange_end; - __u64 spare[504]; /* Make up the size of the structure to 4088 + /* For FILTER_DEVICE_ADDRESS_RANGE */ + __u64 drange_start; + __u64 drange_end; + + __u64 spare[502]; /* Make up the size of the structure to 4088 * bytes for future expansion */ }; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index fb11550..fa536e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2067,6 +2067,7 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, struct extent_buffer *eb; struct btrfs_chunk *chunk; int i; + struct btrfs_replication_info replinfo; /* No filter defined, everything matches */ if (!filter) @@ -2080,6 +2081,8 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, chunk = btrfs_item_ptr(eb, path->slots[0], struct btrfs_chunk); + 
btrfs_get_replication_info(&replinfo, btrfs_chunk_type(eb, chunk)); + if (filter->flags & BTRFS_BALANCE_FILTER_CHUNK_TYPE) { if ((btrfs_chunk_type(eb, chunk) & filter->chunk_type_mask) != filter->chunk_type) { @@ -2105,6 +2108,25 @@ int balance_chunk_filter(struct btrfs_ioctl_balance_start *filter, if (filter->vrange_start >= end || start >= filter->vrange_end) return 0; } + if (filter->flags & BTRFS_BALANCE_FILTER_DEVICE_ADDRESS_RANGE) { + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int stripe_length = btrfs_chunk_length(eb, chunk) + * num_stripes / replinfo.num_copies; + int res = 0; + + for (i = 0; i < num_stripes; i++) { + struct btrfs_stripe *stripe = btrfs_stripe_nr(chunk, i); + u64 start = btrfs_stripe_offset(eb, stripe); + u64 end = start + stripe_length; + if (filter->drange_start < end + && start < filter->drange_end) { + res = 1; + break; + } + } + if (!res) + return 0; + } return 1; } -- 1.7.2.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jun 26, 2011 at 09:36:54PM +0100, Hugo Mills wrote: > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 828aa34..fb11550 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -117,6 +117,52 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, > pending_bios->tail = tail; > } > > +void btrfs_get_replication_info(struct btrfs_replication_info *info, > + u64 type) > +{ > + info->sub_stripes = 1; > + info->dev_stripes = 1; > + info->devs_increment = 1; > + info->num_copies = 1; > + info->devs_max = 0; /* 0 == as many as possible */ > + info->devs_min = 1; > + > + if (type & BTRFS_BLOCK_GROUP_DUP) { > + info->dev_stripes = 2; > + info->num_copies = 2; > + info->devs_max = 1; > + } else if (type & BTRFS_BLOCK_GROUP_RAID0) { > + info->devs_min = 2; > + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { > + info->devs_increment = 2; > + info->num_copies = 2; > + info->devs_max = 2; > + info->devs_min = 2; > + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { > + info->sub_stripes = 2; > + info->devs_increment = 2; > + info->num_copies = 2; > + info->devs_min = 4; > + } > + > + if (type & BTRFS_BLOCK_GROUP_DATA) { > + info->max_stripe_size = 1024 * 1024 * 1024; > + info->min_stripe_size = 64 * 1024 * 1024; > + info->max_chunk_size = 10 * info->max_stripe_size; > + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { > + info->max_stripe_size = 256 * 1024 * 1024; > + info->min_stripe_size = 32 * 1024 * 1024; > + info->max_chunk_size = info->max_stripe_size; > + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { > + info->max_stripe_size = 8 * 1024 * 1024; > + info->min_stripe_size = 1 * 1024 * 1024; > + info->max_chunk_size = 2 * info->max_stripe_size; > + } else { > + printk(KERN_ERR "Block group is of an unknown usage type: not data, metadata or system.\n"); > + BUG_ON(1); I'm hitting this BUG_ON with 'btrfs device delete', type = 24 which is BTRFS_BLOCK_GROUP_RAID0 + BTRFS_BLOCK_GROUP_RAID1.
in btrfs_rm_device: 1277 all_avail = root->fs_info->avail_data_alloc_bits | 1278 root->fs_info->avail_system_alloc_bits | 1279 root->fs_info->avail_metadata_alloc_bits; the values before the call are: [ 105.107074] D: all_avail 24 [ 105.111844] D: root->fs_info->avail_data_alloc_bits 8 [ 105.118858] D: root->fs_info->avail_system_alloc_bits 16 [ 105.126110] D: root->fs_info->avail_metadata_alloc_bits 16 There are 5 devices, sdb5 - sdb9; I'm removing sdb9 after a clean mount. david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jun 28, 2011 at 06:32:43PM +0200, David Sterba wrote:> On Sun, Jun 26, 2011 at 09:36:54PM +0100, Hugo Mills wrote: > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > > index 828aa34..fb11550 100644 > > --- a/fs/btrfs/volumes.c > > +++ b/fs/btrfs/volumes.c > > @@ -117,6 +117,52 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, > > pending_bios->tail = tail; > > } > > > > +void btrfs_get_replication_info(struct btrfs_replication_info *info, > > + u64 type) > > +{ > > + info->sub_stripes = 1; > > + info->dev_stripes = 1; > > + info->devs_increment = 1; > > + info->num_copies = 1; > > + info->devs_max = 0; /* 0 == as many as possible */ > > + info->devs_min = 1; > > + > > + if (type & BTRFS_BLOCK_GROUP_DUP) { > > + info->dev_stripes = 2; > > + info->num_copies = 2; > > + info->devs_max = 1; > > + } else if (type & BTRFS_BLOCK_GROUP_RAID0) { > > + info->devs_min = 2; > > + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { > > + info->devs_increment = 2; > > + info->num_copies = 2; > > + info->devs_max = 2; > > + info->devs_min = 2; > > + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { > > + info->sub_stripes = 2; > > + info->devs_increment = 2; > > + info->num_copies = 2; > > + info->devs_min = 4; > > + } > > + > > + if (type & BTRFS_BLOCK_GROUP_DATA) { > > + info->max_stripe_size = 1024 * 1024 * 1024; > > + info->min_stripe_size = 64 * 1024 * 1024; > > + info->max_chunk_size = 10 * info->max_stripe_size; > > + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { > > + info->max_stripe_size = 256 * 1024 * 1024; > > + info->min_stripe_size = 32 * 1024 * 1024; > > + info->max_chunk_size = info->max_stripe_size; > > + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { > > + info->max_stripe_size = 8 * 1024 * 1024; > > + info->min_stripe_size = 1 * 1024 * 1024; > > + info->max_chunk_size = 2 * info->max_stripe_size; > > + } else { > > + printk(KERN_ERR "Block group is of an unknown usage type: not data, metadata or system.\n"); > > + BUG_ON(1);From 
inspection, this looks like it's a viable solution: + info->max_stripe_size = 0; + info->min_stripe_size = -1ULL; + info->max_chunk_size = 0; We only run into problems if a user of this function passes a RAID-only block group type and then tries to use the size parameters from it. There's only three users of the function currently, and this case is the only one that doesn't pass a "real" block group type flag. I'll run a quick test of dev rm and see what happens... > I'm hitting this BUG_ON with 'btrfs device delete', type = 24 which is > BTRFS_BLOCK_GROUP_RAID0 + BTRFS_BLOCK_GROUP_RAID1 . > > in btrfs_rm_device: > > 1277 all_avail = root->fs_info->avail_data_alloc_bits | > 1278 root->fs_info->avail_system_alloc_bits | > 1279 root->fs_info->avail_metadata_alloc_bits; > > the values before the call are: > > [ 105.107074] D: all_avail 24 > [ 105.111844] D: root->fs_info->avail_data_alloc_bits 8 > [ 105.118858] D: root->fs_info->avail_system_alloc_bits 16 > [ 105.126110] D: root->fs_info->avail_metadata_alloc_bits 16 > > > there are 5 devices, sdb5 - sdb9, i'm removing sdb9, after clean > mount. > > > david Hugo. -- === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk == PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk --- vi vi vi: the Editor of the Beast. ---
On Tue, Jun 28, 2011 at 08:26:43PM +0100, Hugo Mills wrote:> On Tue, Jun 28, 2011 at 06:32:43PM +0200, David Sterba wrote: > > On Sun, Jun 26, 2011 at 09:36:54PM +0100, Hugo Mills wrote: > > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > > > index 828aa34..fb11550 100644 > > > --- a/fs/btrfs/volumes.c > > > +++ b/fs/btrfs/volumes.c > > > @@ -117,6 +117,52 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, > > > pending_bios->tail = tail; > > > } > > > > > > +void btrfs_get_replication_info(struct btrfs_replication_info *info, > > > + u64 type) > > > +{ > > > + info->sub_stripes = 1; > > > + info->dev_stripes = 1; > > > + info->devs_increment = 1; > > > + info->num_copies = 1; > > > + info->devs_max = 0; /* 0 == as many as possible */ > > > + info->devs_min = 1; > > > + > > > + if (type & BTRFS_BLOCK_GROUP_DUP) { > > > + info->dev_stripes = 2; > > > + info->num_copies = 2; > > > + info->devs_max = 1; > > > + } else if (type & BTRFS_BLOCK_GROUP_RAID0) { > > > + info->devs_min = 2; > > > + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { > > > + info->devs_increment = 2; > > > + info->num_copies = 2; > > > + info->devs_max = 2; > > > + info->devs_min = 2; > > > + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { > > > + info->sub_stripes = 2; > > > + info->devs_increment = 2; > > > + info->num_copies = 2; > > > + info->devs_min = 4; > > > + } > > > + > > > + if (type & BTRFS_BLOCK_GROUP_DATA) { > > > + info->max_stripe_size = 1024 * 1024 * 1024; > > > + info->min_stripe_size = 64 * 1024 * 1024; > > > + info->max_chunk_size = 10 * info->max_stripe_size; > > > + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { > > > + info->max_stripe_size = 256 * 1024 * 1024; > > > + info->min_stripe_size = 32 * 1024 * 1024; > > > + info->max_chunk_size = info->max_stripe_size; > > > + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { > > > + info->max_stripe_size = 8 * 1024 * 1024; > > > + info->min_stripe_size = 1 * 1024 * 1024; > > > + info->max_chunk_size = 2 * 
info->max_stripe_size; > > > + } else { > > > + printk(KERN_ERR "Block group is of an unknown usage type: not data, metadata or system.\n"); > > > + BUG_ON(1); > > From inspection, this looks like it's a viable solution: > > + info->max_stripe_size = 0; > + info->min_stripe_size = -1ULL; > + info->max_chunk_size = 0; > > We only run into problems if a user of this function passes a > RAID-only block group type and then tries to use the size parameters > from it. There's only three users of the function currently, and this > case is the only one that doesn't pass a "real" block group type flag. > > I'll run a quick test of dev rm and see what happens... [ I didn't apply or run this series, take this with a grain of salt ] The problem seems to be that Hugo's function expects an on-disk chunk type as its input. However avail_{data,metadata,system}_alloc_bits (of which all_avail is comprised) are in-memory fields, they don't have BTRFS_BLOCK_GROUP_{DATA,METADATA,SYSTEM} set by design. There are three fields: avail_data_alloc_bits avail_metadata_alloc_bits avail_system_alloc_bits so we don't need BTRFS_BLOCK_GROUP_{DATA,METADATA,SYSTEM} set to differentiate between data and metadata profiles. I'd say that BUG_ON should be dropped and those three lines above added or maybe a special switch for this particular case to leave info partially un-initialized, since we only need devs_min in this case. Thanks, Ilya > > I'm hitting this BUG_ON with 'btrfs device delete', type = 24 which is > > BTRFS_BLOCK_GROUP_RAID0 + BTRFS_BLOCK_GROUP_RAID1 .
> > > > in btrfs_rm_device: > > > > 1277 all_avail = root->fs_info->avail_data_alloc_bits | > > 1278 root->fs_info->avail_system_alloc_bits | > > 1279 root->fs_info->avail_metadata_alloc_bits; > > > > the values before the call are: > > > > [ 105.107074] D: all_avail 24 > > [ 105.111844] D: root->fs_info->avail_data_alloc_bits 8 > > [ 105.118858] D: root->fs_info->avail_system_alloc_bits 16 > > [ 105.126110] D: root->fs_info->avail_metadata_alloc_bits 16 > > > > > > there are 5 devices, sdb5 - sdb9, i''m removing sdb9, after clean > > mount. > > > > > > david > > Hugo. > > -- > === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk ==> PGP key: 515C238D from wwwkeys.eu.pgp.net or http://www.carfax.org.uk > --- vi vi vi: the Editor of the Beast. ----- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hugo Mills wrote:> This patch adds an ioctl for cancelling a btrfs balance operation > mid-flight. The ioctl simply sets a flag, and the operation terminates > after the current block group move has completed. > > Signed-off-by: Hugo Mills <hugo@carfax.org.uk> > --- > fs/btrfs/ctree.h | 1 + > fs/btrfs/ioctl.c | 28 ++++++++++++++++++++++++++++ > fs/btrfs/ioctl.h | 1 + > fs/btrfs/volumes.c | 7 ++++++- > 4 files changed, 36 insertions(+), 1 deletions(-) > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index 25aa3cf..5031085 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -876,6 +876,7 @@ struct btrfs_block_group_cache { > struct btrfs_balance_info { > u32 expected; > u32 completed; > + int cancel_pending; > }; > > struct reloc_control; > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c > index 5ddf816..d4458d0 100644 > --- a/fs/btrfs/ioctl.c > +++ b/fs/btrfs/ioctl.c > @@ -2868,6 +2868,32 @@ error: > return ret; > } > > +/* > + * Cancel a running balance operation > + */ > +long btrfs_ioctl_balance_cancel(struct btrfs_fs_info *fs_info) > +{ > + int err = 0; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > + spin_lock(&fs_info->balance_info_lock); > + if (!fs_info->balance_info) { > + err = -EINVAL; > + goto error; > + } > + if (fs_info->balance_info->cancel_pending) { > + err = -ECANCELED; > + goto error; > + } > + fs_info->balance_info->cancel_pending = 1; > + > +error: > + spin_unlock(&fs_info->balance_info_lock); > + return err; > +} > + > long btrfs_ioctl(struct file *file, unsigned int > cmd, unsigned long arg) > { > @@ -2915,6 +2941,8 @@ long btrfs_ioctl(struct file *file, unsigned int > return btrfs_balance(root->fs_info->dev_root); > case BTRFS_IOC_BALANCE_PROGRESS: > return btrfs_ioctl_balance_progress(root->fs_info, argp); > + case BTRFS_IOC_BALANCE_CANCEL: > + return btrfs_ioctl_balance_cancel(root->fs_info); > case BTRFS_IOC_CLONE: > return btrfs_ioctl_clone(file, arg, 0, 0, 0); > case BTRFS_IOC_CLONE_RANGE: > diff --git 
a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h > index 575b25f..edcbe61 100644 > --- a/fs/btrfs/ioctl.h > +++ b/fs/btrfs/ioctl.h > @@ -255,4 +255,5 @@ struct btrfs_ioctl_balance_progress { > struct btrfs_ioctl_fs_info_args) > #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 32, \ > struct btrfs_ioctl_balance_progress) > +#define BTRFS_IOC_BALANCE_CANCEL _IO(BTRFS_IOCTL_MAGIC, 33) > #endif > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 4c0a386..f38b231 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -2049,6 +2049,7 @@ int btrfs_balance(struct btrfs_root *dev_root) > bal_info->expected = -1; /* One less than actually counted, > because chunk 0 is special */ > bal_info->completed = 0; > + bal_info->cancel_pending = 0; > spin_unlock(&dev_root->fs_info->balance_info_lock); > > /* step one make some room on all the devices */ > @@ -2109,7 +2110,7 @@ int btrfs_balance(struct btrfs_root *dev_root) > key.offset = (u64)-1; > key.type = BTRFS_CHUNK_ITEM_KEY; > > - while (1) { > + while (!bal_info->cancel_pending) { > ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); > if (ret < 0) > goto error; > @@ -2149,6 +2150,10 @@ int btrfs_balance(struct btrfs_root *dev_root) > bal_info->completed, bal_info->expected); > } > ret = 0; > + if (bal_info->cancel_pending) { > + printk(KERN_INFO "btrfs: balance cancelled\n"); > + ret = -EINTR; > + } Why not detect if there's any pending signal in the while loop, so we can just use Ctrl+C to cancel balance. > error: > btrfs_free_path(path); > spin_lock(&dev_root->fs_info->balance_info_lock); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jun 29, 2011 at 02:00:20PM +0800, Li Zefan wrote: > Hugo Mills wrote: > > @@ -2149,6 +2150,10 @@ int btrfs_balance(struct btrfs_root *dev_root) > > bal_info->completed, bal_info->expected); > > } > > ret = 0; > > + if (bal_info->cancel_pending) { > > + printk(KERN_INFO "btrfs: balance cancelled\n"); > > + ret = -EINTR; > > + } > > Why not detect if there's any pending signal in the while loop? so > we can just use Ctrl+C to cancel balance. Balance is capable of running in background, you could not send the Ctrl-C to such process. david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html