This patchset introduces multi-task delalloc flushing, which makes the delalloc
flush faster. Besides that, it also fixes the problem that we join the same
transaction handle more than twice.

Implementation:
- Create a new worker pool.
- Queue the inodes with pending delalloc into the work queue of the worker pool
  when we want to force them to disk, and then wait until all the works we
  submitted are done.
- The ordered extents can also be queued into this work queue. The process is
  similar to the second step.
(A minimal sketch of this queue-and-wait pattern follows the diffstat below.)

Miao Xie (3):
  Btrfs: make delalloc inodes be flushed by multi-task
  Btrfs: make ordered operations be handled by multi-task
  Btrfs: make ordered extent be flushed by multi-task

 fs/btrfs/ctree.h        |   14 +++++++
 fs/btrfs/disk-io.c      |    7 ++++
 fs/btrfs/inode.c        |   78 ++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/ordered-data.c |   87 ++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/ordered-data.h |    7 +++-
 fs/btrfs/relocation.c   |    6 +++-
 fs/btrfs/transaction.c  |   24 ++++++++++---
 7 files changed, 185 insertions(+), 38 deletions(-)
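For readers skimming the series, here is a minimal, illustrative sketch of the
queue-and-wait pattern the patches implement. It is a userspace analogy with
plain pthreads rather than the btrfs worker pool; the names flush_one_inode()
and inode_id are made up for illustration and are not kernel or btrfs APIs.

/*
 * Illustrative userspace sketch only (plain C + pthreads, not kernel code).
 * It mirrors the shape of patch 1: allocate one work item per inode with
 * pending delalloc, hand each item to a worker, then wait for every item to
 * finish before returning.
 *
 * Build with: cc sketch.c -o sketch -lpthread
 */
#include <pthread.h>
#include <stdio.h>

#define NR_INODES 4

struct delalloc_work {
    int inode_id;       /* stand-in for struct inode * */
    pthread_t worker;   /* stand-in for a pooled btrfs worker */
};

/* Worker body: in the real patch this is btrfs_run_delalloc_work(), which
 * flushes the inode and then signals a struct completion. */
static void *flush_one_inode(void *arg)
{
    struct delalloc_work *work = arg;

    printf("flushing inode %d\n", work->inode_id);
    return NULL;
}

int main(void)
{
    struct delalloc_work works[NR_INODES];
    int i;

    /* Submission phase: cheap, just queue one work item per inode. */
    for (i = 0; i < NR_INODES; i++) {
        works[i].inode_id = i;
        if (pthread_create(&works[i].worker, NULL,
                   flush_one_inode, &works[i]))
            return 1;
    }

    /* Wait phase: like btrfs_wait_and_free_delalloc_work(), the caller
     * blocks until every submitted flush has finished. */
    for (i = 0; i < NR_INODES; i++)
        pthread_join(works[i].worker, NULL);

    return 0;
}

The key property is the same as in the kernel patches: submission is cheap and
spreads the flushing across several workers, while the caller still blocks
until every submitted flush has completed, so the semantics seen by existing
callers such as btrfs_commit_transaction() do not change.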
Miao Xie
2012-Oct-25 09:28 UTC
[PATCH 1/3] Btrfs: make delalloc inodes be flushed by multi-task
This patch introduces a new worker pool named "flush_workers". When we want to
force all the inodes with pending delalloc to disk, we can queue those inodes
into the work queue of the worker pool; in this way, they will be flushed by
multiple tasks.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.h       |   14 ++++++
 fs/btrfs/disk-io.c     |    7 ++++
 fs/btrfs/inode.c       |   78 ++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/relocation.c  |    6 +++-
 fs/btrfs/transaction.c |    6 +++-
 5 files changed, 103 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34c5a44..cd0c6d6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1333,6 +1333,7 @@ struct btrfs_fs_info {
     struct btrfs_workers generic_worker;
     struct btrfs_workers workers;
     struct btrfs_workers delalloc_workers;
+    struct btrfs_workers flush_workers;
     struct btrfs_workers endio_workers;
     struct btrfs_workers endio_meta_workers;
     struct btrfs_workers endio_meta_write_workers;
@@ -3271,6 +3272,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                  struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+    struct inode *inode;
+    int wait;
+    int delay_iput;
+    struct completion completion;
+    struct list_head list;
+    struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+                              int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
                        size_t pg_offset, u64 start, u64 len,
                        int create);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda519..bd70c28 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2279,6 +2279,10 @@ int open_ctree(struct super_block *sb,
                fs_info->thread_pool_size,
                &fs_info->generic_worker);
 
+    btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
+
     btrfs_init_workers(&fs_info->submit_workers, "submit",
                min_t(u64, fs_devices->num_devices,
                fs_info->thread_pool_size),
@@ -2350,6 +2354,7 @@ int open_ctree(struct super_block *sb,
     ret |= btrfs_start_workers(&fs_info->delayed_workers);
     ret |= btrfs_start_workers(&fs_info->caching_workers);
     ret |= btrfs_start_workers(&fs_info->readahead_workers);
+    ret |= btrfs_start_workers(&fs_info->flush_workers);
     if (ret) {
         err = -ENOMEM;
         goto fail_sb_buffer;
@@ -2667,6 +2672,7 @@ fail_sb_buffer:
     btrfs_stop_workers(&fs_info->submit_workers);
     btrfs_stop_workers(&fs_info->delayed_workers);
     btrfs_stop_workers(&fs_info->caching_workers);
+    btrfs_stop_workers(&fs_info->flush_workers);
 fail_alloc:
 fail_iput:
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3339,6 +3345,7 @@ int close_ctree(struct btrfs_root *root)
     btrfs_stop_workers(&fs_info->delayed_workers);
     btrfs_stop_workers(&fs_info->caching_workers);
     btrfs_stop_workers(&fs_info->readahead_workers);
+    btrfs_stop_workers(&fs_info->flush_workers);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
     if (btrfs_test_opt(root, CHECK_INTEGRITY))
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f92def2..290cd77 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
@@ -7199,6 +7200,8 @@ void btrfs_destroy_cachep(void)
         kmem_cache_destroy(btrfs_path_cachep);
     if (btrfs_free_space_cachep)
         kmem_cache_destroy(btrfs_free_space_cachep);
+    if (btrfs_delalloc_work_cachep)
+        kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
@@ -7233,6 +7236,13 @@ int btrfs_init_cachep(void)
     if (!btrfs_free_space_cachep)
         goto fail;
 
+    btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+            sizeof(struct btrfs_delalloc_work), 0,
+            SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+            NULL);
+    if (!btrfs_delalloc_work_cachep)
+        goto fail;
+
     return 0;
 fail:
     btrfs_destroy_cachep();
@@ -7443,6 +7453,49 @@ out_notrans:
     return ret;
 }
 
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+    struct btrfs_delalloc_work *delalloc_work;
+
+    delalloc_work = container_of(work, struct btrfs_delalloc_work,
+                     work);
+    if (delalloc_work->wait)
+        btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+    else
+        filemap_flush(delalloc_work->inode->i_mapping);
+
+    if (delalloc_work->delay_iput)
+        btrfs_add_delayed_iput(delalloc_work->inode);
+    else
+        iput(delalloc_work->inode);
+    complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+                              int wait, int delay_iput)
+{
+    struct btrfs_delalloc_work *work;
+
+    work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+    if (!work)
+        return NULL;
+
+    init_completion(&work->completion);
+    INIT_LIST_HEAD(&work->list);
+    work->inode = inode;
+    work->wait = wait;
+    work->delay_iput = delay_iput;
+    work->work.func = btrfs_run_delalloc_work;
+
+    return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+    wait_for_completion(&work->completion);
+    kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
@@ -7452,10 +7505,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
     struct list_head *head = &root->fs_info->delalloc_inodes;
     struct btrfs_inode *binode;
     struct inode *inode;
+    struct btrfs_delalloc_work *work, *next;
+    struct list_head works;
+    int ret = 0;
 
     if (root->fs_info->sb->s_flags & MS_RDONLY)
         return -EROFS;
 
+    INIT_LIST_HEAD(&works);
+
     spin_lock(&root->fs_info->delalloc_lock);
     while (!list_empty(head)) {
         binode = list_entry(head->next, struct btrfs_inode,
@@ -7465,11 +7523,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
         list_del_init(&binode->delalloc_inodes);
         spin_unlock(&root->fs_info->delalloc_lock);
         if (inode) {
-            filemap_flush(inode->i_mapping);
-            if (delay_iput)
-                btrfs_add_delayed_iput(inode);
-            else
-                iput(inode);
+            work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+            if (!work) {
+                ret = -ENOMEM;
+                goto out;
+            }
+            list_add_tail(&work->list, &works);
+            btrfs_queue_worker(&root->fs_info->flush_workers,
+                       &work->work);
         }
         cond_resched();
         spin_lock(&root->fs_info->delalloc_lock);
@@ -7488,7 +7549,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
             atomic_read(&root->fs_info->async_delalloc_pages) == 0));
     }
     atomic_dec(&root->fs_info->async_submit_draining);
-    return 0;
+out:
+    list_for_each_entry_safe(work, next, &works, list) {
+        list_del_init(&work->list);
+        btrfs_wait_and_free_delalloc_work(work);
+    }
+    return ret;
 }
 
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 776f0aa..5bef816 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4057,7 +4057,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
            (unsigned long long)rc->block_group->key.objectid,
            (unsigned long long)rc->block_group->flags);
 
-    btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+    ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+    if (ret < 0) {
+        err = ret;
+        goto out;
+    }
     btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
     while (1) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 77db875..4aed529 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1499,7 +1499,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
     WARN_ON(cur_trans != trans->transaction);
 
     if (flush_on_commit || snap_pending) {
-        btrfs_start_delalloc_inodes(root, 1);
+        ret = btrfs_start_delalloc_inodes(root, 1);
+        if (ret) {
+            btrfs_abort_transaction(trans, root, ret);
+            goto cleanup_transaction;
+        }
         btrfs_wait_ordered_extents(root, 1);
     }
 
-- 
1.6.5.2
Miao Xie
2012-Oct-25 09:31 UTC
[PATCH 2/3] Btrfs: make ordered operations be handled by multi-task
The processing of the ordered operations is similar to the delalloc inode
flush, so we handle them with the flush workers too.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ordered-data.c |   46 ++++++++++++++++++++++++++++++----------------
 fs/btrfs/ordered-data.h |    2 +-
 fs/btrfs/transaction.c  |   18 ++++++++++++++----
 3 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7772f02..ab2a3c0 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -519,13 +519,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
  * extra check to make sure the ordered operation list really is empty
  * before we return
  */
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 {
     struct btrfs_inode *btrfs_inode;
     struct inode *inode;
     struct list_head splice;
+    struct list_head works;
+    struct btrfs_delalloc_work *work, *next;
+    int ret = 0;
 
     INIT_LIST_HEAD(&splice);
+    INIT_LIST_HEAD(&works);
 
     mutex_lock(&root->fs_info->ordered_operations_mutex);
     spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +537,7 @@ again:
     list_splice_init(&root->fs_info->ordered_operations, &splice);
 
     while (!list_empty(&splice)) {
+
         btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                      ordered_operations);
@@ -549,15 +554,26 @@ again:
             list_add_tail(&BTRFS_I(inode)->ordered_operations,
                       &root->fs_info->ordered_operations);
         }
+
+        if (!inode)
+            continue;
         spin_unlock(&root->fs_info->ordered_extent_lock);
 
-        if (inode) {
-            if (wait)
-                btrfs_wait_ordered_range(inode, 0, (u64)-1);
-            else
-                filemap_flush(inode->i_mapping);
-            btrfs_add_delayed_iput(inode);
+        work = btrfs_alloc_delalloc_work(inode, wait, 1);
+        if (!work) {
+            if (list_empty(&BTRFS_I(inode)->ordered_operations))
+                list_add_tail(&btrfs_inode->ordered_operations,
+                          &splice);
+            spin_lock(&root->fs_info->ordered_extent_lock);
+            list_splice_tail(&splice,
+                     &root->fs_info->ordered_operations);
+            spin_unlock(&root->fs_info->ordered_extent_lock);
+            ret = -ENOMEM;
+            goto out;
         }
+        list_add_tail(&work->list, &works);
+        btrfs_queue_worker(&root->fs_info->flush_workers,
+                   &work->work);
 
         cond_resched();
         spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +582,13 @@ again:
         goto again;
 
     spin_unlock(&root->fs_info->ordered_extent_lock);
+out:
+    list_for_each_entry_safe(work, next, &works, list) {
+        list_del_init(&work->list);
+        btrfs_wait_and_free_delalloc_work(work);
+    }
     mutex_unlock(&root->fs_info->ordered_operations_mutex);
+    return ret;
 }
 
 /*
@@ -934,15 +956,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
     if (last_mod < root->fs_info->last_trans_committed)
         return;
 
-    /*
-     * the transaction is already committing.  Just start the IO and
-     * don't bother with all of this list nonsense
-     */
-    if (trans && root->fs_info->running_transaction->blocked) {
-        btrfs_wait_ordered_range(inode, 0, (u64)-1);
-        return;
-    }
-
     spin_lock(&root->fs_info->ordered_extent_lock);
     if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
         list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +972,7 @@ int __init ordered_data_init(void)
                      NULL);
     if (!btrfs_ordered_extent_cache)
         return -ENOMEM;
+
     return 0;
 }
 
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b..e8dcec6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -186,7 +186,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                 struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                  struct btrfs_root *root,
                  struct inode *inode);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4aed529..621790e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1414,15 +1414,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
     struct btrfs_transaction *cur_trans = trans->transaction;
     struct btrfs_transaction *prev_trans = NULL;
     DEFINE_WAIT(wait);
-    int ret = -EIO;
+    int ret;
     int should_grow = 0;
     unsigned long now = get_seconds();
     int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 
-    btrfs_run_ordered_operations(root, 0);
+    ret = btrfs_run_ordered_operations(root, 0);
+    if (ret) {
+        btrfs_abort_transaction(trans, root, ret);
+        goto cleanup_transaction;
+    }
 
-    if (cur_trans->aborted)
+    if (cur_trans->aborted) {
+        ret = cur_trans->aborted;
         goto cleanup_transaction;
+    }
 
     /* make a pass through all the delayed refs we have so far
      * any runnings procs may add more while we are here
@@ -1525,7 +1531,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
      * it here and no for sure that nothing new will be added
      * to the list
      */
-    btrfs_run_ordered_operations(root, 1);
+    ret = btrfs_run_ordered_operations(root, 1);
+    if (ret) {
+        btrfs_abort_transaction(trans, root, ret);
+        goto cleanup_transaction;
+    }
 
     prepare_to_wait(&cur_trans->writer_wait, &wait,
             TASK_UNINTERRUPTIBLE);
-- 
1.6.5.2
Miao Xie
2012-Oct-25 09:41 UTC
[PATCH 3/3] Btrfs: make ordered extent be flushed by multi-task
Though the processing of the ordered extents is a bit different from the
delalloc inode flush, we can see it as a subset of that flush, so we also
handle it with the flush workers.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ordered-data.c |   41 +++++++++++++++++++++++++++++++++--------
 fs/btrfs/ordered-data.h |    5 ++++-
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ab2a3c0..eecc20f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
     init_waitqueue_head(&entry->wait);
     INIT_LIST_HEAD(&entry->list);
     INIT_LIST_HEAD(&entry->root_extent_list);
+    INIT_LIST_HEAD(&entry->work_list);
+    init_completion(&entry->completion);
 
     trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
     wake_up(&entry->wait);
 }
 
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+    struct btrfs_ordered_extent *ordered;
+
+    ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+    btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+    complete(&ordered->completion);
+}
+
 /*
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
-    struct list_head splice;
+    struct list_head splice, works;
     struct list_head *cur;
-    struct btrfs_ordered_extent *ordered;
+    struct btrfs_ordered_extent *ordered, *next;
     struct inode *inode;
 
     INIT_LIST_HEAD(&splice);
+    INIT_LIST_HEAD(&works);
 
     spin_lock(&root->fs_info->ordered_extent_lock);
     list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
         spin_unlock(&root->fs_info->ordered_extent_lock);
 
         if (inode) {
-            btrfs_start_ordered_extent(inode, ordered, 1);
-            btrfs_put_ordered_extent(ordered);
-            if (delay_iput)
-                btrfs_add_delayed_iput(inode);
-            else
-                iput(inode);
+            ordered->flush_work.func = btrfs_run_ordered_extent_work;
+            list_add_tail(&ordered->work_list, &works);
+            btrfs_queue_worker(&root->fs_info->flush_workers,
+                       &ordered->flush_work);
         } else {
             btrfs_put_ordered_extent(ordered);
         }
 
+        cond_resched();
         spin_lock(&root->fs_info->ordered_extent_lock);
     }
     spin_unlock(&root->fs_info->ordered_extent_lock);
+
+    list_for_each_entry_safe(ordered, next, &works, work_list) {
+        list_del_init(&ordered->work_list);
+        wait_for_completion(&ordered->completion);
+
+        inode = ordered->inode;
+        btrfs_put_ordered_extent(ordered);
+        if (delay_iput)
+            btrfs_add_delayed_iput(inode);
+        else
+            iput(inode);
+
+        cond_resched();
+    }
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e8dcec6..efc7c29 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
     struct list_head root_extent_list;
 
     struct btrfs_work work;
-};
 
+    struct completion completion;
+    struct btrfs_work flush_work;
+    struct list_head work_list;
+};
 /*
  * calculates the total size you need to allocate for an ordered sum
-- 
1.6.5.2
On 10/25/2012 05:20 PM, Miao Xie wrote:
> This patchset introduces multi-task delalloc flushing, which makes the delalloc
> flush faster. Besides that, it also fixes the problem that we join the same
> transaction handle more than twice.
>
> Implementation:
> - Create a new worker pool.
> - Queue the inodes with pending delalloc into the work queue of the worker pool
>   when we want to force them to disk, and then wait until all the works we
>   submitted are done.
> - The ordered extents can also be queued into this work queue. The process is
>   similar to the second step.
>

I can see the potential improvements brought by flushing inodes this way.

But I don't think it makes much sense to make the waiting process multi-task:
even if we spread the waits for ordered extents across different cpus, they
just occupy the cpu, keep waiting and get scheduled out; I mean, the bottleneck
is whatever we're waiting for.

Besides, considering that this patchset is about getting us better performance,
I'm expecting some performance numbers (I'm a little worried about context
switch overhead).

btw, cool ideas indeed.

thanks,
liubo

> Miao Xie (3):
>   Btrfs: make delalloc inodes be flushed by multi-task
>   Btrfs: make ordered operations be handled by multi-task
>   Btrfs: make ordered extent be flushed by multi-task
>
>  fs/btrfs/ctree.h        |   14 +++++++
>  fs/btrfs/disk-io.c      |    7 ++++
>  fs/btrfs/inode.c        |   78 ++++++++++++++++++++++++++++++++++++++---
>  fs/btrfs/ordered-data.c |   87 ++++++++++++++++++++++++++++++++++-------------
>  fs/btrfs/ordered-data.h |    7 +++-
>  fs/btrfs/relocation.c   |    6 +++-
>  fs/btrfs/transaction.c  |   24 ++++++++++---
>  7 files changed, 185 insertions(+), 38 deletions(-)
On Thu, 25 Oct 2012 19:53:05 +0800, Liu Bo wrote:
> On 10/25/2012 05:20 PM, Miao Xie wrote:
>> This patchset introduces multi-task delalloc flushing, which makes the delalloc
>> flush faster. Besides that, it also fixes the problem that we join the same
>> transaction handle more than twice.
>>
>> Implementation:
>> - Create a new worker pool.
>> - Queue the inodes with pending delalloc into the work queue of the worker pool
>>   when we want to force them to disk, and then wait until all the works we
>>   submitted are done.
>> - The ordered extents can also be queued into this work queue. The process is
>>   similar to the second step.
>>
>
> I can see the potential improvements brought by flushing inodes this way.
>
> But I don't think it makes much sense to make the waiting process multi-task:
> even if we spread the waits for ordered extents across different cpus, they
> just occupy the cpu, keep waiting and get scheduled out; I mean, the bottleneck
> is whatever we're waiting for.

Thanks for your comment. I think only btrfs_run_ordered_operations(root, 0)
need not wait for the works; the others must wait.

The first reason is to avoid changing the semantics of those three functions.
The second reason is that we have to wait for the completion of all the works;
if not, the file data in snapshots may differ from the source subvolumes
because the flush may not have finished before the snapshot creation.

> Besides, considering that this patchset is about getting us better performance,
> I'm expecting some performance numbers (I'm a little worried about context
> switch overhead).

OK, I'll send them out later.

Thanks
Miao

> btw, cool ideas indeed.
>
> thanks,
> liubo
On 10/26/2012 09:56 AM, Miao Xie wrote:
>> I can see the potential improvements brought by flushing inodes this way.
>>
>> But I don't think it makes much sense to make the waiting process multi-task:
>> even if we spread the waits for ordered extents across different cpus, they
>> just occupy the cpu, keep waiting and get scheduled out; I mean, the
>> bottleneck is whatever we're waiting for.
>
> Thanks for your comment. I think only btrfs_run_ordered_operations(root, 0)
> need not wait for the works; the others must wait.
>
> The first reason is to avoid changing the semantics of those three functions.
> The second reason is that we have to wait for the completion of all the works;
> if not, the file data in snapshots may differ from the source subvolumes
> because the flush may not have finished before the snapshot creation.

Yes, it's right that they must wait for all the workers to finish.

But I didn't mean that (sorry for my confusing words).

IMO we don't need to let *btrfs_wait_ordered_extents()* run as multi-task.

thanks,
liubo
On Fri, 26 Oct 2012 10:05:55 +0800, Liu Bo wrote:
> On 10/26/2012 09:56 AM, Miao Xie wrote:
>>> I can see the potential improvements brought by flushing inodes this way.
>>>
>>> But I don't think it makes much sense to make the waiting process multi-task:
>>> even if we spread the waits for ordered extents across different cpus, they
>>> just occupy the cpu, keep waiting and get scheduled out; I mean, the
>>> bottleneck is whatever we're waiting for.
>> Thanks for your comment. I think only btrfs_run_ordered_operations(root, 0)
>> need not wait for the works; the others must wait.
>>
>> The first reason is to avoid changing the semantics of those three functions.
>> The second reason is that we have to wait for the completion of all the works;
>> if not, the file data in snapshots may differ from the source subvolumes
>> because the flush may not have finished before the snapshot creation.
>
> Yes, it's right that they must wait for all the workers to finish.
>
> But I didn't mean that (sorry for my confusing words).
>
> IMO we don't need to let *btrfs_wait_ordered_extents()* run as multi-task.

It also needs to be done by multiple tasks, because btrfs_wait_ordered_extents()
does not imply that all the dirty pages in the ordered extents have already
been written to disk; that is, it also needs to do a lot of work before waiting
for the BTRFS_ORDERED_COMPLETE event, so the multi-task approach is useful here
too, I think.

Anyway, we need tests to validate it.

Thanks
Miao

> thanks,
> liubo
On 10/26/2012 11:25 AM, Miao Xie wrote:
> On Fri, 26 Oct 2012 10:05:55 +0800, Liu Bo wrote:
>> On 10/26/2012 09:56 AM, Miao Xie wrote:
>>>> I can see the potential improvements brought by flushing inodes this way.
>>>>
>>>> But I don't think it makes much sense to make the waiting process
>>>> multi-task: even if we spread the waits for ordered extents across
>>>> different cpus, they just occupy the cpu, keep waiting and get scheduled
>>>> out; I mean, the bottleneck is whatever we're waiting for.
>>> Thanks for your comment. I think only btrfs_run_ordered_operations(root, 0)
>>> need not wait for the works; the others must wait.
>>>
>>> The first reason is to avoid changing the semantics of those three
>>> functions. The second reason is that we have to wait for the completion of
>>> all the works; if not, the file data in snapshots may differ from the
>>> source subvolumes because the flush may not have finished before the
>>> snapshot creation.
>>
>> Yes, it's right that they must wait for all the workers to finish.
>>
>> But I didn't mean that (sorry for my confusing words).
>>
>> IMO we don't need to let *btrfs_wait_ordered_extents()* run as multi-task.
>
> It also needs to be done by multiple tasks, because btrfs_wait_ordered_extents()
> does not imply that all the dirty pages in the ordered extents have already
> been written to disk; that is, it also needs to do a lot of work before
> waiting for the BTRFS_ORDERED_COMPLETE event, so the multi-task approach is
> useful here too, I think.

Well, I missed the flushing part.

> Anyway, we need tests to validate it.
>
> Thanks
> Miao
>
>> thanks,
>> liubo
Hi, Josef

Please drop this patchset from your btrfs-next tree because it may cause a
performance regression in some cases. I'll improve it later.

Thanks
Miao

On Thu, 25 Oct 2012 17:20:29 +0800, Miao Xie wrote:
> This patchset introduces multi-task delalloc flushing, which makes the delalloc
> flush faster. Besides that, it also fixes the problem that we join the same
> transaction handle more than twice.
>
> Implementation:
> - Create a new worker pool.
> - Queue the inodes with pending delalloc into the work queue of the worker pool
>   when we want to force them to disk, and then wait until all the works we
>   submitted are done.
> - The ordered extents can also be queued into this work queue. The process is
>   similar to the second step.
>
> Miao Xie (3):
>   Btrfs: make delalloc inodes be flushed by multi-task
>   Btrfs: make ordered operations be handled by multi-task
>   Btrfs: make ordered extent be flushed by multi-task
>
>  fs/btrfs/ctree.h        |   14 +++++++
>  fs/btrfs/disk-io.c      |    7 ++++
>  fs/btrfs/inode.c        |   78 ++++++++++++++++++++++++++++++++++++++---
>  fs/btrfs/ordered-data.c |   87 ++++++++++++++++++++++++++++++++++-------------
>  fs/btrfs/ordered-data.h |    7 +++-
>  fs/btrfs/relocation.c   |    6 +++-
>  fs/btrfs/transaction.c  |   24 ++++++++++---
>  7 files changed, 185 insertions(+), 38 deletions(-)