While testing xfstests 068, I realized that commit bd7de2c9a449e26a5493d918618eb20ae60d56bd (Btrfs: fix deadlock with freeze and sync V2) did not fix the bug yet, since someone might jump in between checking running transaction and joining transaction, and we may still run into deadlock between freeze and sync. So IMO the safest and most efficient way is to check running transaction in joining a transaction directly. With this patch, I tested xfstests 068 for 120 times and it did not get into deadlock at least here. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/super.c | 9 +-------- fs/btrfs/transaction.c | 11 ++++++++++- fs/btrfs/transaction.h | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index abb9081..02a3961 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -852,14 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); - spin_lock(&fs_info->trans_lock); - if (!fs_info->running_transaction) { - spin_unlock(&fs_info->trans_lock); - return 0; - } - spin_unlock(&fs_info->trans_lock); - - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_only(root); if (IS_ERR(trans)) return PTR_ERR(trans); return btrfs_commit_transaction(trans, root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c2600..0c17d9e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -272,6 +272,7 @@ enum btrfs_trans_type { TRANS_JOIN, TRANS_USERSPACE, TRANS_JOIN_NOLOCK, + TRANS_JOIN_ONLY, }; static int may_wait_transaction(struct btrfs_root *root, int type) @@ -302,12 +303,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, return ERR_PTR(-EROFS); if (current->journal_info) { - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && + type != TRANS_JOIN_ONLY); h = current->journal_info; h->use_count++; h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; + } else if (type == TRANS_JOIN_ONLY) { + return ERR_PTR(-ENOENT); } /* @@ -405,6 +409,11 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root return start_transaction(root, 0, TRANS_JOIN_NOLOCK); } +struct btrfs_trans_handle *btrfs_join_transaction_only(struct btrfs_root *root) +{ + return start_transaction(root, 0, TRANS_JOIN_ONLY); +} + struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_USERSPACE); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e8b8416..59adf55 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -98,6 +98,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_only(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
In some workloads we have nested joining transaction operations, eg. run_delalloc_nocow btrfs_join_transaction cow_file_range btrfs_join_transaction it can be a serious bug since each trans handler has only two block_rsv, orig_rsv and block_rsv, which means we may lose our first block_rsv after two joining transaction operations: 1) btrfs_start_transaction trans->block_rsv = A 2) btrfs_join_transaction trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A trans->block_rsv = B 3) btrfs_join_transaction trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B trans->block_rsv = C ... This uses a list of block_rsv instead so that we can either a) PUSH the old one into the list and use a new one in joining, or b) POP the old one in ending this transaction. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/transaction.c | 25 +++++++++++++++++++++---- fs/btrfs/transaction.h | 7 ++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0c17d9e..a36ae05 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && type != TRANS_JOIN_ONLY); h = current->journal_info; - h->use_count++; - h->orig_rsv = h->block_rsv; + if (h->block_rsv) { + struct btrfs_trans_rsv_item *item; + item = kmalloc(sizeof(*item), GFP_NOFS); + if (!item) + return ERR_PTR(-ENOMEM); + item->rsv = h->block_rsv; + INIT_LIST_HEAD(&item->list); + list_add(&item->list, &h->blk_rsv_list); + } h->block_rsv = NULL; + h->use_count++; goto got_it; } else if (type == TRANS_JOIN_ONLY) { return ERR_PTR(-ENOENT); @@ -367,11 +375,11 @@ again: h->use_count = 1; h->adding_csums = 0; h->block_rsv = NULL; - h->orig_rsv = NULL; h->aborted = 0; h->qgroup_reserved = qgroup_reserved; h->delayed_ref_elem.seq = 0; INIT_LIST_HEAD(&h->qgroup_ref_list); + INIT_LIST_HEAD(&h->blk_rsv_list); smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -523,7 +531,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int err = 0; if (--trans->use_count) { - trans->block_rsv = trans->orig_rsv; + trans->block_rsv = NULL; + if (!list_empty(&trans->blk_rsv_list)) { + struct btrfs_trans_rsv_item *item; + item = list_entry(trans->blk_rsv_list.next, + struct btrfs_trans_rsv_item, list); + list_del_init(&item->list); + trans->block_rsv = item->rsv; + kfree(item); + } return 0; } @@ -558,6 +574,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } btrfs_trans_release_metadata(trans, root); + BUG_ON(!list_empty(&trans->blk_rsv_list)); trans->block_rsv = NULL; sb_end_intwrite(root->fs_info->sb); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 59adf55..7fa11b7 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -57,7 +57,6 @@ struct btrfs_trans_handle { unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *orig_rsv; int aborted; int adding_csums; /* @@ -68,6 +67,12 @@ struct btrfs_trans_handle { struct btrfs_root *root; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; + struct list_head blk_rsv_list; +}; + +struct btrfs_trans_rsv_item { + struct btrfs_block_rsv *rsv; + struct list_head list; }; struct btrfs_pending_snapshot { -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Liu Bo
2012-Sep-14 08:58 UTC
[PATCH 3/5] Btrfs: cleanup for duplicated code in find_free_extent
There is already an ''add free space'' phrase in front of this one, we needn''t to redo it. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/extent-tree.c | 4 ---- 1 files changed, 0 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ba58024..6575f89 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5806,10 +5806,6 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, - search_start - offset); - BUG_ON(offset > search_start); if (used_block_group != block_group) btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
fs_info->hashers is now an obsolete one. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - 2 files changed, 0 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d195b5..ab7cd13 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1240,7 +1240,6 @@ struct btrfs_fs_info { struct mutex reloc_mutex; struct list_head trans_list; - struct list_head hashers; struct list_head dead_roots; struct list_head caching_block_groups; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 22e98e0..fe39362 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1994,7 +1994,6 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); - INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Liu Bo
2012-Sep-14 08:58 UTC
[PATCH 5/5] Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents
nocow_only is now an obsolete argument. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/ordered-data.c | 12 +----------- fs/btrfs/ordered-data.h | 3 +-- fs/btrfs/relocation.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 2 +- 6 files changed, 7 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6575f89..2d2aa04 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3693,7 +3693,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (delalloc_bytes == 0) { if (trans) return; - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); return; } @@ -3715,7 +3715,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); } else { time_left = schedule_timeout_killable(1); if (time_left) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 643335a..7efedee 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -466,8 +466,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput) +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -482,15 +481,6 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, cur = splice.next; ordered = list_entry(cur, struct btrfs_ordered_extent, root_extent_list); - if (nocow_only && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { - list_move(&ordered->root_extent_list, - &root->fs_info->ordered_extents); - cond_resched_lock(&root->fs_info->ordered_extent_lock); - continue; - } - list_del_init(&ordered->root_extent_list); atomic_inc(&ordered->refs); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e03c560..fd36800 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -183,6 +183,5 @@ void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); -void btrfs_wait_ordered_extents(struct btrfs_root *root, - int nocow_only, int delay_iput); +void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4da0865..cf56929 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4057,7 +4057,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->flags); btrfs_start_delalloc_inodes(fs_info->tree_root, 0); - btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 02a3961..f4b4368 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -850,7 +850,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 0, 0); + btrfs_wait_ordered_extents(root, 0); trans = btrfs_join_transaction_only(root); if (IS_ERR(trans)) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a36ae05..cb791d4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1429,7 +1429,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); - btrfs_wait_ordered_extents(root, 0, 1); + btrfs_wait_ordered_extents(root, 1); } ret = btrfs_run_delayed_items(trans, root); -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On fri, 14 Sep 2012 16:58:03 +0800, Liu Bo wrote:> While testing xfstests 068, I realized that > > commit bd7de2c9a449e26a5493d918618eb20ae60d56bd > (Btrfs: fix deadlock with freeze and sync V2) > > did not fix the bug yet, since someone might jump in between checking > running transaction and joining transaction, and we may still run into > deadlock between freeze and sync.Did you meet the problem by test? I think it is impossible to happen, because nobody can start a new transaction after the filesystem is froze, so the ->running_transaction check must be false when syncing the filesystem. And beside that this patch is wrong(Please see below).> So IMO the safest and most efficient way is to check running transaction > in joining a transaction directly. > > With this patch, I tested xfstests 068 for 120 times and it did not get > into deadlock at least here. > > Signed-off-by: Liu Bo <bo.li.liu@oracle.com> > --- > fs/btrfs/super.c | 9 +-------- > fs/btrfs/transaction.c | 11 ++++++++++- > fs/btrfs/transaction.h | 1 + > 3 files changed, 12 insertions(+), 9 deletions(-) > > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c > index abb9081..02a3961 100644 > --- a/fs/btrfs/super.c > +++ b/fs/btrfs/super.c > @@ -852,14 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) > > btrfs_wait_ordered_extents(root, 0, 0); > > - spin_lock(&fs_info->trans_lock); > - if (!fs_info->running_transaction) { > - spin_unlock(&fs_info->trans_lock); > - return 0; > - } > - spin_unlock(&fs_info->trans_lock); > - > - trans = btrfs_join_transaction(root); > + trans = btrfs_join_transaction_only(root); > if (IS_ERR(trans)) > return PTR_ERR(trans); > return btrfs_commit_transaction(trans, root); > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index 27c2600..0c17d9e 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -272,6 +272,7 @@ enum btrfs_trans_type { > TRANS_JOIN, > TRANS_USERSPACE, > TRANS_JOIN_NOLOCK, > + TRANS_JOIN_ONLY, > }; > > static int may_wait_transaction(struct btrfs_root *root, int type) > @@ -302,12 +303,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, > return ERR_PTR(-EROFS); > > if (current->journal_info) { > - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); > + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && > + type != TRANS_JOIN_ONLY); > h = current->journal_info; > h->use_count++; > h->orig_rsv = h->block_rsv; > h->block_rsv = NULL; > goto got_it; > + } else if (type == TRANS_JOIN_ONLY) { > + return ERR_PTR(-ENOENT); > }the code here is wrong, it makes the sync task skip the transaction commit because ->journal_info of the sync task is always NULL(Only ->journal_info of the task which starts transaction before it end the current transaction is !0). Thanks Miao -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote:> --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, > WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && > type != TRANS_JOIN_ONLY); > h = current->journal_info; > - h->use_count++; > - h->orig_rsv = h->block_rsv; > + if (h->block_rsv) { > + struct btrfs_trans_rsv_item *item; > + item = kmalloc(sizeof(*item), GFP_NOFS);I''d rather avoid the kmalloc here and add a list hook into btrfs_block_rsv itself (used only for this purpose). It also does not increase the failure surface and we don''t have to handle error conditions from deep callchains.> + if (!item) > + return ERR_PTR(-ENOMEM); > + item->rsv = h->block_rsv; > + INIT_LIST_HEAD(&item->list); > + list_add(&item->list, &h->blk_rsv_list); > + } > h->block_rsv = NULL; > + h->use_count++; > goto got_it; > } else if (type == TRANS_JOIN_ONLY) { > return ERR_PTR(-ENOENT); > --- a/fs/btrfs/transaction.h > +++ b/fs/btrfs/transaction.h > @@ -57,7 +57,6 @@ struct btrfs_trans_handle { > unsigned long delayed_ref_updates; > struct btrfs_transaction *transaction; > struct btrfs_block_rsv *block_rsv; > - struct btrfs_block_rsv *orig_rsv; > int aborted; > int adding_csums; > /* > @@ -68,6 +67,12 @@ struct btrfs_trans_handle { > struct btrfs_root *root; > struct seq_list delayed_ref_elem; > struct list_head qgroup_ref_list; > + struct list_head blk_rsv_list;Does it refer to chain of orig_rsv''s ? Ie. naming it orig_blk_rsv_list> +}; > + > +struct btrfs_trans_rsv_item { > + struct btrfs_block_rsv *rsv; > + struct list_head list;Generally, for such ''list of single pointers'' structs I''d evaluate the possibility of embedding the hook inside the struct, the overhead (memory, processing) is not desirable.> }; > > struct btrfs_pending_snapshot {-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 04:58:06PM +0800, Liu Bo wrote:> fs_info->hashers is now an obsolete one. > > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>Reviewed-by: David Sterba <dsterba@suse.cz> -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 07:15 PM, David Sterba wrote:> On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: >> --- a/fs/btrfs/transaction.c >> +++ b/fs/btrfs/transaction.c >> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, >> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && >> type != TRANS_JOIN_ONLY); >> h = current->journal_info; >> - h->use_count++; >> - h->orig_rsv = h->block_rsv; >> + if (h->block_rsv) { >> + struct btrfs_trans_rsv_item *item; >> + item = kmalloc(sizeof(*item), GFP_NOFS); > > I''d rather avoid the kmalloc here and add a list hook into > btrfs_block_rsv itself (used only for this purpose). > > It also does not increase the failure surface and we don''t have to > handle error conditions from deep callchains. >Actually I placed a list hook at first, but I found the same block_rsv could be inserted into the list chain twice, which will cause list_head''s terrible warnings.>> + if (!item) >> + return ERR_PTR(-ENOMEM); >> + item->rsv = h->block_rsv; >> + INIT_LIST_HEAD(&item->list); >> + list_add(&item->list, &h->blk_rsv_list); >> + } >> h->block_rsv = NULL; >> + h->use_count++; >> goto got_it; >> } else if (type == TRANS_JOIN_ONLY) { >> return ERR_PTR(-ENOENT); >> --- a/fs/btrfs/transaction.h >> +++ b/fs/btrfs/transaction.h >> @@ -57,7 +57,6 @@ struct btrfs_trans_handle { >> unsigned long delayed_ref_updates; >> struct btrfs_transaction *transaction; >> struct btrfs_block_rsv *block_rsv; >> - struct btrfs_block_rsv *orig_rsv; >> int aborted; >> int adding_csums; >> /* >> @@ -68,6 +67,12 @@ struct btrfs_trans_handle { >> struct btrfs_root *root; >> struct seq_list delayed_ref_elem; >> struct list_head qgroup_ref_list; >> + struct list_head blk_rsv_list; > > Does it refer to chain of orig_rsv''s ? Ie. naming it orig_blk_rsv_list >Make sense.>> +}; >> + >> +struct btrfs_trans_rsv_item { >> + struct btrfs_block_rsv *rsv; >> + struct list_head list; > > Generally, for such ''list of single pointers'' structs I''d evaluate the > possibility of embedding the hook inside the struct, the overhead > (memory, processing) is not desirable. >See the above, plz. thanks, liubo>> }; >> >> struct btrfs_pending_snapshot {-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 06:07 PM, Miao Xie wrote:> On fri, 14 Sep 2012 16:58:03 +0800, Liu Bo wrote: >> While testing xfstests 068, I realized that >> >> commit bd7de2c9a449e26a5493d918618eb20ae60d56bd >> (Btrfs: fix deadlock with freeze and sync V2) >> >> did not fix the bug yet, since someone might jump in between checking >> running transaction and joining transaction, and we may still run into >> deadlock between freeze and sync. > > Did you meet the problem by test? > I think it is impossible to happen, because nobody can start a new transaction > after the filesystem is froze, so the ->running_transaction check must be false > when syncing the filesystem. And beside that this patch is wrong(Please see below). >Yes, I knew the reason but I did run into the same deadlock.>> So IMO the safest and most efficient way is to check running transaction >> in joining a transaction directly. >> >> With this patch, I tested xfstests 068 for 120 times and it did not get >> into deadlock at least here. >> >> Signed-off-by: Liu Bo <bo.li.liu@oracle.com> >> --- >> fs/btrfs/super.c | 9 +-------- >> fs/btrfs/transaction.c | 11 ++++++++++- >> fs/btrfs/transaction.h | 1 + >> 3 files changed, 12 insertions(+), 9 deletions(-) >> >> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c >> index abb9081..02a3961 100644 >> --- a/fs/btrfs/super.c >> +++ b/fs/btrfs/super.c >> @@ -852,14 +852,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) >> >> btrfs_wait_ordered_extents(root, 0, 0); >> >> - spin_lock(&fs_info->trans_lock); >> - if (!fs_info->running_transaction) { >> - spin_unlock(&fs_info->trans_lock); >> - return 0; >> - } >> - spin_unlock(&fs_info->trans_lock); >> - >> - trans = btrfs_join_transaction(root); >> + trans = btrfs_join_transaction_only(root); >> if (IS_ERR(trans)) >> return PTR_ERR(trans); >> return btrfs_commit_transaction(trans, root); >> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c >> index 27c2600..0c17d9e 100644 >> --- a/fs/btrfs/transaction.c >> +++ b/fs/btrfs/transaction.c >> @@ -272,6 +272,7 @@ enum btrfs_trans_type { >> TRANS_JOIN, >> TRANS_USERSPACE, >> TRANS_JOIN_NOLOCK, >> + TRANS_JOIN_ONLY, >> }; >> >> static int may_wait_transaction(struct btrfs_root *root, int type) >> @@ -302,12 +303,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, >> return ERR_PTR(-EROFS); >> >> if (current->journal_info) { >> - WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); >> + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && >> + type != TRANS_JOIN_ONLY); >> h = current->journal_info; >> h->use_count++; >> h->orig_rsv = h->block_rsv; >> h->block_rsv = NULL; >> goto got_it; >> + } else if (type == TRANS_JOIN_ONLY) { >> + return ERR_PTR(-ENOENT); >> } > > the code here is wrong, it makes the sync task skip the transaction commit because > ->journal_info of the sync task is always NULL(Only ->journal_info of the task which > starts transaction before it end the current transaction is !0). >I''ve double checked this and realized what you''re saying is reasonable. So there must be something wrong elsewhere, I''ll look into it :) Thanks a LOT! - liubo -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
David Sterba
2012-Sep-14 11:36 UTC
Re: [PATCH 3/5] Btrfs: cleanup for duplicated code in find_free_extent
On Fri, Sep 14, 2012 at 04:58:05PM +0800, Liu Bo wrote:> There is already an ''add free space'' phrase in front of this one, we > needn''t to redo it. > > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>Yes, the check is duplicate and the values cannot change inbetween, moreover this calls the add_free_space twice, to it''s likely to fix things (or not, the whole function is a mess and I''m not sticking my reviewed-by tag). Also, the return value of btrfs_add_free_space should be checked, as it may silently fail in case of ENOMEM, and fail loudly if there''s failure in: btrfs_add_free_space __btrfs_add_free_space - either link_free_space - or insert_into_bitmap fail with -EEXIST BUG_ON I saw the crash inside insert_into_bitmap in logs from yesterday, I haven''t processed them yet, but it looks that it matches this situation.> --- > fs/btrfs/extent-tree.c | 4 ---- > 1 files changed, 0 insertions(+), 4 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index ba58024..6575f89 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -5806,10 +5806,6 @@ checks: > > trace_btrfs_reserve_extent(orig_root, block_group, > search_start, num_bytes); > - if (offset < search_start) > - btrfs_add_free_space(used_block_group, offset, > - search_start - offset); > - BUG_ON(offset > search_start); > if (used_block_group != block_group) > btrfs_put_block_group(used_block_group); > btrfs_put_block_group(block_group); > -- > 1.7.7.6 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote:> On 09/14/2012 07:15 PM, David Sterba wrote: > > On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: > >> --- a/fs/btrfs/transaction.c > >> +++ b/fs/btrfs/transaction.c > >> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, > >> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && > >> type != TRANS_JOIN_ONLY); > >> h = current->journal_info; > >> - h->use_count++; > >> - h->orig_rsv = h->block_rsv; > >> + if (h->block_rsv) { > >> + struct btrfs_trans_rsv_item *item; > >> + item = kmalloc(sizeof(*item), GFP_NOFS); > > > > I''d rather avoid the kmalloc here and add a list hook into > > btrfs_block_rsv itself (used only for this purpose). > > > > It also does not increase the failure surface and we don''t have to > > handle error conditions from deep callchains. > > > > Actually I placed a list hook at first, but I found the same block_rsv could be inserted into > the list chain twice, which will cause list_head''s terrible warnings.Is it expected and correct to add the rsv twice? I know the transaction joins and commits are sometimes wildly nested so it does not mean that''s necessarily a bug. Then it is not possible to embed the list hook, we need to keep the full track of the blk_rsv stack. I''m counting two other similar structs used inside btrfs (list_head + 8B value), we could make a shared separate slab for them. struct delayed_iput { struct list_head list; /* 0 16 */ struct inode * inode; /* 16 8 */ /* size: 24, cachelines: 1, members: 2 */ /* last cacheline: 24 bytes */ }; truct seq_list { struct list_head list; /* 0 16 */ u64 seq; /* 16 8 */ /* size: 24, cachelines: 1, members: 2 */ /* last cacheline: 24 bytes */ }; seq_list is never allocated, only embedded inside tree_mod_log structures. delayed_iput is allocated on every add_delayed_iput and freed quite frequently (once per-commit at least), so this is a short-lived object as well as the proposed btrfs_trans_rsv_item. IMO this justifies using the slab. Does this sound good to you? david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 08:07 PM, David Sterba wrote:> On Fri, Sep 14, 2012 at 07:25:45PM +0800, Liu Bo wrote: >> On 09/14/2012 07:15 PM, David Sterba wrote: >>> On Fri, Sep 14, 2012 at 04:58:04PM +0800, Liu Bo wrote: >>>> --- a/fs/btrfs/transaction.c >>>> +++ b/fs/btrfs/transaction.c >>>> @@ -306,9 +306,17 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, >>>> WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK && >>>> type != TRANS_JOIN_ONLY); >>>> h = current->journal_info; >>>> - h->use_count++; >>>> - h->orig_rsv = h->block_rsv; >>>> + if (h->block_rsv) { >>>> + struct btrfs_trans_rsv_item *item; >>>> + item = kmalloc(sizeof(*item), GFP_NOFS); >>> >>> I''d rather avoid the kmalloc here and add a list hook into >>> btrfs_block_rsv itself (used only for this purpose). >>> >>> It also does not increase the failure surface and we don''t have to >>> handle error conditions from deep callchains. >>> >> >> Actually I placed a list hook at first, but I found the same block_rsv could be inserted into >> the list chain twice, which will cause list_head''s terrible warnings. > > Is it expected and correct to add the rsv twice? I know the transaction > joins and commits are sometimes wildly nested so it does not mean that''s > necessarily a bug. Then it is not possible to embed the list hook, we > need to keep the full track of the blk_rsv stack. > > I''m counting two other similar structs used inside btrfs > (list_head + 8B value), we could make a shared separate slab for them. > > struct delayed_iput { > struct list_head list; /* 0 16 */ > struct inode * inode; /* 16 8 */ > > /* size: 24, cachelines: 1, members: 2 */ > /* last cacheline: 24 bytes */ > }; > > truct seq_list { > struct list_head list; /* 0 16 */ > u64 seq; /* 16 8 */ > > /* size: 24, cachelines: 1, members: 2 */ > /* last cacheline: 24 bytes */ > }; > > seq_list is never allocated, only embedded inside tree_mod_log structures. > > delayed_iput is allocated on every add_delayed_iput and freed quite > frequently (once per-commit at least), so this is a short-lived object > as well as the proposed btrfs_trans_rsv_item. IMO this justifies using > the slab. Does this sound good to you? >Good, a quick thinking says it will work ;) btw, do you have any test suit to measure this kind of cacheline efficiency so that we can get some numbers to see if it really helps? - liubo> > david >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote:> In some workloads we have nested joining transaction operations, > eg. > run_delalloc_nocow > btrfs_join_transaction > cow_file_range > btrfs_join_transaction > > it can be a serious bug since each trans handler has only two > block_rsv, orig_rsv and block_rsv, which means we may lose our > first block_rsv after two joining transaction operations: > > 1) btrfs_start_transaction > trans->block_rsv = A > > 2) btrfs_join_transaction > trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A > trans->block_rsv = B > > 3) btrfs_join_transaction > trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B > trans->block_rsv = C > ... >I''d like to see the actual stack trace where this happens, because I don''t think it can happen. And if it is we need to look at that specific case and adjust it as necessary and not add a bunch of kmallocs just to track the block_rsv, because frankly it''s not that big of a deal, it was just put into place in case somebody wasn''t expecting a call they made to start another transaction and reset the block_rsv, which I don''t actually think happens anywhere. So NAK on this patch, give me more information so I can figure out the right way to deal with this. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2012-Sep-14 12:42 UTC
Re: [PATCH 1/5] Btrfs: fix deadlock with freeze and sync
On Fri, Sep 14, 2012 at 02:58:03AM -0600, Liu Bo wrote:> While testing xfstests 068, I realized that > > commit bd7de2c9a449e26a5493d918618eb20ae60d56bd > (Btrfs: fix deadlock with freeze and sync V2) > > did not fix the bug yet, since someone might jump in between checking > running transaction and joining transaction, and we may still run into > deadlock between freeze and sync. > > So IMO the safest and most efficient way is to check running transaction > in joining a transaction directly. > > With this patch, I tested xfstests 068 for 120 times and it did not get > into deadlock at least here. >This doesn''t do anything since sync() won''t have a transaction already started so this will always just return and we won''t do a commit, NAK. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2012-Sep-14 12:45 UTC
Re: [PATCH 5/5] Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents
On Fri, Sep 14, 2012 at 02:58:07AM -0600, Liu Bo wrote:> nocow_only is now an obsolete argument. >Why is it obsolete, it looks like it''s being used to me? Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Liu Bo
2012-Sep-14 12:55 UTC
Re: [PATCH 5/5] Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents
On 09/14/2012 08:45 PM, Josef Bacik wrote:> On Fri, Sep 14, 2012 at 02:58:07AM -0600, Liu Bo wrote: >> nocow_only is now an obsolete argument. >> > > Why is it obsolete, it looks like it''s being used to me? Thanks, >All of its callers use 0 as nocow_only now. thanks, liubo> Josef >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2012-Sep-14 13:01 UTC
Re: [PATCH 5/5] Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents
On Fri, Sep 14, 2012 at 06:55:23AM -0600, Liu Bo wrote:> On 09/14/2012 08:45 PM, Josef Bacik wrote: > > On Fri, Sep 14, 2012 at 02:58:07AM -0600, Liu Bo wrote: > >> nocow_only is now an obsolete argument. > >> > > > > Why is it obsolete, it looks like it''s being used to me? Thanks, > > > > All of its callers use 0 as nocow_only now. >Ooooh I saw that wrong, thought the last argument was nocowonly. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 08:41 PM, Josef Bacik wrote:> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: >> In some workloads we have nested joining transaction operations, >> eg. >> run_delalloc_nocow >> btrfs_join_transaction >> cow_file_range >> btrfs_join_transaction >> >> it can be a serious bug since each trans handler has only two >> block_rsv, orig_rsv and block_rsv, which means we may lose our >> first block_rsv after two joining transaction operations: >> >> 1) btrfs_start_transaction >> trans->block_rsv = A >> >> 2) btrfs_join_transaction >> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A >> trans->block_rsv = B >> >> 3) btrfs_join_transaction >> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B >> trans->block_rsv = C >> ... >> > > I''d like to see the actual stack trace where this happens, because I don''t think > it can happen. And if it is we need to look at that specific case and adjust it > as necessary and not add a bunch of kmallocs just to track the block_rsv, > because frankly it''s not that big of a deal, it was just put into place in case > somebody wasn''t expecting a call they made to start another transaction and > reset the block_rsv, which I don''t actually think happens anywhere. So NAK on > this patch, give me more information so I can figure out the right way to deal > with this. Thanks, >Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or btrfs_insert_delayed_dir_index. What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space. thanks, liubo> Josef > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 08:42 PM, Josef Bacik wrote:> On Fri, Sep 14, 2012 at 02:58:03AM -0600, Liu Bo wrote: >> While testing xfstests 068, I realized that >> >> commit bd7de2c9a449e26a5493d918618eb20ae60d56bd >> (Btrfs: fix deadlock with freeze and sync V2) >> >> did not fix the bug yet, since someone might jump in between checking >> running transaction and joining transaction, and we may still run into >> deadlock between freeze and sync. >> >> So IMO the safest and most efficient way is to check running transaction >> in joining a transaction directly. >> >> With this patch, I tested xfstests 068 for 120 times and it did not get >> into deadlock at least here. >> > > This doesn''t do anything since sync() won''t have a transaction already started > so this will always just return and we won''t do a commit, NAK. Thanks, >Yeah, I''m digging this one, there is something wrong elsewhere, and Miao has pointed my fault :) thanks, liubo> Josef >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/14/2012 09:01 PM, Liu Bo wrote:> On 09/14/2012 08:41 PM, Josef Bacik wrote: >> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: >>> In some workloads we have nested joining transaction operations, >>> eg. >>> run_delalloc_nocow >>> btrfs_join_transaction >>> cow_file_range >>> btrfs_join_transaction >>> >>> it can be a serious bug since each trans handler has only two >>> block_rsv, orig_rsv and block_rsv, which means we may lose our >>> first block_rsv after two joining transaction operations: >>> >>> 1) btrfs_start_transaction >>> trans->block_rsv = A >>> >>> 2) btrfs_join_transaction >>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A >>> trans->block_rsv = B >>> >>> 3) btrfs_join_transaction >>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B >>> trans->block_rsv = C >>> ... >>> >> >> I''d like to see the actual stack trace where this happens, because I don''t think >> it can happen. And if it is we need to look at that specific case and adjust it >> as necessary and not add a bunch of kmallocs just to track the block_rsv, >> because frankly it''s not that big of a deal, it was just put into place in case >> somebody wasn''t expecting a call they made to start another transaction and >> reset the block_rsv, which I don''t actually think happens anywhere. So NAK on >> this patch, give me more information so I can figure out the right way to deal >> with this. Thanks, >> > > Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or > btrfs_insert_delayed_dir_index. > > What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space. >and trans->use_count has been 3. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
David Sterba
2012-Sep-14 13:09 UTC
Re: [PATCH 5/5] Btrfs: kill obsolete arguments in btrfs_wait_ordered_extents
On Fri, Sep 14, 2012 at 08:45:16AM -0400, Josef Bacik wrote:> On Fri, Sep 14, 2012 at 02:58:07AM -0600, Liu Bo wrote: > > nocow_only is now an obsolete argument. > > > > Why is it obsolete, it looks like it''s being used to me? Thanks,It''s always 0 so the removed code never executes. I''m tracking where it started, pretty long ago: commit 7ea394f1192bee1af67ea4762c88ef4b7b0487a8 Author: Yan Zheng <zheng.yan@oracle.com> Date: Tue Aug 5 13:05:02 2008 -0400 without any explanations what it fixes, so I''d be rather cautious here. I found this hunk from commit 24bbcf0442ee04660a5a030efdbb6d03f1c275cb Author: Yan, Zheng <zheng.yan@oracle.com> Date: Thu Nov 12 09:36:34 2009 +0000 Subject: Btrfs: Add delayed iput does something suspicious: @@ -991,11 +994,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit) { - btrfs_start_delalloc_inodes(root); - ret = btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } --- The purpose was to add the dealy_iput argument to the end of the list, but why does it change the nocow from 1 -> 0 in the if(snap_pending) branch? The code was last touched by Sage in commit commit 0bdb1db297ab36865a63ee722d35ff0a1f0ae522 Author: Sage Weil <sage@newdream.net> Date: Fri Feb 19 14:13:50 2010 -0800 Subject: Btrfs: flush data on snapshot creation Flush any delalloc extents when we create a snapshot, so that recently written file data is always included in the snapshot. A later commit will add the ability to snapshot without the flush, but most people expect flushing. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2a36e23..2d654c1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit) { + if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); - } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 0, 1); - BUG_ON(ret); } --- the calls to btrfs_wait_ordered_extents were same and thus mergeable. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Sep 14, 2012 at 09:01:36PM +0800, Liu Bo wrote:> Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or > btrfs_insert_delayed_dir_index.068 is the freezer test, we''ve seen enough issues in the freezing area recently, I''m curious if you were able to reproduce with a non-freezing tests. It''s quite possible that this may be a race condition and the freezer just supplies the right delays and widens the race window. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2012-Sep-14 14:41 UTC
Re: [PATCH 1/5] Btrfs: fix deadlock with freeze and sync
On Fri, Sep 14, 2012 at 02:58:03AM -0600, Liu Bo wrote:> While testing xfstests 068, I realized that > > commit bd7de2c9a449e26a5493d918618eb20ae60d56bd > (Btrfs: fix deadlock with freeze and sync V2) > > did not fix the bug yet, since someone might jump in between checking > running transaction and joining transaction, and we may still run into > deadlock between freeze and sync. > > So IMO the safest and most efficient way is to check running transaction > in joining a transaction directly. > > With this patch, I tested xfstests 068 for 120 times and it did not get > into deadlock at least here. >Ok I''ve come up with a different way to fix this one and I''m still trying to reproduce the problem your 2nd patch talks about, but I''ve taken the other 3 and pushed them to btrfs-next. Thanks! Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On fri, 14 Sep 2012 21:05:01 +0800, Liu Bo wrote:> On 09/14/2012 09:01 PM, Liu Bo wrote: >> On 09/14/2012 08:41 PM, Josef Bacik wrote: >>> On Fri, Sep 14, 2012 at 02:58:04AM -0600, Liu Bo wrote: >>>> In some workloads we have nested joining transaction operations, >>>> eg. >>>> run_delalloc_nocow >>>> btrfs_join_transaction >>>> cow_file_range >>>> btrfs_join_transaction >>>> >>>> it can be a serious bug since each trans handler has only two >>>> block_rsv, orig_rsv and block_rsv, which means we may lose our >>>> first block_rsv after two joining transaction operations: >>>> >>>> 1) btrfs_start_transaction >>>> trans->block_rsv = A >>>> >>>> 2) btrfs_join_transaction >>>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now A >>>> trans->block_rsv = B >>>> >>>> 3) btrfs_join_transaction >>>> trans->orig_rsv = trans->block_rsv; ---> orig_rsv is now B >>>> trans->block_rsv = C >>>> ... >>>> >>> >>> I''d like to see the actual stack trace where this happens, because I don''t think >>> it can happen. And if it is we need to look at that specific case and adjust it >>> as necessary and not add a bunch of kmallocs just to track the block_rsv, >>> because frankly it''s not that big of a deal, it was just put into place in case >>> somebody wasn''t expecting a call they made to start another transaction and >>> reset the block_rsv, which I don''t actually think happens anywhere. So NAK on >>> this patch, give me more information so I can figure out the right way to deal >>> with this. Thanks, >>> >> >> Fine, please run xfstests 068 till it hits a BUG_ON inside either btrfs_delete_delayed_dir_index or >> btrfs_insert_delayed_dir_index. >> >> What I saw is that the orig_rsv and block_rsv is both delalloc_block_rsv, which is already lack of space. >> > > and trans->use_count has been 3.Hi, Liu Do you still look into this problem? I think the following patch can help you. This patch was made to improve btrfs_run_ordered_operations(), I found it can fix the problem that you pointed out in this mail. Thanks Miao Subject: [PATCH] Btrfs: make ordered operations be handled by multi-thread Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 7 ++++ fs/btrfs/ordered-data.c | 88 ++++++++++++++++++++++++++++++++++++++-------- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/transaction.c | 18 +++++++-- 5 files changed, 95 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dbb461f..fd7ed9f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1328,6 +1328,7 @@ struct btrfs_fs_info { */ struct btrfs_workers fixup_workers; struct btrfs_workers delayed_workers; + struct btrfs_workers ordered_operation_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fb7069..e49665f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2317,6 +2317,10 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->readahead_workers, "readahead", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->ordered_operation_workers, + "ordered-operations", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -2346,6 +2350,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); + ret |= btrfs_start_workers(&fs_info->ordered_operation_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2649,6 +2654,7 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->ordered_operation_workers); btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); @@ -3256,6 +3262,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->ordered_operation_workers); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4ae1014..a4b1316 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -26,6 +26,7 @@ #include "extent_io.h" static struct kmem_cache *btrfs_ordered_extent_cache; +static struct kmem_cache *btrfs_ordered_operation_work_cache; static u64 entry_end(struct btrfs_ordered_extent *entry) { @@ -519,6 +520,28 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, spin_unlock(&root->fs_info->ordered_extent_lock); } +struct btrfs_ordered_operation_work { + struct inode *inode; + struct completion completion; + int wait; + struct list_head list; + struct btrfs_work work; +}; + +static void btrfs_run_ordered_operation(struct btrfs_work *work) +{ + struct btrfs_ordered_operation_work *ordered_work; + + ordered_work = container_of(work, struct btrfs_ordered_operation_work, + work); + if (ordered_work->wait) + btrfs_wait_ordered_range(ordered_work->inode, 0, (u64)-1); + else + filemap_flush(ordered_work->inode->i_mapping); + btrfs_add_delayed_iput(ordered_work->inode); + complete(&ordered_work->completion); +} + /* * this is used during transaction commit to write all the inodes * added to the ordered operation list. These files must be fully on @@ -529,13 +552,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, * extra check to make sure the ordered operation list really is empty * before we return */ -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait) +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) { struct btrfs_inode *btrfs_inode; struct inode *inode; struct list_head splice; + struct list_head works; + struct btrfs_ordered_operation_work *work, *next; + int ret = 0; INIT_LIST_HEAD(&splice); + INIT_LIST_HEAD(&works); mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); @@ -543,6 +570,7 @@ again: list_splice_init(&root->fs_info->ordered_operations, &splice); while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, ordered_operations); @@ -559,16 +587,34 @@ again: list_add_tail(&BTRFS_I(inode)->ordered_operations, &root->fs_info->ordered_operations); } + + if (!inode) + continue; spin_unlock(&root->fs_info->ordered_extent_lock); - if (inode) { - if (wait) - btrfs_wait_ordered_range(inode, 0, (u64)-1); - else - filemap_flush(inode->i_mapping); - btrfs_add_delayed_iput(inode); + work = kmem_cache_zalloc(btrfs_ordered_operation_work_cache, + GFP_NOFS); + if (!work) { + if (list_empty(&BTRFS_I(inode)->ordered_operations)) + list_add_tail(&btrfs_inode->ordered_operations, + &splice); + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_tail(&splice, + &root->fs_info->ordered_operations); + spin_unlock(&root->fs_info->ordered_extent_lock); + ret = -ENOMEM; + goto out; } + init_completion(&work->completion); + INIT_LIST_HEAD(&work->list); + work->inode = inode; + work->wait = wait; + work->work.func = btrfs_run_ordered_operation; + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->ordered_operation_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->ordered_extent_lock); } @@ -576,7 +622,14 @@ again: goto again; spin_unlock(&root->fs_info->ordered_extent_lock); +out: + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + wait_for_completion(&work->completion); + kmem_cache_free(btrfs_ordered_operation_work_cache, work); + } mutex_unlock(&root->fs_info->ordered_operations_mutex); + return ret; } /* @@ -944,15 +997,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, if (last_mod < root->fs_info->last_trans_committed) return; - /* - * the transaction is already committing. Just start the IO and - * don''t bother with all of this list nonsense - */ - if (trans && root->fs_info->running_transaction->blocked) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - return; - } - spin_lock(&root->fs_info->ordered_extent_lock); if (list_empty(&BTRFS_I(inode)->ordered_operations)) { list_add_tail(&BTRFS_I(inode)->ordered_operations, @@ -969,6 +1013,16 @@ int __init ordered_data_init(void) NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; + + btrfs_ordered_operation_work_cache = kmem_cache_create( + "btrfs_ordered_operation_work", + sizeof(struct btrfs_ordered_operation_work), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_ordered_operation_work_cache) { + kmem_cache_destroy(btrfs_ordered_extent_cache); + return -ENOMEM; + } return 0; } @@ -976,4 +1030,6 @@ void ordered_data_exit(void) { if (btrfs_ordered_extent_cache) kmem_cache_destroy(btrfs_ordered_extent_cache); + if (btrfs_ordered_operation_work_cache) + kmem_cache_destroy(btrfs_ordered_operation_work_cache); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d1ddaef..82f9f67 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -186,7 +186,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -void btrfs_run_ordered_operations(struct btrfs_root *root, int wait); +int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 115f054..1538b50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1362,15 +1362,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); - int ret = -EIO; + int ret; int should_grow = 0; unsigned long now = get_seconds(); int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); - btrfs_run_ordered_operations(root, 0); + ret = btrfs_run_ordered_operations(root, 0); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } - if (cur_trans->aborted) + if (cur_trans->aborted) { + ret = cur_trans->aborted; goto cleanup_transaction; + } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here @@ -1466,7 +1472,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * it here and no for sure that nothing new will be added * to the list */ - btrfs_run_ordered_operations(root, 1); + ret = btrfs_run_ordered_operations(root, 1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto cleanup_transaction; + } prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); -- 1.7.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html