Each time pick one dead root from the list and let the caller know if it''s needed to continue. This should improve responsiveness during umount and balance which at some point waits for cleaning all currently queued dead roots. A new dead root is added to the end of the list, so the snapshots disappear in the order of deletion. The snapshot cleaning work is now done only from the cleaner thread and the others wake it if needed. Signed-off-by: David Sterba <dsterba@suse.cz> --- v1,v2: * http://thread.gmane.org/gmane.comp.file-systems.btrfs/23212 v2->v3: * remove run_again from btrfs_clean_one_deleted_snapshot and return 1 unconditionally fs/btrfs/disk-io.c | 10 ++++++-- fs/btrfs/extent-tree.c | 8 ++++++ fs/btrfs/relocation.c | 3 -- fs/btrfs/transaction.c | 56 +++++++++++++++++++++++++++++++---------------- fs/btrfs/transaction.h | 2 +- 5 files changed, 53 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 988b860..4de2351 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { + int again = 0; + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + down_read_trylock(&root->fs_info->sb->s_umount) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); btrfs_run_defrag_inodes(root->fs_info); + up_read(&root->fs_info->sb->s_umount); } - if (!try_to_freeze()) { + if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -3403,8 +3407,8 @@ int btrfs_commit_super(struct btrfs_root *root) mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + wake_up_process(root->fs_info->cleaner_kthread); /* wait until ongoing cleanup work done 
*/ down_write(&root->fs_info->cleanup_work_sem); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 742b7a7..a08d0fe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7263,6 +7263,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { + pr_debug("btrfs: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_end_trans; + } + ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { err = ret; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8445000..50deb9ed 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4148,10 +4148,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) while (1) { mutex_lock(&fs_info->cleaner_mutex); - - btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a0467eb..a2781c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -950,7 +950,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add(&root->root_list, &root->fs_info->dead_roots); + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -1876,31 +1876,49 @@ cleanup_transaction: } /* - * interface 
function to delete all the snapshots we have scheduled for deletion + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don''t mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. */ -int btrfs_clean_old_snapshots(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) { - LIST_HEAD(list); + int ret; struct btrfs_fs_info *fs_info = root->fs_info; + if (fs_info->sb->s_flags & MS_RDONLY) { + pr_debug("btrfs: cleaner called for RO fs!\n"); + return 0; + } + spin_lock(&fs_info->trans_lock); - list_splice_init(&fs_info->dead_roots, &list); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del(&root->root_list); spin_unlock(&fs_info->trans_lock); - while (!list_empty(&list)) { - int ret; - - root = list_entry(list.next, struct btrfs_root, root_list); - list_del(&root->root_list); + pr_debug("btrfs: cleaner removing %llu\n", + (unsigned long long)root->objectid); - btrfs_kill_all_delayed_nodes(root); + btrfs_kill_all_delayed_nodes(root); - if (btrfs_header_backref_rev(root->node) < - BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); - else - ret =btrfs_drop_snapshot(root, NULL, 1, 0); - BUG_ON(ret < 0); - } - return 0; + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, NULL, 0, 0); + else + ret = btrfs_drop_snapshot(root, NULL, 1, 0); + /* + * If we encounter a transaction abort during snapshot cleaning, we + * don''t want to crash here + */ + BUG_ON(ret < 0 && (ret != -EAGAIN || ret != -EROFS)); + return 1; } diff --git a/fs/btrfs/transaction.h 
b/fs/btrfs/transaction.h index 3c8e0d2..f6edd5e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,7 +123,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, -- 1.7.9 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Mar 12, 2013 at 5:13 PM, David Sterba <dsterba@suse.cz> wrote:> Each time pick one dead root from the list and let the caller know if > it''s needed to continue. This should improve responsiveness during > umount and balance which at some point waits for cleaning all currently > queued dead roots. > > A new dead root is added to the end of the list, so the snapshots > disappear in the order of deletion. > > The snapshot cleaning work is now done only from the cleaner thread and the > others wake it if needed. > > Signed-off-by: David Sterba <dsterba@suse.cz> > --- > > v1,v2: > * http://thread.gmane.org/gmane.comp.file-systems.btrfs/23212 > > v2->v3: > * remove run_again from btrfs_clean_one_deleted_snapshot and return 1 > unconditionally > > fs/btrfs/disk-io.c | 10 ++++++-- > fs/btrfs/extent-tree.c | 8 ++++++ > fs/btrfs/relocation.c | 3 -- > fs/btrfs/transaction.c | 56 +++++++++++++++++++++++++++++++---------------- > fs/btrfs/transaction.h | 2 +- > 5 files changed, 53 insertions(+), 26 deletions(-) > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 988b860..4de2351 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) > struct btrfs_root *root = arg; > > do { > + int again = 0; > + > if (!(root->fs_info->sb->s_flags & MS_RDONLY) && > + down_read_trylock(&root->fs_info->sb->s_umount) && > mutex_trylock(&root->fs_info->cleaner_mutex)) { > btrfs_run_delayed_iputs(root); > - btrfs_clean_old_snapshots(root); > + again = btrfs_clean_one_deleted_snapshot(root); > mutex_unlock(&root->fs_info->cleaner_mutex); > btrfs_run_defrag_inodes(root->fs_info); > + up_read(&root->fs_info->sb->s_umount); > } > > - if (!try_to_freeze()) { > + if (!try_to_freeze() && !again) { > set_current_state(TASK_INTERRUPTIBLE); > if (!kthread_should_stop()) > schedule(); > @@ -3403,8 +3407,8 @@ int btrfs_commit_super(struct btrfs_root *root) > > mutex_lock(&root->fs_info->cleaner_mutex); > 
btrfs_run_delayed_iputs(root); > - btrfs_clean_old_snapshots(root); > mutex_unlock(&root->fs_info->cleaner_mutex); > + wake_up_process(root->fs_info->cleaner_kthread); > > /* wait until ongoing cleanup work done */ > down_write(&root->fs_info->cleanup_work_sem); > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 742b7a7..a08d0fe 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -7263,6 +7263,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, > * reference count by one. if update_ref is true, this function > * also make sure backrefs for the shared block and all lower level > * blocks are properly updated. > + * > + * If called with for_reloc == 0, may exit early with -EAGAIN > */ > int btrfs_drop_snapshot(struct btrfs_root *root, > struct btrfs_block_rsv *block_rsv, int update_ref, > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); > > while (1) { > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { > + pr_debug("btrfs: drop snapshot early exit\n"); > + err = -EAGAIN; > + goto out_end_trans; > + } > + > ret = walk_down_tree(trans, root, path, wc); > if (ret < 0) { > err = ret; > diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c > index 8445000..50deb9ed 100644 > --- a/fs/btrfs/relocation.c > +++ b/fs/btrfs/relocation.c > @@ -4148,10 +4148,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) > > while (1) { > mutex_lock(&fs_info->cleaner_mutex); > - > - btrfs_clean_old_snapshots(fs_info->tree_root); > ret = relocate_block_group(rc); > - > mutex_unlock(&fs_info->cleaner_mutex); > if (ret < 0) { > err = ret; > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index a0467eb..a2781c3 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -950,7 +950,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, > int 
btrfs_add_dead_root(struct btrfs_root *root) > { > spin_lock(&root->fs_info->trans_lock); > - list_add(&root->root_list, &root->fs_info->dead_roots); > + list_add_tail(&root->root_list, &root->fs_info->dead_roots); > spin_unlock(&root->fs_info->trans_lock); > return 0; > } > @@ -1876,31 +1876,49 @@ cleanup_transaction: > } > > /* > - * interface function to delete all the snapshots we have scheduled for deletion > + * return < 0 if error > + * 0 if there are no more dead_roots at the time of call > + * 1 there are more to be processed, call me again > + * > + * The return value indicates there are certainly more snapshots to delete, but > + * if there comes a new one during processing, it may return 0. We don''t mind, > + * because btrfs_commit_super will poke cleaner thread and it will process it a > + * few seconds later. > */ > -int btrfs_clean_old_snapshots(struct btrfs_root *root) > +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) > { > - LIST_HEAD(list); > + int ret; > struct btrfs_fs_info *fs_info = root->fs_info; > > + if (fs_info->sb->s_flags & MS_RDONLY) { > + pr_debug("btrfs: cleaner called for RO fs!\n"); > + return 0; > + } > + > spin_lock(&fs_info->trans_lock); > - list_splice_init(&fs_info->dead_roots, &list); > + if (list_empty(&fs_info->dead_roots)) { > + spin_unlock(&fs_info->trans_lock); > + return 0; > + } > + root = list_first_entry(&fs_info->dead_roots, > + struct btrfs_root, root_list); > + list_del(&root->root_list); > spin_unlock(&fs_info->trans_lock); > > - while (!list_empty(&list)) { > - int ret; > - > - root = list_entry(list.next, struct btrfs_root, root_list); > - list_del(&root->root_list); > + pr_debug("btrfs: cleaner removing %llu\n", > + (unsigned long long)root->objectid); > > - btrfs_kill_all_delayed_nodes(root); > + btrfs_kill_all_delayed_nodes(root); > > - if (btrfs_header_backref_rev(root->node) < > - BTRFS_MIXED_BACKREF_REV) > - ret = btrfs_drop_snapshot(root, NULL, 0, 0); > - else > - ret 
=btrfs_drop_snapshot(root, NULL, 1, 0); > - BUG_ON(ret < 0); > - } > - return 0; > + if (btrfs_header_backref_rev(root->node) < > + BTRFS_MIXED_BACKREF_REV) > + ret = btrfs_drop_snapshot(root, NULL, 0, 0); > + else > + ret = btrfs_drop_snapshot(root, NULL, 1, 0); > + /* > + * If we encounter a transaction abort during snapshot cleaning, we > + * don''t want to crash here > + */ > + BUG_ON(ret < 0 && (ret != -EAGAIN || ret != -EROFS)); > + return 1; > } > diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h > index 3c8e0d2..f6edd5e 100644 > --- a/fs/btrfs/transaction.h > +++ b/fs/btrfs/transaction.h > @@ -123,7 +123,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, > > int btrfs_add_dead_root(struct btrfs_root *root); > int btrfs_defrag_root(struct btrfs_root *root); > -int btrfs_clean_old_snapshots(struct btrfs_root *root); > +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); > int btrfs_commit_transaction(struct btrfs_trans_handle *trans, > struct btrfs_root *root); > int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, > -- > 1.7.9 >Thanks for addressing this issue, David. Alex. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Quoting David Sterba (2013-03-12 11:13:28)> Each time pick one dead root from the list and let the caller know if > it''s needed to continue. This should improve responsiveness during > umount and balance which at some point waits for cleaning all currently > queued dead roots. > > A new dead root is added to the end of the list, so the snapshots > disappear in the order of deletion. > > The snapshot cleaning work is now done only from the cleaner thread and the > others wake it if needed.> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 988b860..4de2351 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) > struct btrfs_root *root = arg; > > do { > + int again = 0; > + > if (!(root->fs_info->sb->s_flags & MS_RDONLY) && > + down_read_trylock(&root->fs_info->sb->s_umount) && > mutex_trylock(&root->fs_info->cleaner_mutex)) { > btrfs_run_delayed_iputs(root); > - btrfs_clean_old_snapshots(root); > + again = btrfs_clean_one_deleted_snapshot(root); > mutex_unlock(&root->fs_info->cleaner_mutex); > btrfs_run_defrag_inodes(root->fs_info); > + up_read(&root->fs_info->sb->s_umount);Can we use just the cleaner mutex for this? We''re deadlocking during 068 with autodefrag on because the cleaner is holding s_umount while autodefrag is trying to bump the writer count. If unmount takes the cleaner mutex once it should wait long enough for the cleaner to stop. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, May 06, 2013 at 08:41:06PM -0400, Chris Mason wrote:> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > > index 988b860..4de2351 100644 > > --- a/fs/btrfs/disk-io.c > > +++ b/fs/btrfs/disk-io.c > > @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) > > struct btrfs_root *root = arg; > > > > do { > > + int again = 0; > > + > > if (!(root->fs_info->sb->s_flags & MS_RDONLY) && > > + down_read_trylock(&root->fs_info->sb->s_umount) && > > mutex_trylock(&root->fs_info->cleaner_mutex)) { > > btrfs_run_delayed_iputs(root); > > - btrfs_clean_old_snapshots(root); > > + again = btrfs_clean_one_deleted_snapshot(root); > > mutex_unlock(&root->fs_info->cleaner_mutex); > > btrfs_run_defrag_inodes(root->fs_info); > > + up_read(&root->fs_info->sb->s_umount); > > Can we use just the cleaner mutex for this? We''re deadlocking during > 068 with autodefrag on because the cleaner is holding s_umount while > autodefrag is trying to bump the writer count.I have now reproduced the deadlock and see where it''s stuck. It did not happen with running 068 in a loop, but after interrupting the test.> If unmount takes the cleaner mutex once it should wait long enough for > the cleaner to stop.You mean removing s_umount from here completely? I''m not sure about other mis-interaction, eg with remount + autodefrag. Miao sent a patch for that case http://www.spinics.net/lists/linux-btrfs/msg16634.html (but it would not fix this deadlock). I''m for keeping the clean-by-one patch for 3.10, we can fix other regressions during rc cycle. david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Quoting David Sterba (2013-05-07 07:54:49)> On Mon, May 06, 2013 at 08:41:06PM -0400, Chris Mason wrote: > > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > > > index 988b860..4de2351 100644 > > > --- a/fs/btrfs/disk-io.c > > > +++ b/fs/btrfs/disk-io.c > > > @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) > > > struct btrfs_root *root = arg; > > > > > > do { > > > + int again = 0; > > > + > > > if (!(root->fs_info->sb->s_flags & MS_RDONLY) && > > > + down_read_trylock(&root->fs_info->sb->s_umount) && > > > mutex_trylock(&root->fs_info->cleaner_mutex)) { > > > btrfs_run_delayed_iputs(root); > > > - btrfs_clean_old_snapshots(root); > > > + again = btrfs_clean_one_deleted_snapshot(root); > > > mutex_unlock(&root->fs_info->cleaner_mutex); > > > btrfs_run_defrag_inodes(root->fs_info); > > > + up_read(&root->fs_info->sb->s_umount); > > > > Can we use just the cleaner mutex for this? We''re deadlocking during > > 068 with autodefrag on because the cleaner is holding s_umount while > > autodefrag is trying to bump the writer count. > > I have now reproduced the deadlock and see where it''s stuck. It did not > happen with running 068 in a loop, but after interrupting the test.Hmmm, interrupting the test may just mean you''ve left it frozen? It happens every time for me with autodefrag on.> > > If unmount takes the cleaner mutex once it should wait long enough for > > the cleaner to stop. > > You mean removing s_umount from here completely? I''m not sure about > other mis-interaction, eg with remount + autodefrag. Miao sent a patch > for that case http://www.spinics.net/lists/linux-btrfs/msg16634.html > (but it would not fix this deadlock).Mostly we need to pull the run_defrag_inodes out of the s_umount. It may be much smarter to put that into a dedicated worker pool.> > I''m for keeping the clean-by-one patch for 3.10, we can fix other > regressions during rc cycle.I do agree, and left it in the pull that Linus took. Thanks for working on this one. 
-chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On tue, 7 May 2013 13:54:49 +0200, David Sterba wrote:> On Mon, May 06, 2013 at 08:41:06PM -0400, Chris Mason wrote: >>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c >>> index 988b860..4de2351 100644 >>> --- a/fs/btrfs/disk-io.c >>> +++ b/fs/btrfs/disk-io.c >>> @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) >>> struct btrfs_root *root = arg; >>> >>> do { >>> + int again = 0; >>> + >>> if (!(root->fs_info->sb->s_flags & MS_RDONLY) && >>> + down_read_trylock(&root->fs_info->sb->s_umount) && >>> mutex_trylock(&root->fs_info->cleaner_mutex)) { >>> btrfs_run_delayed_iputs(root); >>> - btrfs_clean_old_snapshots(root); >>> + again = btrfs_clean_one_deleted_snapshot(root); >>> mutex_unlock(&root->fs_info->cleaner_mutex); >>> btrfs_run_defrag_inodes(root->fs_info); >>> + up_read(&root->fs_info->sb->s_umount); >> >> Can we use just the cleaner mutex for this? We''re deadlocking during >> 068 with autodefrag on because the cleaner is holding s_umount while >> autodefrag is trying to bump the writer count. > > I have now reproduced the deadlock and see where it''s stuck. It did not > happen with running 068 in a loop, but after interrupting the test. > >> If unmount takes the cleaner mutex once it should wait long enough for >> the cleaner to stop. > > You mean removing s_umount from here completely? I''m not sure about > other mis-interaction, eg with remount + autodefrag. Miao sent a patch > for that case http://www.spinics.net/lists/linux-btrfs/msg16634.html > (but it would not fix this deadlock).I have given up this patch and fix this problem by the other way. http://marc.info/?l=linux-btrfs&m=136142833013628&w=2 I think we need use s_umount here, all things we need do is to check R/O in cleaner_mutex. Or we may continue to delete the dead tree after the fs is remounted to be R/O. Thanks Miao> > I''m for keeping the clean-by-one patch for 3.10, we can fix other > regressions during rc cycle. 
> > david > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi David, I believe this patch has the following problem: On Tue, Mar 12, 2013 at 5:13 PM, David Sterba <dsterba@suse.cz> wrote:> Each time pick one dead root from the list and let the caller know if > it''s needed to continue. This should improve responsiveness during > umount and balance which at some point waits for cleaning all currently > queued dead roots. > > A new dead root is added to the end of the list, so the snapshots > disappear in the order of deletion. > > The snapshot cleaning work is now done only from the cleaner thread and the > others wake it if needed. > > Signed-off-by: David Sterba <dsterba@suse.cz> > --- > > v1,v2: > * http://thread.gmane.org/gmane.comp.file-systems.btrfs/23212 > > v2->v3: > * remove run_again from btrfs_clean_one_deleted_snapshot and return 1 > unconditionally > > fs/btrfs/disk-io.c | 10 ++++++-- > fs/btrfs/extent-tree.c | 8 ++++++ > fs/btrfs/relocation.c | 3 -- > fs/btrfs/transaction.c | 56 +++++++++++++++++++++++++++++++---------------- > fs/btrfs/transaction.h | 2 +- > 5 files changed, 53 insertions(+), 26 deletions(-) > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 988b860..4de2351 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -1690,15 +1690,19 @@ static int cleaner_kthread(void *arg) > struct btrfs_root *root = arg; > > do { > + int again = 0; > + > if (!(root->fs_info->sb->s_flags & MS_RDONLY) && > + down_read_trylock(&root->fs_info->sb->s_umount) && > mutex_trylock(&root->fs_info->cleaner_mutex)) { > btrfs_run_delayed_iputs(root); > - btrfs_clean_old_snapshots(root); > + again = btrfs_clean_one_deleted_snapshot(root); > mutex_unlock(&root->fs_info->cleaner_mutex); > btrfs_run_defrag_inodes(root->fs_info); > + up_read(&root->fs_info->sb->s_umount); > } > > - if (!try_to_freeze()) { > + if (!try_to_freeze() && !again) { > set_current_state(TASK_INTERRUPTIBLE); > if (!kthread_should_stop()) > schedule(); > @@ -3403,8 +3407,8 @@ int btrfs_commit_super(struct btrfs_root *root) > > 
mutex_lock(&root->fs_info->cleaner_mutex); > btrfs_run_delayed_iputs(root); > - btrfs_clean_old_snapshots(root); > mutex_unlock(&root->fs_info->cleaner_mutex); > + wake_up_process(root->fs_info->cleaner_kthread); > > /* wait until ongoing cleanup work done */ > down_write(&root->fs_info->cleanup_work_sem); > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 742b7a7..a08d0fe 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -7263,6 +7263,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, > * reference count by one. if update_ref is true, this function > * also make sure backrefs for the shared block and all lower level > * blocks are properly updated. > + * > + * If called with for_reloc == 0, may exit early with -EAGAIN > */ > int btrfs_drop_snapshot(struct btrfs_root *root, > struct btrfs_block_rsv *block_rsv, int update_ref, > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); > > while (1) { > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { > + pr_debug("btrfs: drop snapshot early exit\n"); > + err = -EAGAIN; > + goto out_end_trans; > + }Here you exit the loop, but the "drop_progress" in the root item is incorrect. When the system is remounted, and snapshot deletion resumes, it seems that it tries to resume from the EXTENT_ITEM that does not exist anymore, and [1] shows that btrfs_lookup_extent_info() simply does not find the needed extent. So then I hit panic in walk_down_tree(): BUG: wc->refs[level - 1] == 0 I fixed it like follows: There is a place where btrfs_drop_snapshot() checks if it needs to detach from transaction and re-attach. 
So I moved the exit point there and the code is like this: if (btrfs_should_end_transaction(trans, tree_root) || (!for_reloc && btrfs_need_cleaner_sleep(root))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); if (ret) { btrfs_abort_transaction(trans, tree_root, ret); err = ret; goto out_end_trans; } btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { err = -EAGAIN; goto out_free; } trans = btrfs_start_transaction(tree_root, 0); ... With this fix, I do not hit the panic, and snapshot deletion proceeds and completes alright after mount. Do you agree to my analysis or I am missing something? It seems that Josef''s btrfs-next still has this issue (as does Chris''s for-linus).> + > ret = walk_down_tree(trans, root, path, wc); > if (ret < 0) { > err = ret; > diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c > index 8445000..50deb9ed 100644 > --- a/fs/btrfs/relocation.c > +++ b/fs/btrfs/relocation.c > @@ -4148,10 +4148,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) > > while (1) { > mutex_lock(&fs_info->cleaner_mutex); > - > - btrfs_clean_old_snapshots(fs_info->tree_root); > ret = relocate_block_group(rc); > - > mutex_unlock(&fs_info->cleaner_mutex); > if (ret < 0) { > err = ret; > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index a0467eb..a2781c3 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -950,7 +950,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, > int btrfs_add_dead_root(struct btrfs_root *root) > { > spin_lock(&root->fs_info->trans_lock); > - list_add(&root->root_list, &root->fs_info->dead_roots); > + list_add_tail(&root->root_list, &root->fs_info->dead_roots); > spin_unlock(&root->fs_info->trans_lock); > return 0; > } > @@ -1876,31 +1876,49 @@ cleanup_transaction: > } > > /* > - * interface function to delete all the snapshots we have scheduled for deletion > + * 
return < 0 if error > + * 0 if there are no more dead_roots at the time of call > + * 1 there are more to be processed, call me again > + * > + * The return value indicates there are certainly more snapshots to delete, but > + * if there comes a new one during processing, it may return 0. We don''t mind, > + * because btrfs_commit_super will poke cleaner thread and it will process it a > + * few seconds later. > */ > -int btrfs_clean_old_snapshots(struct btrfs_root *root) > +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) > { > - LIST_HEAD(list); > + int ret; > struct btrfs_fs_info *fs_info = root->fs_info; > > + if (fs_info->sb->s_flags & MS_RDONLY) { > + pr_debug("btrfs: cleaner called for RO fs!\n"); > + return 0; > + } > + > spin_lock(&fs_info->trans_lock); > - list_splice_init(&fs_info->dead_roots, &list); > + if (list_empty(&fs_info->dead_roots)) { > + spin_unlock(&fs_info->trans_lock); > + return 0; > + } > + root = list_first_entry(&fs_info->dead_roots, > + struct btrfs_root, root_list); > + list_del(&root->root_list); > spin_unlock(&fs_info->trans_lock); > > - while (!list_empty(&list)) { > - int ret; > - > - root = list_entry(list.next, struct btrfs_root, root_list); > - list_del(&root->root_list); > + pr_debug("btrfs: cleaner removing %llu\n", > + (unsigned long long)root->objectid); > > - btrfs_kill_all_delayed_nodes(root); > + btrfs_kill_all_delayed_nodes(root); > > - if (btrfs_header_backref_rev(root->node) < > - BTRFS_MIXED_BACKREF_REV) > - ret = btrfs_drop_snapshot(root, NULL, 0, 0); > - else > - ret =btrfs_drop_snapshot(root, NULL, 1, 0); > - BUG_ON(ret < 0); > - } > - return 0; > + if (btrfs_header_backref_rev(root->node) < > + BTRFS_MIXED_BACKREF_REV) > + ret = btrfs_drop_snapshot(root, NULL, 0, 0); > + else > + ret = btrfs_drop_snapshot(root, NULL, 1, 0); > + /* > + * If we encounter a transaction abort during snapshot cleaning, we > + * don''t want to crash here > + */ > + BUG_ON(ret < 0 && (ret != -EAGAIN || ret != -EROFS)); > + 
return 1; > } > diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h > index 3c8e0d2..f6edd5e 100644 > --- a/fs/btrfs/transaction.h > +++ b/fs/btrfs/transaction.h > @@ -123,7 +123,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, > > int btrfs_add_dead_root(struct btrfs_root *root); > int btrfs_defrag_root(struct btrfs_root *root); > -int btrfs_clean_old_snapshots(struct btrfs_root *root); > +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); > int btrfs_commit_transaction(struct btrfs_trans_handle *trans, > struct btrfs_root *root); > int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, > -- > 1.7.9 >Thanks, Alex. [1] kernel: [ 1306.640085] ------------[ cut here ]------------ kernel: [ 1306.640117] WARNING: at /mnt/work/alex/zadara-btrfs/fs/btrfs/extent-tree.c:823 btrfs_lookup_extent_info+0x39b/0x3a0 [btrfs]() kernel: [ 1306.640119] Hardware name: Bochs kernel: [ 1306.640120] Modules linked in: dm_btrfs(OF) raid1(OF) xt_multiport btrfs(OF) xt_tcpudp nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack iptable_filter ip_tables x_tables ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xfrm_user xfrm4_tunnel tunnel4 ipcomp xfrm_ipcomp esp4 ah4 8021q garp stp llc bonding dm_zcache(OF) dm_iostat(OF) deflate zlib_deflate ctr twofish_generic twofish_x86_64_3way twofish_x86_64 twofish_common camellia_generic camellia_x86_64 serpent_sse2_x86_64 glue_helper lrw serpent_generic xts gf128mul blowfish_generic blowfish_x86_64 blowfish_common ablk_helper cryptd cast5_generic cast_common des_generic xcbc rmd160 scst_vdisk(OF) crypto_null iscsi_scst(OF) af_key xfrm_algo scst(OF) libcrc32c kvm cirrus ttm drm_kms_helper microcode psmouse virtio_balloon serio_raw drm sysimgblt sysfillrect syscopyarea nfsd(OF) nfs_acl nfsv4 i2c_piix4 auth_rpcgss nfs fscache lockd sunrpc mac_hid lp parport floppy i kernel: xgbevf [last unloaded: btrfs] kernel: [ 1306.640204] 
Pid: 31823, comm: btrfs-cleaner Tainted: GF O 3.8.13-030813-generic #201305111843 kernel: [ 1306.640206] Call Trace: kernel: [ 1306.640216] [<ffffffff8105990f>] warn_slowpath_common+0x7f/0xc0 kernel: [ 1306.640219] [<ffffffff8105996a>] warn_slowpath_null+0x1a/0x20 kernel: [ 1306.640228] [<ffffffffa06c98fb>] btrfs_lookup_extent_info+0x39b/0x3a0 [btrfs] kernel: [ 1306.640242] [<ffffffffa07046f5>] ? alloc_extent_buffer+0x355/0x3e0 [btrfs] kernel: [ 1306.640251] [<ffffffffa06cca19>] do_walk_down+0x109/0x600 [btrfs] kernel: [ 1306.640255] [<ffffffff8106b4ef>] ? try_to_del_timer_sync+0x4f/0x70 kernel: [ 1306.640262] [<ffffffffa06b8e5a>] ? btrfs_free_path+0x2a/0x40 [btrfs] kernel: [ 1306.640271] [<ffffffffa06ccfd9>] walk_down_tree+0xc9/0x100 [btrfs] kernel: [ 1306.640280] [<ffffffffa06d066b>] btrfs_drop_snapshot+0x5eb/0xbe0 [btrfs] kernel: [ 1306.640294] [<ffffffffa072ff70>] ? btrfs_kill_all_delayed_nodes+0xf0/0x110 [btrfs] kernel: [ 1306.640305] [<ffffffffa06e53a7>] btrfs_clean_old_snapshots+0x127/0x1d0 [btrfs] kernel: [ 1306.640315] [<ffffffffa06dc740>] cleaner_kthread+0x300/0x380 [btrfs] kernel: [ 1306.640325] [<ffffffffa06dc440>] ? btrfs_destroy_delayed_refs.isra.105+0x220/0x220 [btrfs] kernel: [ 1306.640329] [<ffffffff8107f050>] kthread+0xc0/0xd0 kernel: [ 1306.640331] [<ffffffff8107ef90>] ? flush_kthread_worker+0xb0/0xb0 kernel: [ 1306.640335] [<ffffffff816f61ec>] ret_from_fork+0x7c/0xb0 kernel: [ 1306.640337] [<ffffffff8107ef90>] ? flush_kthread_worker+0xb0/0xb0 kernel: [ 1306.640339] ---[ end trace f5a6c532b98c6e92 ]--- kernel: [ 1306.640343] [31823]btrfs*[do_walk_down:6780] BUG: wc->refs[level - 1] == 0 kernel: [ 1306.640343] kernel: [ 1306.640345] Pid: 31823, comm: btrfs-cleaner Tainted: GF W O 3.8.13-030813-generic #201305111843 kernel: [ 1306.640346] Call Trace: kernel: [ 1306.640355] [<ffffffffa06cce77>] do_walk_down+0x567/0x600 [btrfs] kernel: [ 1306.640357] [<ffffffff8106b4ef>] ? try_to_del_timer_sync+0x4f/0x70 kernel: [ 1306.640365] [<ffffffffa06b8e5a>] ? 
btrfs_free_path+0x2a/0x40 [btrfs] kernel: [ 1306.640373] [<ffffffffa06ccfd9>] walk_down_tree+0xc9/0x100 [btrfs] kernel: [ 1306.640382] [<ffffffffa06d066b>] btrfs_drop_snapshot+0x5eb/0xbe0 [btrfs] kernel: [ 1306.640395] [<ffffffffa072ff70>] ? btrfs_kill_all_delayed_nodes+0xf0/0x110 [btrfs] kernel: [ 1306.640406] [<ffffffffa06e53a7>] btrfs_clean_old_snapshots+0x127/0x1d0 [btrfs] kernel: [ 1306.640416] [<ffffffffa06dc740>] cleaner_kthread+0x300/0x380 [btrfs] kernel: [ 1306.640426] [<ffffffffa06dc440>] ? btrfs_destroy_delayed_refs.isra.105+0x220/0x220 [btrfs] kernel: [ 1306.640429] [<ffffffff8107f050>] kthread+0xc0/0xd0 kernel: [ 1306.640431] [<ffffffff8107ef90>] ? flush_kthread_worker+0xb0/0xb0 kernel: [ 1306.640433] [<ffffffff816f61ec>] ret_from_fork+0x7c/0xb0 kernel: [ 1306.640435] [<ffffffff8107ef90>] ? flush_kthread_worker+0xb0/0xb0 kernel: [ 1306.640437] Kernel panic - not syncing: BUG: wc->refs[level - 1] == 0 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 04, 2013 at 06:29:23PM +0300, Alex Lyakas wrote:> > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); > > > > while (1) { > > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { > > + pr_debug("btrfs: drop snapshot early exit\n"); > > + err = -EAGAIN; > > + goto out_end_trans; > > + } > Here you exit the loop, but the "drop_progress" in the root item is > incorrect. When the system is remounted, and snapshot deletion > resumes, it seems that it tries to resume from the EXTENT_ITEM that > does not exist anymore, and [1] shows that btrfs_lookup_extent_info() > simply does not find the needed extent. > So then I hit panic in walk_down_tree(): > BUG: wc->refs[level - 1] == 0 > > I fixed it like follows: > There is a place where btrfs_drop_snapshot() checks if it needs to > detach from transaction and re-attach. So I moved the exit point there > and the code is like this: > > if (btrfs_should_end_transaction(trans, tree_root) || > (!for_reloc && btrfs_need_cleaner_sleep(root))) { > ret = btrfs_update_root(trans, tree_root, > &root->root_key, > root_item); > if (ret) { > btrfs_abort_transaction(trans, tree_root, ret); > err = ret; > goto out_end_trans; > } > > btrfs_end_transaction_throttle(trans, tree_root); > if (!for_reloc && btrfs_need_cleaner_sleep(root)) { > err = -EAGAIN; > goto out_free; > } > trans = btrfs_start_transaction(tree_root, 0); > ... > > With this fix, I do not hit the panic, and snapshot deletion proceeds > and completes alright after mount. > > Do you agree to my analysis or I am missing something? It seems that > Josef''s btrfs-next still has this issue (as does Chris''s for-linus).Sound analysis and I agree with the fix. The clean-by-one patch has been merged into 3.10 so we need a stable fix for that. 
thanks, david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi David, On Thu, Jul 4, 2013 at 8:03 PM, David Sterba <dsterba@suse.cz> wrote:> On Thu, Jul 04, 2013 at 06:29:23PM +0300, Alex Lyakas wrote: >> > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, >> > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); >> > >> > while (1) { >> > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { >> > + pr_debug("btrfs: drop snapshot early exit\n"); >> > + err = -EAGAIN; >> > + goto out_end_trans; >> > + } >> Here you exit the loop, but the "drop_progress" in the root item is >> incorrect. When the system is remounted, and snapshot deletion >> resumes, it seems that it tries to resume from the EXTENT_ITEM that >> does not exist anymore, and [1] shows that btrfs_lookup_extent_info() >> simply does not find the needed extent. >> So then I hit panic in walk_down_tree(): >> BUG: wc->refs[level - 1] == 0 >> >> I fixed it like follows: >> There is a place where btrfs_drop_snapshot() checks if it needs to >> detach from transaction and re-attach. So I moved the exit point there >> and the code is like this: >> >> if (btrfs_should_end_transaction(trans, tree_root) || >> (!for_reloc && btrfs_need_cleaner_sleep(root))) { >> ret = btrfs_update_root(trans, tree_root, >> &root->root_key, >> root_item); >> if (ret) { >> btrfs_abort_transaction(trans, tree_root, ret); >> err = ret; >> goto out_end_trans; >> } >> >> btrfs_end_transaction_throttle(trans, tree_root); >> if (!for_reloc && btrfs_need_cleaner_sleep(root)) { >> err = -EAGAIN; >> goto out_free; >> } >> trans = btrfs_start_transaction(tree_root, 0); >> ... >> >> With this fix, I do not hit the panic, and snapshot deletion proceeds >> and completes alright after mount. >> >> Do you agree to my analysis or I am missing something? It seems that >> Josef''s btrfs-next still has this issue (as does Chris''s for-linus). > > Sound analysis and I agree with the fix. 
The clean-by-one patch has been > merged into 3.10 so we need a stable fix for that. Thanks for confirming, David! From more testing, I have two more notes: # After applying the fix, whenever snapshot deletion is resumed after mount, and successfully completes, then I unmount again, and rmmod btrfs, linux complains about losing a few "struct extent_buffer" during kmem_cache_destroy(). So somewhere on that path: if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { ... } else { ===> HERE and later we perhaps somehow overwrite the contents of "struct btrfs_path" that is used in the whole function. Because at the end of the function we always do btrfs_free_path(), which inside does btrfs_release_path(). I was not able to determine where the leak happens, do you have any hint? No other activity happens in the system except the resumed snap deletion, and this problem only happens when resuming. # This is for Josef: after I unmount the fs with ongoing snap deletion (after applying my fix), and run the latest btrfsck - it complains a lot about problems in extent tree:( But after I mount again, snap deletion resumes then completes, then I unmount and btrfsck is happy again. So probably it does not account orphan roots properly? David, will you provide a fixed patch, if possible? Thanks! Alex.> > thanks, > david-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 04, 2013 at 10:52:39PM +0300, Alex Lyakas wrote:> Hi David, > > On Thu, Jul 4, 2013 at 8:03 PM, David Sterba <dsterba@suse.cz> wrote: > > On Thu, Jul 04, 2013 at 06:29:23PM +0300, Alex Lyakas wrote: > >> > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > >> > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); > >> > > >> > while (1) { > >> > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { > >> > + pr_debug("btrfs: drop snapshot early exit\n"); > >> > + err = -EAGAIN; > >> > + goto out_end_trans; > >> > + } > >> Here you exit the loop, but the "drop_progress" in the root item is > >> incorrect. When the system is remounted, and snapshot deletion > >> resumes, it seems that it tries to resume from the EXTENT_ITEM that > >> does not exist anymore, and [1] shows that btrfs_lookup_extent_info() > >> simply does not find the needed extent. > >> So then I hit panic in walk_down_tree(): > >> BUG: wc->refs[level - 1] == 0 > >> > >> I fixed it like follows: > >> There is a place where btrfs_drop_snapshot() checks if it needs to > >> detach from transaction and re-attach. So I moved the exit point there > >> and the code is like this: > >> > >> if (btrfs_should_end_transaction(trans, tree_root) || > >> (!for_reloc && btrfs_need_cleaner_sleep(root))) { > >> ret = btrfs_update_root(trans, tree_root, > >> &root->root_key, > >> root_item); > >> if (ret) { > >> btrfs_abort_transaction(trans, tree_root, ret); > >> err = ret; > >> goto out_end_trans; > >> } > >> > >> btrfs_end_transaction_throttle(trans, tree_root); > >> if (!for_reloc && btrfs_need_cleaner_sleep(root)) { > >> err = -EAGAIN; > >> goto out_free; > >> } > >> trans = btrfs_start_transaction(tree_root, 0); > >> ... > >> > >> With this fix, I do not hit the panic, and snapshot deletion proceeds > >> and completes alright after mount. > >> > >> Do you agree to my analysis or I am missing something? 
It seems that > >> Josef''s btrfs-next still has this issue (as does Chris''s for-linus). > > > > Sound analysis and I agree with the fix. The clean-by-one patch has been > > merged into 3.10 so we need a stable fix for that. > Thanks for confirming, David! > > From more testing, I have two more notes: > > # After applying the fix, whenever snapshot deletion is resumed after > mount, and successfully completes, then I unmount again, and rmmod > btrfs, linux complains about loosing few "struct extent_buffer" during > kem_cache_delete(). > So somewhere on that path: > if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { > ... > } else { > ===> HERE > > and later we perhaps somehow overwrite the contents of "struct > btrfs_path" that is used in the whole function. Because at the end of > the function we always do btrfs_free_path(), which inside does > btrfs_release_path(). I was not able to determine where the leak > happens, do you have any hint? No other activity happens in the system > except the resumed snap deletion, and this problem only happens when > resuming. > > # This is for Josef: after I unmount the fs with ongoing snap deletion > (after applying my fix), and run the latest btrfsck - it complains a > lot about problems in extent tree:( But after I mount again, snap > deletion resumes then completes, then I unmount and btrfsck is happy > again. So probably it does not account orphan roots properly? >It should but it may not be working properly. I know it works right for normal inodes, probably not too hard to fix it for snapshots, I''ll throw it on the TODO list. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi, On Thu, Jul 4, 2013 at 10:52 PM, Alex Lyakas <alex.btrfs@zadarastorage.com> wrote:> Hi David, > > On Thu, Jul 4, 2013 at 8:03 PM, David Sterba <dsterba@suse.cz> wrote: >> On Thu, Jul 04, 2013 at 06:29:23PM +0300, Alex Lyakas wrote: >>> > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, >>> > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); >>> > >>> > while (1) { >>> > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { >>> > + pr_debug("btrfs: drop snapshot early exit\n"); >>> > + err = -EAGAIN; >>> > + goto out_end_trans; >>> > + } >>> Here you exit the loop, but the "drop_progress" in the root item is >>> incorrect. When the system is remounted, and snapshot deletion >>> resumes, it seems that it tries to resume from the EXTENT_ITEM that >>> does not exist anymore, and [1] shows that btrfs_lookup_extent_info() >>> simply does not find the needed extent. >>> So then I hit panic in walk_down_tree(): >>> BUG: wc->refs[level - 1] == 0 >>> >>> I fixed it like follows: >>> There is a place where btrfs_drop_snapshot() checks if it needs to >>> detach from transaction and re-attach. So I moved the exit point there >>> and the code is like this: >>> >>> if (btrfs_should_end_transaction(trans, tree_root) || >>> (!for_reloc && btrfs_need_cleaner_sleep(root))) { >>> ret = btrfs_update_root(trans, tree_root, >>> &root->root_key, >>> root_item); >>> if (ret) { >>> btrfs_abort_transaction(trans, tree_root, ret); >>> err = ret; >>> goto out_end_trans; >>> } >>> >>> btrfs_end_transaction_throttle(trans, tree_root); >>> if (!for_reloc && btrfs_need_cleaner_sleep(root)) { >>> err = -EAGAIN; >>> goto out_free; >>> } >>> trans = btrfs_start_transaction(tree_root, 0); >>> ... >>> >>> With this fix, I do not hit the panic, and snapshot deletion proceeds >>> and completes alright after mount. >>> >>> Do you agree to my analysis or I am missing something? It seems that >>> Josef''s btrfs-next still has this issue (as does Chris''s for-linus). 
>> >> Sound analysis and I agree with the fix. The clean-by-one patch has been >> merged into 3.10 so we need a stable fix for that. > Thanks for confirming, David! > > From more testing, I have two more notes: > > # After applying the fix, whenever snapshot deletion is resumed after > mount, and successfully completes, then I unmount again, and rmmod > btrfs, linux complains about loosing few "struct extent_buffer" during > kem_cache_delete(). > So somewhere on that path: > if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { > ... > } else { > ===> HERE > > and later we perhaps somehow overwrite the contents of "struct > btrfs_path" that is used in the whole function. Because at the end of > the function we always do btrfs_free_path(), which inside does > btrfs_release_path(). I was not able to determine where the leak > happens, do you have any hint? No other activity happens in the system > except the resumed snap deletion, and this problem only happens when > resuming. >I found where the memory leak happens. When we abort snapshot deletion in the middle, then this btrfs_root is basically left alone hanging in the air. It is out of the "dead_roots" already, so when del_fs_roots() is called during unmount, it will not free this root and its root->node (which is the one that triggers memory leak warning on kmem_cache_destroy) and perhaps other stuff too. The issue still exists in btrfs-next. 
Simplest fix I came up with was: diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d275681..52a2c54 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7468,6 +7468,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int err = 0; int ret; int level; + bool root_freed = false; path = btrfs_alloc_path(); if (!path) { @@ -7641,6 +7642,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, free_extent_buffer(root->commit_root); btrfs_put_fs_root(root); } + root_freed = true; + out_end_trans: btrfs_end_transaction_throttle(trans, tree_root); out_free: @@ -7649,6 +7652,18 @@ out_free: out: if (err) btrfs_std_error(root->fs_info, err); + + /* + * If the root was not freed by any reason, this means that FS had + * a problem and will probably be unmounted soon. + * But we need to put the root back into the ''dead_roots'' list, + * so that it will be properly freed during unmount. + */ + if (!root_freed) { + WARN_ON(err == 0); + btrfs_add_dead_root(root); + } + return err; } With this fix, I don''t see any memleak warnings (also by enabling LEAK_DEBUG) while aborting and resuming snapshot deletion.> # This is for Josef: after I unmount the fs with ongoing snap deletion > (after applying my fix), and run the latest btrfsck - it complains a > lot about problems in extent tree:( But after I mount again, snap > deletion resumes then completes, then I unmount and btrfsck is happy > again. So probably it does not account orphan roots properly? > > David, will you provide a fixed patch, if possible? >Josef, David, I feel that I am not helpful enough by pinpointing the problem and suggesting a fix, but not providing actual patch that fixes it and can be applied. The reason is that it is difficult for me to test the fix thoroughly on the latest upstream kernel (like btrfs-next), for reasons I''m sure you understand. 
So I appreciate if you could post these two fixes to the upstream kernel; but otherwise, I will try to work and test them on the latest kernel myself. Thanks, Alex.> Thanks! > Alex. > >> >> thanks, >> david-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jul 14, 2013 at 07:20:04PM +0300, Alex Lyakas wrote:> Hi, > > On Thu, Jul 4, 2013 at 10:52 PM, Alex Lyakas > <alex.btrfs@zadarastorage.com> wrote: > > Hi David, > > > > On Thu, Jul 4, 2013 at 8:03 PM, David Sterba <dsterba@suse.cz> wrote: > >> On Thu, Jul 04, 2013 at 06:29:23PM +0300, Alex Lyakas wrote: > >>> > @@ -7363,6 +7365,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > >>> > wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); > >>> > > >>> > while (1) { > >>> > + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { > >>> > + pr_debug("btrfs: drop snapshot early exit\n"); > >>> > + err = -EAGAIN; > >>> > + goto out_end_trans; > >>> > + } > >>> Here you exit the loop, but the "drop_progress" in the root item is > >>> incorrect. When the system is remounted, and snapshot deletion > >>> resumes, it seems that it tries to resume from the EXTENT_ITEM that > >>> does not exist anymore, and [1] shows that btrfs_lookup_extent_info() > >>> simply does not find the needed extent. > >>> So then I hit panic in walk_down_tree(): > >>> BUG: wc->refs[level - 1] == 0 > >>> > >>> I fixed it like follows: > >>> There is a place where btrfs_drop_snapshot() checks if it needs to > >>> detach from transaction and re-attach. So I moved the exit point there > >>> and the code is like this: > >>> > >>> if (btrfs_should_end_transaction(trans, tree_root) || > >>> (!for_reloc && btrfs_need_cleaner_sleep(root))) { > >>> ret = btrfs_update_root(trans, tree_root, > >>> &root->root_key, > >>> root_item); > >>> if (ret) { > >>> btrfs_abort_transaction(trans, tree_root, ret); > >>> err = ret; > >>> goto out_end_trans; > >>> } > >>> > >>> btrfs_end_transaction_throttle(trans, tree_root); > >>> if (!for_reloc && btrfs_need_cleaner_sleep(root)) { > >>> err = -EAGAIN; > >>> goto out_free; > >>> } > >>> trans = btrfs_start_transaction(tree_root, 0); > >>> ... > >>> > >>> With this fix, I do not hit the panic, and snapshot deletion proceeds > >>> and completes alright after mount. 
> >>> > >>> Do you agree to my analysis or I am missing something? It seems that > >>> Josef''s btrfs-next still has this issue (as does Chris''s for-linus). > >> > >> Sound analysis and I agree with the fix. The clean-by-one patch has been > >> merged into 3.10 so we need a stable fix for that. > > Thanks for confirming, David! > > > > From more testing, I have two more notes: > > > > # After applying the fix, whenever snapshot deletion is resumed after > > mount, and successfully completes, then I unmount again, and rmmod > > btrfs, linux complains about loosing few "struct extent_buffer" during > > kem_cache_delete(). > > So somewhere on that path: > > if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { > > ... > > } else { > > ===> HERE > > > > and later we perhaps somehow overwrite the contents of "struct > > btrfs_path" that is used in the whole function. Because at the end of > > the function we always do btrfs_free_path(), which inside does > > btrfs_release_path(). I was not able to determine where the leak > > happens, do you have any hint? No other activity happens in the system > > except the resumed snap deletion, and this problem only happens when > > resuming. > > > I found where the memory leak happens. When we abort snapshot deletion > in the middle, then this btrfs_root is basically left alone hanging in > the air. It is out of the "dead_roots" already, so when del_fs_roots() > is called during unmount, it will not free this root and its > root->node (which is the one that triggers memory leak warning on > kmem_cache_destroy) and perhaps other stuff too. The issue still > exists in btrfs-next. 
> > Simplest fix I came up with was: > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index d275681..52a2c54 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -7468,6 +7468,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > int err = 0; > int ret; > int level; > + bool root_freed = false; > > path = btrfs_alloc_path(); > if (!path) { > @@ -7641,6 +7642,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, > free_extent_buffer(root->commit_root); > btrfs_put_fs_root(root); > } > + root_freed = true; > + > out_end_trans: > btrfs_end_transaction_throttle(trans, tree_root); > out_free: > @@ -7649,6 +7652,18 @@ out_free: > out: > if (err) > btrfs_std_error(root->fs_info, err); > + > + /* > + * If the root was not freed by any reason, this means that FS had > + * a problem and will probably be unmounted soon. > + * But we need to put the root back into the ''dead_roots'' list, > + * so that it will be properly freed during unmount. > + */ > + if (!root_freed) { > + WARN_ON(err == 0); > + btrfs_add_dead_root(root); > + } > + > return err; > } > > With this fix, I don''t see any memleak warnings (also by enabling > LEAK_DEBUG) while aborting and resuming snapshot deletion. > > > > # This is for Josef: after I unmount the fs with ongoing snap deletion > > (after applying my fix), and run the latest btrfsck - it complains a > > lot about problems in extent tree:( But after I mount again, snap > > deletion resumes then completes, then I unmount and btrfsck is happy > > again. So probably it does not account orphan roots properly? > > > > David, will you provide a fixed patch, if possible? > > > > Josef, David, I feel that I am not helpful enough by pinpointing the > problem and suggesting a fix, but not providing actual patch that > fixes it and can be applied. The reason is that it is difficult for me > to test the fix thoroughly on the latest upstream kernel (like > btrfs-next), for reasons I''m sure you understand. 
So I appreciate if > you could post these two fixes to the upstream kernel; but otherwise, > I will try to work and test them on the latest kernel myself. >This is perfect, you''ve given great fixes and great analysis. Since you have an actual patch for this one please re-send with a Signed-off-by and such so I can apply it. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html