This comes from one of btrfs''s project ideas, As we defragment files, we break any sharing from other snapshots. The balancing code will preserve the sharing, and defrag needs to grow this as well. Now we''re able to fill the blank with this patch, in which we make full use of backref walking stuff. Here is the basic idea, o set the writeback ranges started by defragment with flag EXTENT_DEFRAG o at endio, after we finish updating fs tree, we use backref walking to find all parents of the ranges and re-link them with the new COWed file layout by adding corresponding backrefs. Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> Signed-off-by: Liu Bo <bo.li.liu@oracle.com> --- v4->v5: - Clarify the comments for duplicated refs. - Clear defrag flag after we''re ready to defrag. - Fix a bug on HOLE extent. v3->v4: - Fix duplicated refs bugs detected by mounting with autodefrag, thanks for the bug report from Mitch and Chris. v2->v3: - Rebase v1->v2: - Address comments from David. fs/btrfs/inode.c | 644 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 644 insertions(+), 0 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb4ea9b8..700fb48 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -54,6 +54,7 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -1892,6 +1893,630 @@ out: return ret; } +/* snapshot-aware defrag */ +struct sa_defrag_extent_backref { + struct rb_node node; + struct old_sa_defrag_extent *old; + u64 root_id; + u64 inum; + u64 file_pos; + u64 extent_offset; + u64 num_bytes; + u64 generation; +}; + +struct old_sa_defrag_extent { + struct list_head list; + struct new_sa_defrag_extent *new; + + u64 extent_offset; + u64 bytenr; + u64 offset; + u64 len; + int count; +}; + +struct new_sa_defrag_extent { + struct rb_root root; + struct list_head head; + struct btrfs_path *path; + struct inode *inode; + u64 file_pos; + u64 len; + u64 
bytenr; + u64 disk_len; + u8 compress_type; +}; + +static int backref_comp(struct sa_defrag_extent_backref *b1, + struct sa_defrag_extent_backref *b2) +{ + if (b1->root_id < b2->root_id) + return -1; + else if (b1->root_id > b2->root_id) + return 1; + + if (b1->inum < b2->inum) + return -1; + else if (b1->inum > b2->inum) + return 1; + + if (b1->file_pos < b2->file_pos) + return -1; + else if (b1->file_pos > b2->file_pos) + return 1; + + /* + * [------------------------------] ===> (a range of space) + * |<--->| |<---->| =============> (fs/file tree A) + * |<---------------------------->| ===> (fs/file tree B) + * + * A range of space can refer to two file extents in one tree while + * refer to only one file extent in another tree. + * + * So we may process a disk offset more than one time(two extents in A) + * and locate at the same extent(one extent in B), then insert two same + * backrefs(both refer to the extent in B). + */ + return 0; +} + +static void backref_insert(struct rb_root *root, + struct sa_defrag_extent_backref *backref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sa_defrag_extent_backref *entry; + int ret; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct sa_defrag_extent_backref, node); + + ret = backref_comp(backref, entry); + if (ret < 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&backref->node, parent, p); + rb_insert_color(&backref->node, root); +} + +/* + * Note the backref might has changed, and in this case we just return 0. 
+ */ +static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, + void *ctx) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_fs_info *fs_info; + struct old_sa_defrag_extent *old = ctx; + struct new_sa_defrag_extent *new = old->new; + struct btrfs_path *path = new->path; + struct btrfs_key key; + struct btrfs_root *root; + struct sa_defrag_extent_backref *backref; + struct extent_buffer *leaf; + struct inode *inode = new->inode; + int slot; + int ret; + u64 extent_offset; + u64 num_bytes; + + if (BTRFS_I(inode)->root->root_key.objectid == root_id && + inum == btrfs_ino(inode)) + return 0; + + key.objectid = root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + WARN_ON(1); + pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", + inum, offset, root_id); + return PTR_ERR(root); + } + + key.objectid = inum; + key.type = BTRFS_EXTENT_DATA_KEY; + if (offset > (u64)-1 << 32) + key.offset = 0; + else + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + WARN_ON(1); + return ret; + } + + while (1) { + cond_resched(); + + leaf = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + continue; + } + + path->slots[0]++; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.objectid > inum) + goto out; + + if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) + continue; + + extent_offset = btrfs_file_extent_offset(leaf, extent); + if (key.offset - extent_offset != offset) + continue; + + num_bytes = 
btrfs_file_extent_num_bytes(leaf, extent); + if (extent_offset >= old->extent_offset + old->offset + + old->len || extent_offset + num_bytes <= + old->extent_offset + old->offset) + continue; + + break; + } + + backref = kmalloc(sizeof(*backref), GFP_NOFS); + if (!backref) { + ret = -ENOENT; + goto out; + } + + backref->root_id = root_id; + backref->inum = inum; + backref->file_pos = offset + extent_offset; + backref->num_bytes = num_bytes; + backref->extent_offset = extent_offset; + backref->generation = btrfs_file_extent_generation(leaf, extent); + backref->old = old; + backref_insert(&new->root, backref); + old->count++; +out: + btrfs_release_path(path); + WARN_ON(ret); + return ret; +} + +static noinline bool record_extent_backrefs(struct btrfs_path *path, + struct new_sa_defrag_extent *new) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; + struct old_sa_defrag_extent *old, *tmp; + int ret; + + new->path = path; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + ret = iterate_inodes_from_logical(old->bytenr, fs_info, + path, record_one_backref, + old); + BUG_ON(ret < 0 && ret != -ENOENT); + + /* no backref to be processed for this extent */ + if (!old->count) { + list_del(&old->list); + kfree(old); + } + } + + if (list_empty(&new->head)) + return false; + + return true; +} + +static int relink_is_mergable(struct extent_buffer *leaf, + struct btrfs_file_extent_item *fi, + u64 disk_bytenr) +{ + if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + return 0; + + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) + return 0; + + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + return 1; +} + +/* + * Note the backref might have changed, and in this case we just return 0. 
+ */ +static noinline int relink_extent_backref(struct btrfs_path *path, + struct sa_defrag_extent_backref *prev, + struct sa_defrag_extent_backref *backref) +{ + struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *item; + struct btrfs_ordered_extent *ordered; + struct btrfs_trans_handle *trans; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct old_sa_defrag_extent *old = backref->old; + struct new_sa_defrag_extent *new = old->new; + struct inode *src_inode = new->inode; + struct inode *inode; + struct extent_state *cached = NULL; + int ret = 0; + u64 start; + u64 len; + u64 lock_start; + u64 lock_end; + bool merge = false; + + if (prev && prev->root_id == backref->root_id && + prev->inum == backref->inum && + prev->file_pos + prev->num_bytes == backref->file_pos) + merge = true; + + /* step 1: get root */ + key.objectid = backref->root_id; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + fs_info = BTRFS_I(src_inode)->root->fs_info; + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + if (PTR_ERR(root) == -ENOENT) + return 0; + return PTR_ERR(root); + } + + /* step 2: get inode */ + key.objectid = backref->inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { + if (inode && !IS_ERR(inode)) + iput(inode); + return 0; + } + + /* step 3: relink backref */ + lock_start = backref->file_pos; + lock_end = backref->file_pos + backref->num_bytes - 1; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + 0, &cached); + + ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); + if (ordered) { + btrfs_put_ordered_extent(ordered); + goto out_unlock; + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } + + key.objectid = backref->inum; + key.type = 
BTRFS_EXTENT_DATA_KEY; + key.offset = backref->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out_free_path; + } else if (ret > 0) { + ret = 0; + goto out_free_path; + } + + extent = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_generation(path->nodes[0], extent) != + backref->generation) + goto out_free_path; + + btrfs_release_path(path); + + start = backref->file_pos; + if (backref->extent_offset < old->extent_offset + old->offset) + start += old->extent_offset + old->offset - + backref->extent_offset; + + len = min(backref->extent_offset + backref->num_bytes, + old->extent_offset + old->offset + old->len); + len -= max(backref->extent_offset, old->extent_offset + old->offset); + + ret = btrfs_drop_extents(trans, root, inode, start, + start + len, 1); + if (ret) + goto out_free_path; +again: + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + + if (merge) { + struct btrfs_file_extent_item *fi; + u64 extent_len; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) + goto out_free_path; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + + if (relink_is_mergable(leaf, fi, new->bytenr) && + extent_len + found_key.offset == start) { + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len + len); + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = 1; + goto out_free_path; + } else { + merge = false; + btrfs_release_path(path); + goto again; + } + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + leaf = path->nodes[0]; + item = 
btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); + btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); + btrfs_set_file_extent_num_bytes(leaf, item, len); + btrfs_set_file_extent_ram_bytes(leaf, item, new->len); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, new->compress_type); + btrfs_set_file_extent_encryption(leaf, item, 0); + btrfs_set_file_extent_other_encoding(leaf, item, 0); + + btrfs_mark_buffer_dirty(leaf); + inode_add_bytes(inode, len); + + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, + new->disk_len, 0, + backref->root_id, backref->inum, + new->file_pos, 0); /* start - extent_offset */ + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_free_path; + } + + ret = 1; +out_free_path: + btrfs_release_path(path); + btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, + &cached, GFP_NOFS); + iput(inode); + return ret; +} + +static void relink_file_extents(struct new_sa_defrag_extent *new) +{ + struct btrfs_path *path; + struct old_sa_defrag_extent *old, *tmp; + struct sa_defrag_extent_backref *backref; + struct sa_defrag_extent_backref *prev = NULL; + struct inode *inode; + struct btrfs_root *root; + struct rb_node *node; + int ret; + + inode = new->inode; + root = BTRFS_I(inode)->root; + + path = btrfs_alloc_path(); + if (!path) + return; + + if (!record_extent_backrefs(path, new)) { + btrfs_free_path(path); + goto out; + } + btrfs_release_path(path); + + while (1) { + node = rb_first(&new->root); + if (!node) + break; + rb_erase(node, &new->root); + + backref = rb_entry(node, struct sa_defrag_extent_backref, node); + + ret = relink_extent_backref(path, prev, backref); + 
WARN_ON(ret < 0); + + kfree(prev); + + if (ret == 1) + prev = backref; + else + prev = NULL; + cond_resched(); + } + kfree(prev); + + btrfs_free_path(path); + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out: + atomic_dec(&root->fs_info->defrag_running); + wake_up(&root->fs_info->transaction_wait); + + kfree(new); +} + +static struct new_sa_defrag_extent * +record_old_file_extents(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct old_sa_defrag_extent *old, *tmp; + struct new_sa_defrag_extent *new; + int ret; + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return NULL; + + new->inode = inode; + new->file_pos = ordered->file_offset; + new->len = ordered->len; + new->bytenr = ordered->start; + new->disk_len = ordered->disk_len; + new->compress_type = ordered->compress_type; + new->root = RB_ROOT; + INIT_LIST_HEAD(&new->head); + + path = btrfs_alloc_path(); + if (!path) + goto out_kfree; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = new->file_pos; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out_free_path; + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + /* find out all the old extents for the file range */ + while (1) { + struct btrfs_file_extent_item *extent; + struct extent_buffer *l; + int slot; + u64 num_bytes; + u64 offset; + u64 end; + u64 disk_bytenr; + u64 extent_offset; + + l = path->nodes[0]; + slot = path->slots[0]; + + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out_free_list; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid != btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + break; + if (key.offset >= new->file_pos + new->len) + break; + + extent = 
btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); + + num_bytes = btrfs_file_extent_num_bytes(l, extent); + if (key.offset + num_bytes < new->file_pos) + goto next; + + disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); + if (!disk_bytenr) + goto next; + + extent_offset = btrfs_file_extent_offset(l, extent); + + old = kmalloc(sizeof(*old), GFP_NOFS); + if (!old) + goto out_free_list; + + offset = max(new->file_pos, key.offset); + end = min(new->file_pos + new->len, key.offset + num_bytes); + + old->bytenr = disk_bytenr; + old->extent_offset = extent_offset; + old->offset = offset - key.offset; + old->len = end - offset; + old->new = new; + old->count = 0; + list_add_tail(&old->list, &new->head); +next: + path->slots[0]++; + cond_resched(); + } + + btrfs_free_path(path); + atomic_inc(&root->fs_info->defrag_running); + + return new; + +out_free_list: + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } +out_free_path: + btrfs_free_path(path); +out_kfree: + kfree(new); + return NULL; +} + /* * helper function for btrfs_finish_ordered_io, this * just reads in some of the csum leaves to prime them into ram @@ -1909,6 +2534,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; + struct new_sa_defrag_extent *new = NULL; int compress_type = 0; int ret; bool nolock; @@ -1943,6 +2569,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state); + ret = test_range_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 1, cached_state); + if (ret) { + u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); + if (last_snapshot >= BTRFS_I(inode)->generation) + /* the inode is shared */ + 
new = record_old_file_extents(inode, ordered_extent); + + clear_extent_bit(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); + } + if (nolock) trans = btrfs_join_transaction_nolock(root); else @@ -2012,6 +2652,10 @@ out: */ btrfs_remove_ordered_extent(inode, ordered_extent); + /* for snapshot-aware defrag */ + if (new) + relink_file_extents(new); + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ -- 1.7.7.6 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote:> This comes from one of btrfs''s project ideas, > As we defragment files, we break any sharing from other snapshots. > The balancing code will preserve the sharing, and defrag needs to grow this > as well. > > Now we''re able to fill the blank with this patch, in which we make full use of > backref walking stuff. > > Here is the basic idea, > o set the writeback ranges started by defragment with flag EXTENT_DEFRAG > o at endio, after we finish updating fs tree, we use backref walking to find > all parents of the ranges and re-link them with the new COWed file layout by > adding corresponding backrefs. > > Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com> > --- > v4->v5: > - Clarify the comments for duplicated refs. > - Clear defrag flag after we''re ready to defrag. > - Fix a bug on HOLE extent. > v3->v4: > - Fix duplicated refs bugs detected by mounting with autodefrag, thanks > for the bug report from Mitch and Chris. > v2->v3: > - Rebase > v1->v2: > - Address comments from David. >I''ve been testing this patch on a 3.7.2 kernel merged with the for-linus branch for the 3.8_rc kernels, and I''m seeing the following error: [16028.159400] general protection fault: 0000 [#1] SMP [16028.159461] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec tg3 snd_hwdep snd_pcm snd_page_alloc snd_timer snd sr_mod ppdev parport_pc parport microcode iTCO_wdt iTCO_vendor_support floppy lpc_ich i2c_i801 serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [16028.159952] CPU 0 [16028.159975] Pid: 4420, comm: btrfs-cleaner Not tainted 3.7.2-sad+ #4 Dell Inc. 
OptiPlex 745 /0WF810 [16028.160002] RIP: 0010:[<ffffffffa017b4f2>] [<ffffffffa017b4f2>] btrfs_clean_old_snapshots+0xa6/0x12c [btrfs] [16028.160002] RSP: 0000:ffff880078609e38 EFLAGS: 00010282 [16028.160002] RAX: dead000000200200 RBX: ffff880000000000 RCX: 0000000000018e20 [16028.160002] RDX: dead000000100100 RSI: 000000000000001b RDI: 000000000000001b [16028.160002] RBP: ffff880078609e78 R08: 00000000001c001b R09: ffffffffa015aa01 [16028.160002] R10: ffffffffa016bbbd R11: ffff8800183a4800 R12: 0000160000000000 [16028.160002] R13: ffff880078609e38 R14: ffff8800183a4800 R15: ffff8800183a4c38 [16028.160002] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) knlGS:0000000000000000 [16028.160002] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [16028.160002] CR2: 00007f64f5214d96 CR3: 0000000011ef2000 CR4: 00000000000007f0 [16028.160002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [16028.160002] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [16028.160002] Process btrfs-cleaner (pid: 4420, threadinfo ffff880078608000, task ffff88007ca62120) [16028.160002] Stack: [16028.160002] ffff8800183a4c38 ffff8800020e3c38 ffff880078609e48 ffff88007921b800 [16028.160002] ffff88007ca62120 ffff88007ca62120 ffff88007ca62120 0000000000000000 [16028.160002] ffff880078609eb8 ffffffffa0173f68 ffff88007921b800 0000000000000000 [16028.160002] Call Trace: [16028.160002] [<ffffffffa0173f68>] cleaner_kthread+0x5a/0xe6 [btrfs] [16028.160002] [<ffffffffa0173f0e>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [16028.160002] [<ffffffff8104c9c3>] kthread+0xba/0xc2 [16028.160002] [<ffffffff8104c909>] ? kthread_freezable_should_stop+0x52/0x52 [16028.160002] [<ffffffff815f9d9c>] ret_from_fork+0x7c/0xb0 [16028.160002] [<ffffffff8104c909>] ? 
kthread_freezable_should_stop+0x52/0x52 [16028.160002] Code: 49 bc 00 00 00 00 00 16 00 00 48 bb 00 00 00 00 00 88 ff ff eb 7d 4d 8d b7 c8 fb ff ff 4d 85 ff 75 02 0f 0b 49 8b 17 49 8b 47 08 <48> 89 42 08 48 89 10 48 be 00 01 10 00 00 00 ad de 49 89 37 48 [16028.160002] RIP [<ffffffffa017b4f2>] btrfs_clean_old_snapshots+0xa6/0x12c [btrfs] [16028.160002] RSP <ffff880078609e38> [16028.170584] ---[ end trace 4034e68ac40e6c2b ]--- Using gdb to identify the location of the GPF gives me the following: (gdb) list *(btrfs_clean_old_snapshots+0xa6) 0x2a4f2 is in btrfs_clean_old_snapshots (include/linux/list.h:88). 83 * This is only for internal list manipulation where we know 84 * the prev/next entries already! 85 */ 86 static inline void __list_del(struct list_head * prev, struct list_head * next) 87 { 88 next->prev = prev; 89 prev->next = next; 90 } 91 92 /** I''ve tried to trap the error with a BUG_ON prior to deleting the list, but my attempt isn''t catching the error: @@ -1769,6 +1769,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) int ret; root = list_entry(list.next, struct btrfs_root, root_list); + BUG_ON(&root->root_list == NULL); list_del(&root->root_list); btrfs_kill_all_delayed_nodes(root); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 17, 2013 at 08:42:46AM -0600, Mitch Harder wrote: > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > > This comes from one of btrfs's project ideas, > > As we defragment files, we break any sharing from other snapshots. > > The balancing code will preserve the sharing, and defrag needs to grow this > > as well. > > > > Now we're able to fill the blank with this patch, in which we make full use of > > backref walking stuff. > > > > Here is the basic idea, > > o set the writeback ranges started by defragment with flag EXTENT_DEFRAG > > o at endio, after we finish updating fs tree, we use backref walking to find > > all parents of the ranges and re-link them with the new COWed file layout by > > adding corresponding backrefs. > > > > Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> > > Signed-off-by: Liu Bo <bo.li.liu@oracle.com> > > --- > > v4->v5: > > - Clarify the comments for duplicated refs. > > - Clear defrag flag after we're ready to defrag. > > - Fix a bug on HOLE extent. > > v3->v4: > > - Fix duplicated refs bugs detected by mounting with autodefrag, thanks > > for the bug report from Mitch and Chris. > > v2->v3: > > - Rebase > > v1->v2: > > - Address comments from David. > > > > I've been testing this patch on a 3.7.2 kernel merged with the > for-linus branch for the 3.8_rc kernels, and I'm seeing the following > error: Hi Mitch, Interesting! I don't even change the snapshot code at all. Is it reliably reproducible on your side? Still with the snapshot-test-pub scripts? 
thanks, liubo> > [16028.159400] general protection fault: 0000 [#1] SMP > [16028.159461] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec tg3 snd_hwdep snd_pcm snd_page_alloc > snd_timer snd sr_mod ppdev parport_pc parport microcode iTCO_wdt > iTCO_vendor_support floppy lpc_ich i2c_i801 serio_raw pcspkr > ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs > nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 > mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [16028.159952] CPU 0 > [16028.159975] Pid: 4420, comm: btrfs-cleaner Not tainted 3.7.2-sad+ > #4 Dell Inc. OptiPlex 745 /0WF810 > [16028.160002] RIP: 0010:[<ffffffffa017b4f2>] [<ffffffffa017b4f2>] > btrfs_clean_old_snapshots+0xa6/0x12c [btrfs] > [16028.160002] RSP: 0000:ffff880078609e38 EFLAGS: 00010282 > [16028.160002] RAX: dead000000200200 RBX: ffff880000000000 RCX: 0000000000018e20 > [16028.160002] RDX: dead000000100100 RSI: 000000000000001b RDI: 000000000000001b > [16028.160002] RBP: ffff880078609e78 R08: 00000000001c001b R09: ffffffffa015aa01 > [16028.160002] R10: ffffffffa016bbbd R11: ffff8800183a4800 R12: 0000160000000000 > [16028.160002] R13: ffff880078609e38 R14: ffff8800183a4800 R15: ffff8800183a4c38 > [16028.160002] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) > knlGS:0000000000000000 > [16028.160002] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > [16028.160002] CR2: 00007f64f5214d96 CR3: 0000000011ef2000 CR4: 00000000000007f0 > [16028.160002] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [16028.160002] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > [16028.160002] Process btrfs-cleaner (pid: 4420, threadinfo > ffff880078608000, task ffff88007ca62120) > [16028.160002] Stack: > [16028.160002] ffff8800183a4c38 ffff8800020e3c38 ffff880078609e48 > ffff88007921b800 > [16028.160002] ffff88007ca62120 ffff88007ca62120 ffff88007ca62120 > 0000000000000000 > [16028.160002] 
ffff880078609eb8 ffffffffa0173f68 ffff88007921b800 > 0000000000000000 > [16028.160002] Call Trace: > [16028.160002] [<ffffffffa0173f68>] cleaner_kthread+0x5a/0xe6 [btrfs] > [16028.160002] [<ffffffffa0173f0e>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [16028.160002] [<ffffffff8104c9c3>] kthread+0xba/0xc2 > [16028.160002] [<ffffffff8104c909>] ? kthread_freezable_should_stop+0x52/0x52 > [16028.160002] [<ffffffff815f9d9c>] ret_from_fork+0x7c/0xb0 > [16028.160002] [<ffffffff8104c909>] ? kthread_freezable_should_stop+0x52/0x52 > [16028.160002] Code: 49 bc 00 00 00 00 00 16 00 00 48 bb 00 00 00 00 > 00 88 ff ff eb 7d 4d 8d b7 c8 fb ff ff 4d 85 ff 75 02 0f 0b 49 8b 17 > 49 8b 47 08 <48> 89 42 08 48 89 10 48 be 00 01 10 00 00 00 ad de 49 89 > 37 48 > [16028.160002] RIP [<ffffffffa017b4f2>] > btrfs_clean_old_snapshots+0xa6/0x12c [btrfs] > [16028.160002] RSP <ffff880078609e38> > [16028.170584] ---[ end trace 4034e68ac40e6c2b ]--- > > Using gdb to identify the location of the GPF gives me the following: > > (gdb) list *(btrfs_clean_old_snapshots+0xa6) > 0x2a4f2 is in btrfs_clean_old_snapshots (include/linux/list.h:88). > 83 * This is only for internal list manipulation where we know > 84 * the prev/next entries already! 
> 85 */ > 86 static inline void __list_del(struct list_head * prev, struct > list_head * next) > 87 { > 88 next->prev = prev; > 89 prev->next = next; > 90 } > 91 > 92 /** > > I''ve tried to trap the error with a BUG_ON prior to deleting the list, > but my attempt isn''t catching the error: > > @@ -1769,6 +1769,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) > int ret; > > root = list_entry(list.next, struct btrfs_root, root_list); > + BUG_ON(&root->root_list == NULL); > list_del(&root->root_list); > > btrfs_kill_all_delayed_nodes(root); > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 17, 2013 at 6:53 PM, Liu Bo <bo.li.liu@oracle.com> wrote:> On Thu, Jan 17, 2013 at 08:42:46AM -0600, Mitch Harder wrote: >> On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> > This comes from one of btrfs''s project ideas, >> > As we defragment files, we break any sharing from other snapshots. >> > The balancing code will preserve the sharing, and defrag needs to grow this >> > as well. >> > >> > Now we''re able to fill the blank with this patch, in which we make full use of >> > backref walking stuff. >> > >> > Here is the basic idea, >> > o set the writeback ranges started by defragment with flag EXTENT_DEFRAG >> > o at endio, after we finish updating fs tree, we use backref walking to find >> > all parents of the ranges and re-link them with the new COWed file layout by >> > adding corresponding backrefs. >> > >> > Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> >> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com> >> > --- >> > v4->v5: >> > - Clarify the comments for duplicated refs. >> > - Clear defrag flag after we''re ready to defrag. >> > - Fix a bug on HOLE extent. >> > v3->v4: >> > - Fix duplicated refs bugs detected by mounting with autodefrag, thanks >> > for the bug report from Mitch and Chris. >> > v2->v3: >> > - Rebase >> > v1->v2: >> > - Address comments from David. >> > >> >> I''ve been testing this patch on a 3.7.2 kernel merged with the >> for-linus branch for the 3.8_rc kernels, and I''m seeing the following >> error: > > Hi Mitch, > > Insteresting! I don''t even change the snapshot code ever.Yes, this patch series has been excellent at tickling unrelated issues.> Is it reproducable stably from your side? Still with the > snapshot-test-pub scripts?I''m still using the same snapshot-test scripts, but they don''t reproduce reliably. I have to run for a while after my script reaches the point where it starts deleting snapshots to make space. But, I''ve been able to hit this error four times with this script. 
I'll try to keep playing with this to make a better reproducer, and to isolate the problem with the parameter supplied to list_del. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 17, 2013 at 08:42:46AM -0600, Mitch Harder wrote: > [16028.160002] RAX: dead000000200200 RBX: ffff880000000000 RCX: 0000000000018e20 > [16028.160002] RDX: dead000000100100 RSI: 000000000000001b RDI: 000000000000001b RAX: dead000000200200 RDX: dead000000100100 list_head poisons to mark deleted entries > I've tried to trap the error with a BUG_ON prior to deleting the list, > but my attempt isn't catching the error: > > @@ -1769,6 +1769,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) > int ret; > > root = list_entry(list.next, struct btrfs_root, root_list); > + BUG_ON(&root->root_list == NULL); You're taking an address and comparing it to NULL? This works, but only under very limited conditions :) If root is not null, then the structure is valid, but the root_list hook is not valid anymore, i.e. an inconsistency. david -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 18, 2013 at 6:19 AM, David Sterba <dsterba@suse.cz> wrote: > On Thu, Jan 17, 2013 at 08:42:46AM -0600, Mitch Harder wrote: >> [16028.160002] RAX: dead000000200200 RBX: ffff880000000000 RCX: 0000000000018e20 >> [16028.160002] RDX: dead000000100100 RSI: 000000000000001b RDI: 000000000000001b > > RAX: dead000000200200 > RDX: dead000000100100 > > list_head poisons to mark deleted entries > >> I''ve tried to trap the error with a BUG_ON prior to deleting the list, >> but my attempt isn''t catching the error: >> >> @@ -1769,6 +1769,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) >> int ret; >> >> root = list_entry(list.next, struct btrfs_root, root_list); >> + BUG_ON(&root->root_list == NULL); > > You''re taking an address and comparing it to NULL? This works, but only > under very limited conditions :) > > If root is not null, then the structure is valid, but the root_list hook > is not valid anymore, i.e. an inconsistency. Thanks, your feedback is kind. I wasn''t thinking when I wrote that. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder <mitch.harder@sabayonlinux.org> wrote:> On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> This comes from one of btrfs''s project ideas, >> As we defragment files, we break any sharing from other snapshots. >> The balancing code will preserve the sharing, and defrag needs to grow this >> as well. >> >> Now we''re able to fill the blank with this patch, in which we make full use of >> backref walking stuff. >> >> Here is the basic idea, >> o set the writeback ranges started by defragment with flag EXTENT_DEFRAG >> o at endio, after we finish updating fs tree, we use backref walking to find >> all parents of the ranges and re-link them with the new COWed file layout by >> adding corresponding backrefs. >> >> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> >> Signed-off-by: Liu Bo <bo.li.liu@oracle.com> >> --- >> v4->v5: >> - Clarify the comments for duplicated refs. >> - Clear defrag flag after we''re ready to defrag. >> - Fix a bug on HOLE extent. >> v3->v4: >> - Fix duplicated refs bugs detected by mounting with autodefrag, thanks >> for the bug report from Mitch and Chris. >> v2->v3: >> - Rebase >> v1->v2: >> - Address comments from David. >> > > I''ve been testing this patch on a 3.7.2 kernel merged with the > for-linus branch for the 3.8_rc kernels, and I''m seeing the following > error: >I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows some problem with an entry in the list. [59312.260441] ------------[ cut here ]------------ [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() [59312.260458] Hardware name: OptiPlex 745 [59312.260461] list_del corruption. 
next->prev should be ffff88006511c438, but was dead000000200200 [59312.260464] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [59312.260519] Pid: 20523, comm: btrfs-cleaner Not tainted 3.7.2-sad+ #1 [59312.260521] Call Trace: [59312.260529] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [59312.260549] [<ffffffffa015aa01>] ? reada_for_balance+0x187/0x218 [btrfs] [59312.260554] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 [59312.260566] [<ffffffffa015aa01>] ? reada_for_balance+0x187/0x218 [btrfs] [59312.260570] [<ffffffff812099e5>] __list_del_entry+0x8d/0x98 [59312.260574] [<ffffffff812099fe>] list_del+0xe/0x2e [59312.260590] [<ffffffffa017b325>] btrfs_clean_old_snapshots+0x101/0x168 [btrfs] [59312.260605] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] [59312.260619] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [59312.260624] [<ffffffff8104c750>] kthread+0xba/0xc2 [59312.260629] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [59312.260634] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 [59312.260639] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 [59312.260642] ---[ end trace 61b4cbd93690300f ]--- [59318.623735] ------------[ cut here ]------------ [59318.623751] WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() [59318.623755] Hardware name: OptiPlex 745 [59318.623760] list_del corruption, ffff88006511c438->next is LIST_POISON1 (dead000000100100) [59318.623766] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [59318.623840] Pid: 20523, comm: btrfs-cleaner Tainted: G W 3.7.2-sad+ #1 [59318.623844] Call Trace: [59318.623855] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [59318.623878] [<ffffffffa015aab9>] ? btrfs_free_path+0x27/0x2c [btrfs] [59318.623885] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 [59318.623901] [<ffffffffa015aab9>] ? btrfs_free_path+0x27/0x2c [btrfs] [59318.623907] [<ffffffff812099e5>] __list_del_entry+0x8d/0x98 [59318.623912] [<ffffffff812099fe>] list_del+0xe/0x2e [59318.623935] [<ffffffffa017b325>] btrfs_clean_old_snapshots+0x101/0x168 [btrfs] [59318.623955] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] [59318.623975] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [59318.623981] [<ffffffff8104c750>] kthread+0xba/0xc2 [59318.623988] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [59318.623994] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 [59318.624000] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 [59318.624022] ---[ end trace 61b4cbd936903010 ]--- [59318.626394] general protection fault: 0000 [#1] SMP [59318.626439] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [59318.626832] CPU 0 [59318.626849] Pid: 20523, comm: btrfs-cleaner Tainted: G W 3.7.2-sad+ #1 Dell Inc. OptiPlex 745 /0WF810 [59318.626926] RIP: 0010:[<ffffffffa017b349>] [<ffffffffa017b349>] btrfs_clean_old_snapshots+0x125/0x168 [btrfs] [59318.627018] RSP: 0018:ffff880078f43e38 EFLAGS: 00010206 [59318.627054] RAX: 0005800000021000 RBX: ffff880000000000 RCX: 0000000000000008 [59318.627098] RDX: 0000000000000000 RSI: ffff880078f43d70 RDI: ffff88006511c470 [59318.627141] RBP: ffff880078f43e78 R08: 0000000000000000 R09: ffff88004b61c3f0 [59318.627184] R10: 0000000000000001 R11: 0000000000000000 R12: 0000160000000000 [59318.627228] R13: ffff880078f43e38 R14: ffff88006511c000 R15: ffff88006511c438 [59318.627272] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) knlGS:0000000000000000 [59318.627322] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [59318.627358] CR2: 00007ff8b6d33375 CR3: 00000000788a0000 CR4: 00000000000007f0 [59318.627402] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [59318.627445] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [59318.627490] Process btrfs-cleaner (pid: 20523, threadinfo ffff880078f42000, task ffff88007c999a80) [59318.627543] Stack: [59318.627557] ffff88006511c438 ffff880002046438 ffff880078f43e48 ffff880028017800 [59318.627611] ffff88007c999a80 ffff88007c999a80 ffff88007c999a80 0000000000000000 
[59318.627663] ffff880078f43eb8 ffffffffa0173d99 ffff880028017800 0000000000000000 [59318.627719] Call Trace: [59318.627751] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] [59318.627804] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [59318.627853] [<ffffffff8104c750>] kthread+0xba/0xc2 [59318.627898] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [59318.627948] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 [59318.627986] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [59318.628033] Code: 89 ff e8 cb e6 08 e1 4c 89 f7 e8 22 f4 03 00 49 8b 87 c8 fb ff ff 48 8b 80 50 01 00 00 48 8b 00 4c 01 e0 48 c1 f8 06 48 c1 e0 0c <0f> b6 44 18 3f 31 c9 31 d2 85 c0 7e 05 ba 01 00 00 00 31 f6 4c [59318.628279] RIP [<ffffffffa017b349>] btrfs_clean_old_snapshots+0x125/0x168 [btrfs] [59318.628295] RSP <ffff880078f43e38> [59318.634447] ---[ end trace 61b4cbd936903011 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote:> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder > <mitch.harder@sabayonlinux.org> wrote: > > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > >> This comes from one of btrfs''s project ideas, > >> As we defragment files, we break any sharing from other snapshots. > >> The balancing code will preserve the sharing, and defrag needs to grow this > >> as well.[...]> > > > I''ve been testing this patch on a 3.7.2 kernel merged with the > > for-linus branch for the 3.8_rc kernels, and I''m seeing the following > > error: > > > > I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows > some problem with an entry in the list. > > [59312.260441] ------------[ cut here ]------------ > [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > [59312.260458] Hardware name: OptiPlex 745 > [59312.260461] list_del corruption. next->prev should be > ffff88006511c438, but was dead000000200200LIST_POISON2 -> (000000200200) So we can know that the next one is deleted from the list even _earlier_ than the current one is. Any other messages before this warning complains? thanks, liubo> [59312.260464] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm > ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc > snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper > cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd > sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache > sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [59312.260519] Pid: 20523, comm: btrfs-cleaner Not tainted 3.7.2-sad+ #1 > [59312.260521] Call Trace: > [59312.260529] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [59312.260549] [<ffffffffa015aa01>] ? 
reada_for_balance+0x187/0x218 [btrfs] > [59312.260554] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [59312.260566] [<ffffffffa015aa01>] ? reada_for_balance+0x187/0x218 [btrfs] > [59312.260570] [<ffffffff812099e5>] __list_del_entry+0x8d/0x98 > [59312.260574] [<ffffffff812099fe>] list_del+0xe/0x2e > [59312.260590] [<ffffffffa017b325>] > btrfs_clean_old_snapshots+0x101/0x168 [btrfs] > [59312.260605] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] > [59312.260619] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [59312.260624] [<ffffffff8104c750>] kthread+0xba/0xc2 > [59312.260629] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [59312.260634] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 > [59312.260639] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [59312.260642] ---[ end trace 61b4cbd93690300f ]--- > [59318.623735] ------------[ cut here ]------------ > [59318.623751] WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() > [59318.623755] Hardware name: OptiPlex 745 > [59318.623760] list_del corruption, ffff88006511c438->next is > LIST_POISON1 (dead000000100100) > [59318.623766] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm > ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc > snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper > cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd > sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache > sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [59318.623840] Pid: 20523, comm: btrfs-cleaner Tainted: G W > 3.7.2-sad+ #1 > [59318.623844] Call Trace: > [59318.623855] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [59318.623878] [<ffffffffa015aab9>] ? btrfs_free_path+0x27/0x2c [btrfs] > [59318.623885] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [59318.623901] [<ffffffffa015aab9>] ? 
btrfs_free_path+0x27/0x2c [btrfs] > [59318.623907] [<ffffffff812099e5>] __list_del_entry+0x8d/0x98 > [59318.623912] [<ffffffff812099fe>] list_del+0xe/0x2e > [59318.623935] [<ffffffffa017b325>] > btrfs_clean_old_snapshots+0x101/0x168 [btrfs] > [59318.623955] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] > [59318.623975] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [59318.623981] [<ffffffff8104c750>] kthread+0xba/0xc2 > [59318.623988] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [59318.623994] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 > [59318.624000] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [59318.624022] ---[ end trace 61b4cbd936903010 ]--- > [59318.626394] general protection fault: 0000 [#1] SMP > [59318.626439] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel i2c_i801 tg3 snd_hda_codec iTCO_wdt snd_hwdep snd_pcm > ppdev parport_pc sr_mod microcode floppy parport snd_page_alloc > snd_timer snd iTCO_vendor_support lpc_ich serio_raw pcspkr ablk_helper > cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd > sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache > sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [59318.626832] CPU 0 > [59318.626849] Pid: 20523, comm: btrfs-cleaner Tainted: G W > 3.7.2-sad+ #1 Dell Inc. 
OptiPlex 745 > /0WF810 > [59318.626926] RIP: 0010:[<ffffffffa017b349>] [<ffffffffa017b349>] > btrfs_clean_old_snapshots+0x125/0x168 [btrfs] > [59318.627018] RSP: 0018:ffff880078f43e38 EFLAGS: 00010206 > [59318.627054] RAX: 0005800000021000 RBX: ffff880000000000 RCX: 0000000000000008 > [59318.627098] RDX: 0000000000000000 RSI: ffff880078f43d70 RDI: ffff88006511c470 > [59318.627141] RBP: ffff880078f43e78 R08: 0000000000000000 R09: ffff88004b61c3f0 > [59318.627184] R10: 0000000000000001 R11: 0000000000000000 R12: 0000160000000000 > [59318.627228] R13: ffff880078f43e38 R14: ffff88006511c000 R15: ffff88006511c438 > [59318.627272] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) > knlGS:0000000000000000 > [59318.627322] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > [59318.627358] CR2: 00007ff8b6d33375 CR3: 00000000788a0000 CR4: 00000000000007f0 > [59318.627402] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [59318.627445] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > [59318.627490] Process btrfs-cleaner (pid: 20523, threadinfo > ffff880078f42000, task ffff88007c999a80) > [59318.627543] Stack: > [59318.627557] ffff88006511c438 ffff880002046438 ffff880078f43e48 > ffff880028017800 > [59318.627611] ffff88007c999a80 ffff88007c999a80 ffff88007c999a80 > 0000000000000000 > [59318.627663] ffff880078f43eb8 ffffffffa0173d99 ffff880028017800 > 0000000000000000 > [59318.627719] Call Trace: > [59318.627751] [<ffffffffa0173d99>] cleaner_kthread+0x5a/0xe6 [btrfs] > [59318.627804] [<ffffffffa0173d3f>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [59318.627853] [<ffffffff8104c750>] kthread+0xba/0xc2 > [59318.627898] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [59318.627948] [<ffffffff815f2f1c>] ret_from_fork+0x7c/0xb0 > [59318.627986] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [59318.628033] Code: 89 ff e8 cb e6 08 e1 4c 89 f7 e8 22 f4 03 00 49 > 8b 87 c8 fb ff ff 48 8b 80 50 01 00 00 48 8b 00 4c 01 e0 48 c1 f8 06 > 48 c1 e0 0c <0f> b6 44 18 3f 31 c9 31 d2 85 c0 7e 05 ba 01 00 00 00 31 > f6 4c > [59318.628279] RIP [<ffffffffa017b349>] > btrfs_clean_old_snapshots+0x125/0x168 [btrfs] > [59318.628295] RSP <ffff880078f43e38> > [59318.634447] ---[ end trace 61b4cbd936903011 ]----- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote:> On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: >> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder >> <mitch.harder@sabayonlinux.org> wrote: >> > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> >> This comes from one of btrfs''s project ideas, >> >> As we defragment files, we break any sharing from other snapshots. >> >> The balancing code will preserve the sharing, and defrag needs to grow this >> >> as well. > [...] >> > >> > I''ve been testing this patch on a 3.7.2 kernel merged with the >> > for-linus branch for the 3.8_rc kernels, and I''m seeing the following >> > error: >> > >> >> I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows >> some problem with an entry in the list. >> >> [59312.260441] ------------[ cut here ]------------ >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() >> [59312.260458] Hardware name: OptiPlex 745 >> [59312.260461] list_del corruption. next->prev should be >> ffff88006511c438, but was dead000000200200 > > LIST_POISON2 -> (000000200200) > So we can know that the next one is deleted from the list even _earlier_ > than the current one is. > > Any other messages before this warning complains? >Just some normal feedback from a metadata balance I had run. [14057.193343] device fsid 28c688c5-7dbd-4071-b271-1bf6726d8835 devid 1 transid 4 /dev/sda7 [14057.194438] btrfs: force lzo compression [14057.194446] btrfs: enabling auto defrag [14057.194449] btrfs: disk space caching is enabled [14057.194452] btrfs flagging fs with big metadata feature [14057.194455] btrfs: lzo incompat flag set. 
[57508.799193] btrfs: relocating block group 14516486144 flags 4 [57632.178797] btrfs: found 6775 extents [57633.214701] btrfs: relocating block group 11832131584 flags 4 [57776.400102] btrfs: found 6480 extents [57777.021175] btrfs: relocating block group 10489954304 flags 4 [57949.182725] btrfs: found 6681 extents [59312.260441] ------------[ cut here ]------------ [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() [59312.260458] Hardware name: OptiPlex 745 ... I''m going to try to wrap some debugging around the section of code in btrfs_clean_old_snapshots() where the dead_roots list is spliced onto the root list being processed. The double entry may be slipping in here. 1764 spin_lock(&fs_info->trans_lock); 1765 list_splice_init(&fs_info->dead_roots, &list); 1766 spin_unlock(&fs_info->trans_lock); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: > On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > > On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: > >> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder > >> <mitch.harder@sabayonlinux.org> wrote: > >> > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > >> >> This comes from one of btrfs''s project ideas, > >> >> As we defragment files, we break any sharing from other snapshots. > >> >> The balancing code will preserve the sharing, and defrag needs to grow this > >> >> as well. > > [...] > >> > > >> > I''ve been testing this patch on a 3.7.2 kernel merged with the > >> > for-linus branch for the 3.8_rc kernels, and I''m seeing the following > >> > error: > >> > > >> > >> I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows > >> some problem with an entry in the list. > >> > >> [59312.260441] ------------[ cut here ]------------ > >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > >> [59312.260458] Hardware name: OptiPlex 745 > >> [59312.260461] list_del corruption. next->prev should be > >> ffff88006511c438, but was dead000000200200 > > > > LIST_POISON2 -> (000000200200) > > So we can know that the next one is deleted from the list even _earlier_ > > than the current one is. > > > > Any other messages before this warning complains? > > > > Just some normal feedback from a metadata balance I had run. Well, these do fit my expectation, since balance also involves playing with root_list, which may lead to the bad situation. > > [14057.193343] device fsid 28c688c5-7dbd-4071-b271-1bf6726d8835 devid > 1 transid 4 /dev/sda7 > [14057.194438] btrfs: force lzo compression > [14057.194446] btrfs: enabling auto defrag > [14057.194449] btrfs: disk space caching is enabled > [14057.194452] btrfs flagging fs with big metadata feature > [14057.194455] btrfs: lzo incompat flag set. 
> [57508.799193] btrfs: relocating block group 14516486144 flags 4 > [57632.178797] btrfs: found 6775 extents > [57633.214701] btrfs: relocating block group 11832131584 flags 4 > [57776.400102] btrfs: found 6480 extents > [57777.021175] btrfs: relocating block group 10489954304 flags 4 > [57949.182725] btrfs: found 6681 extents > [59312.260441] ------------[ cut here ]------------ > [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > [59312.260458] Hardware name: OptiPlex 745 > ... > > I''m going to try to wrap some debugging around the section of code in > btrfs_clean_old_snapshots() where the dead_roots list is spliced onto > the root list being processed. The double entry may be slipping in > here. > > 1764 spin_lock(&fs_info->trans_lock); > 1765 list_splice_init(&fs_info->dead_roots, &list); > 1766 spin_unlock(&fs_info->trans_lock); Hmm, I don''t think there is anything wrong with this code. But you can give it a shot anyway :) thanks, liubo -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Jan 23, 2013 at 6:52 PM, Liu Bo <bo.li.liu@oracle.com> wrote:> On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: >> On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> > On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: >> >> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder >> >> <mitch.harder@sabayonlinux.org> wrote: >> >> > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> >> >> This comes from one of btrfs''s project ideas, >> >> >> As we defragment files, we break any sharing from other snapshots. >> >> >> The balancing code will preserve the sharing, and defrag needs to grow this >> >> >> as well. >> > [...] >> >> > >> >> > I''ve been testing this patch on a 3.7.2 kernel merged with the >> >> > for-linus branch for the 3.8_rc kernels, and I''m seeing the following >> >> > error: >> >> > >> >> >> >> I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows >> >> some problem with an entry in the list. >> >> >> >> [59312.260441] ------------[ cut here ]------------ >> >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() >> >> [59312.260458] Hardware name: OptiPlex 745 >> >> [59312.260461] list_del corruption. next->prev should be >> >> ffff88006511c438, but was dead000000200200 >> > >> > LIST_POISON2 -> (000000200200) >> > So we can know that the next one is deleted from the list even _earlier_ >> > than the current one is. >> > >> > Any other messages before this warning complains? >> > >> >> Just some normal feedback from a metadata balance I had run. > > Well, these do fit my expectation, since balance also involves with playing with > root_list, which may lead to the bad situation. 
> >> >> [14057.193343] device fsid 28c688c5-7dbd-4071-b271-1bf6726d8835 devid >> 1 transid 4 /dev/sda7 >> [14057.194438] btrfs: force lzo compression >> [14057.194446] btrfs: enabling auto defrag >> [14057.194449] btrfs: disk space caching is enabled >> [14057.194452] btrfs flagging fs with big metadata feature >> [14057.194455] btrfs: lzo incompat flag set. >> [57508.799193] btrfs: relocating block group 14516486144 flags 4 >> [57632.178797] btrfs: found 6775 extents >> [57633.214701] btrfs: relocating block group 11832131584 flags 4 >> [57776.400102] btrfs: found 6480 extents >> [57777.021175] btrfs: relocating block group 10489954304 flags 4 >> [57949.182725] btrfs: found 6681 extents >> [59312.260441] ------------[ cut here ]------------ >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() >> [59312.260458] Hardware name: OptiPlex 745 >> ... >> >> I''m going to try to wrap some debugging around the section of code in >> btrfs_clean_old_snapshots() where the dead_roots list is spliced onto >> the root list being processed. The double entry may be slipping in >> here. >> >> 1764 spin_lock(&fs_info->trans_lock); >> 1765 list_splice_init(&fs_info->dead_roots, &list); >> 1766 spin_unlock(&fs_info->trans_lock); > > hmm, I don''t think there is anything wrong in this code. But you can > give it a shot anyway :) >I''ve changed up my reproducer to try some things that may hit the issue quicker and more reliably. It gave me a slightly different set of warnings in dmesg, which seem to suggest issues in the dead_root list. [43925.656065] device fsid a8f6fadb-3022-4c01-b369-f1f3f638c052 devid 1 transid 310 /dev/sda7 [43925.658062] btrfs: force lzo compression [43925.658072] btrfs: enabling auto defrag [43925.658075] btrfs: disk space caching is enabled [43925.658078] btrfs: lzo incompat flag set. 
[44503.421293] btrfs: unlinked 1 orphans [44898.287365] btrfs: unlinked 1 orphans [45080.641383] btrfs: unlinked 1 orphans [45250.063773] btrfs: unlinked 1 orphans [46223.387355] btrfs: unlinked 1 orphans [46476.473944] btrfs: unlinked 1 orphans [46499.665615] btrfs: unlinked 1 orphans [46769.785454] ------------[ cut here ]------------ [46769.785471] WARNING: at lib/list_debug.c:36 __list_add+0x9d/0xba() [46769.785474] Hardware name: OptiPlex 745 [46769.785478] list_add double add: new=ffff880050c27c38, prev=ffff880078f3e720, next=ffff880050c27c38. [46769.785480] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [46769.785537] Pid: 18291, comm: btrfs-endio-wri Not tainted 3.7.4-sad-v1+ #3 [46769.785539] Call Trace: [46769.785549] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [46769.785553] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 [46769.785558] [<ffffffff8120987b>] __list_add+0x9d/0xba [46769.785586] [<ffffffffa0179dd6>] btrfs_add_dead_root+0x42/0x56 [btrfs] [46769.785603] [<ffffffffa0187b67>] btrfs_destroy_inode+0x227/0x25b [btrfs] [46769.785611] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 [46769.785615] [<ffffffff81113a9c>] evict+0x149/0x151 [46769.785619] [<ffffffff81114322>] iput+0x12c/0x135 [46769.785636] [<ffffffffa018455f>] relink_extent_backref+0x669/0x6af [btrfs] [46769.785642] [<ffffffff815e9849>] ? __slab_free+0x17c/0x21b [46769.785658] [<ffffffffa0184d15>] ? 
btrfs_finish_ordered_io+0x770/0x827 [btrfs] [46769.785674] [<ffffffffa0184ce5>] btrfs_finish_ordered_io+0x740/0x827 [btrfs] [46769.785691] [<ffffffffa0184de1>] finish_ordered_fn+0x15/0x17 [btrfs] [46769.785706] [<ffffffffa019e5c9>] worker_loop+0x14c/0x493 [btrfs] [46769.785722] [<ffffffffa019e47d>] ? btrfs_queue_worker+0x258/0x258 [btrfs] [46769.785728] [<ffffffff8104c750>] kthread+0xba/0xc2 [46769.785732] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46769.785737] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [46769.785741] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46769.785745] ---[ end trace 7528086f91b151b5 ]--- [46799.053062] ------------[ cut here ]------------ [46799.053078] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() [46799.053082] Hardware name: OptiPlex 745 [46799.053087] list_del corruption. next->prev should be ffff880050c27c38, but was ffff8800057fde38 [46799.053090] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [46799.053163] Pid: 18210, comm: btrfs-cleaner Tainted: G W 3.7.4-sad-v1+ #3 [46799.053166] Call Trace: [46799.053180] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [46799.053184] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 [46799.053190] [<ffffffff810ab4e9>] ? __trace_bprintk+0x48/0x4a [46799.053194] [<ffffffff812097a5>] __list_del_entry+0x8d/0x98 [46799.053198] [<ffffffff812097be>] list_del+0xe/0x2e [46799.053220] [<ffffffffa017b2f5>] btrfs_clean_old_snapshots+0xed/0x150 [btrfs] [46799.053235] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] [46799.053249] [<ffffffffa0173d23>] ? 
transaction_kthread+0x1a0/0x1a0 [btrfs] [46799.053254] [<ffffffff8104c750>] kthread+0xba/0xc2 [46799.053259] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46799.053264] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [46799.053269] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46799.053272] ---[ end trace 7528086f91b151b6 ]--- [46811.162649] ------------[ cut here ]------------ [46811.162665] WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() [46811.162669] Hardware name: OptiPlex 745 [46811.162674] list_del corruption, ffff880050c27c38->next is LIST_POISON1 (dead000000100100) [46811.162678] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [46811.162750] Pid: 18210, comm: btrfs-cleaner Tainted: G W 3.7.4-sad-v1+ #3 [46811.162754] Call Trace: [46811.162764] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [46811.162771] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 [46811.162779] [<ffffffff810ab4e9>] ? __trace_bprintk+0x48/0x4a [46811.162785] [<ffffffff812097a5>] __list_del_entry+0x8d/0x98 [46811.162791] [<ffffffff812097be>] list_del+0xe/0x2e [46811.162820] [<ffffffffa017b2f5>] btrfs_clean_old_snapshots+0xed/0x150 [btrfs] [46811.162841] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] [46811.162862] [<ffffffffa0173d23>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [46811.162869] [<ffffffff8104c750>] kthread+0xba/0xc2 [46811.162875] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46811.162882] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [46811.162888] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 [46811.162892] ---[ end trace 7528086f91b151b7 ]--- [46811.162904] BUG: unable to handle kernel paging request at 0000000047c5a000 [46811.163003] IP: [<ffffffffa017b30b>] btrfs_clean_old_snapshots+0x103/0x150 [btrfs] [46811.163003] PGD 0 [46811.163003] Oops: 0000 [#1] SMP [46811.163003] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [46811.163003] CPU 0 [46811.163003] Pid: 18210, comm: btrfs-cleaner Tainted: G W 3.7.4-sad-v1+ #3 Dell Inc. OptiPlex 745 /0WF810 [46811.163003] RIP: 0010:[<ffffffffa017b30b>] [<ffffffffa017b30b>] btrfs_clean_old_snapshots+0x103/0x150 [btrfs] [46811.163003] RSP: 0018:ffff8800057fde38 EFLAGS: 00010296 [46811.163003] RAX: 0000000047c5a000 RBX: ffff880050c27800 RCX: 0000000000000008 [46811.163003] RDX: 0000000000000000 RSI: ffff8800057fdd70 RDI: ffff880050c27c70 [46811.163003] RBP: ffff8800057fde78 R08: 0000000000000000 R09: 0000000000000283 [46811.163003] R10: 0000000000000001 R11: 0000000000000000 R12: ffff880000000000 [46811.163003] R13: 0000160000000000 R14: ffff8800057fde38 R15: ffff880050c27c38 [46811.163003] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) knlGS:0000000000000000 [46811.163003] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [46811.163003] CR2: 0000000047c5a000 CR3: 000000003f270000 CR4: 00000000000007f0 [46811.163003] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [46811.163003] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [46811.163003] Process btrfs-cleaner (pid: 18210, threadinfo ffff8800057fc000, task ffff88007c030d40) [46811.163003] Stack: 
[46811.163003] ffff880050c27c38 ffff88001f488438 ffff8800057fde48 ffff88002d15b800 [46811.163003] ffff88007c030d40 ffff88007c030d40 ffff88007c030d40 0000000000000000 [46811.163003] ffff8800057fdeb8 ffffffffa0173d7d ffff88002d15b800 0000000000000000 [46811.163003] Call Trace: [46811.163003] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] [46811.163003] [<ffffffffa0173d23>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [46811.163003] [<ffffffff8104c750>] kthread+0xba/0xc2 [46811.163003] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46811.163003] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [46811.163003] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [46811.163003] Code: c7 c7 d5 b2 17 a0 31 c0 e8 b4 01 f3 e0 4c 89 ff e8 bb e4 08 e1 48 89 df e8 f2 f5 03 00 49 8b 87 c8 fb ff ff 48 8b 80 50 01 00 00 <48> 8b 00 4c 01 e8 48 c1 f8 06 48 c1 e0 0c 42 0f b6 44 20 3f 31 [46811.163003] RIP [<ffffffffa017b30b>] btrfs_clean_old_snapshots+0x103/0x150 [btrfs] [46811.163003] RSP <ffff8800057fde38> [46811.163003] CR2: 0000000047c5a000 [46811.238512] ---[ end trace 7528086f91b151b8 ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 25 Jan 2013 08:55:58 -0600, Mitch Harder wrote:> On Wed, Jan 23, 2013 at 6:52 PM, Liu Bo <bo.li.liu@oracle.com> wrote: >> On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: >>> On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >>>> On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: >>>>> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder >>>>> <mitch.harder@sabayonlinux.org> wrote: >>>>>> On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >>>>>>> This comes from one of btrfs''s project ideas, >>>>>>> As we defragment files, we break any sharing from other snapshots. >>>>>>> The balancing code will preserve the sharing, and defrag needs to grow this >>>>>>> as well. >>>> [...] >>>>>> >>>>>> I''ve been testing this patch on a 3.7.2 kernel merged with the >>>>>> for-linus branch for the 3.8_rc kernels, and I''m seeing the following >>>>>> error:[...]> > I''ve changed up my reproducer to try some things that may hit the > issue quicker and more reliably. > > It gave me a slightly different set of warnings in dmesg, which seem > to suggest issues in the dead_root list.[...]> [46769.785454] ------------[ cut here ]------------ > [46769.785471] WARNING: at lib/list_debug.c:36 __list_add+0x9d/0xba() > [46769.785474] Hardware name: OptiPlex 745 > [46769.785478] list_add double add: new=ffff880050c27c38, > prev=ffff880078f3e720, next=ffff880050c27c38. 
> [46769.785480] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer > tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support > lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd > lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc > reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd > hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [46769.785537] Pid: 18291, comm: btrfs-endio-wri Not tainted 3.7.4-sad-v1+ #3 > [46769.785539] Call Trace: > [46769.785549] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [46769.785553] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [46769.785558] [<ffffffff8120987b>] __list_add+0x9d/0xba > [46769.785586] [<ffffffffa0179dd6>] btrfs_add_dead_root+0x42/0x56 [btrfs] > [46769.785603] [<ffffffffa0187b67>] btrfs_destroy_inode+0x227/0x25b [btrfs] > [46769.785611] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 > [46769.785615] [<ffffffff81113a9c>] evict+0x149/0x151 > [46769.785619] [<ffffffff81114322>] iput+0x12c/0x135 > [46769.785636] [<ffffffffa018455f>] relink_extent_backref+0x669/0x6af [btrfs] > [46769.785642] [<ffffffff815e9849>] ? __slab_free+0x17c/0x21b > [46769.785658] [<ffffffffa0184d15>] ? > btrfs_finish_ordered_io+0x770/0x827 [btrfs] > [46769.785674] [<ffffffffa0184ce5>] btrfs_finish_ordered_io+0x740/0x827 [btrfs] > [46769.785691] [<ffffffffa0184de1>] finish_ordered_fn+0x15/0x17 [btrfs] > [46769.785706] [<ffffffffa019e5c9>] worker_loop+0x14c/0x493 [btrfs] > [46769.785722] [<ffffffffa019e47d>] ? btrfs_queue_worker+0x258/0x258 [btrfs] > [46769.785728] [<ffffffff8104c750>] kthread+0xba/0xc2 > [46769.785732] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46769.785737] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [46769.785741] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [46769.785745] ---[ end trace 7528086f91b151b5 ]--- > [46799.053062] ------------[ cut here ]------------Well, the issue that I had reported on IRC some days ago which looks similar (the top part of the call trace is similar: iput -> evict -> destroy_inode -> btrfs_destroy_inode -> btrfs_add_dead_root -> list_add which warns in list_add in your case and crashes in my case) was without Liu Bo''s "snapshot-aware defrag" patch. A 3.8.0-rc4 kernel and nothing else. The reproducer was to create and destroy subvolumes and snapshots. I used btrfs-receive to fill them with data. The crash happened on umount. Every time. del_fs_roots() is attempting to empty the dead_roots list, and via btrfs_destroy_inode() deeper in the call stack they are added back to the dead_roots list. BUG: unable to handle kernel paging request at ffff88042503b830 IP: [<ffffffff814532b7>] __list_add+0x17/0xd0 PGD 1e0c063 PUD bf58e067 PMD bf6b7067 PTE 800000042503b160 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: btrfs bonding raid1 mpt2sas scsi_transport_sas raid_class CPU 2 Pid: 10259, comm: umount Not tainted 3.8.0-rc4+ #16 Supermicro X8SIL/X8SIL RIP: 0010:[<ffffffff814532b7>] [<ffffffff814532b7>] __list_add+0x17/0xd0 RSP: 0018:ffff8802f67a1bd8 EFLAGS: 00010286 RAX: ffff880425b7c560 RBX: ffff880423ca2828 RCX: 0000000000000001 RDX: ffff88042503b828 RSI: ffff8804257794c0 RDI: ffff880423ca2828 RBP: ffff8802f67a1bf8 R08: 0000000000077850 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff880423ca2000 R13: ffff880423ca2898 R14: 0000000000000000 R15: ffff8802f67a1d30 FS: 00007f6e89bba740(0000) GS:ffff88042ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88042503b830 CR3: 000000029a56c000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process umount (pid: 10259, threadinfo 
ffff8802f67a0000, task ffff880425b7c560) Stack: ffffffffa00a414f ffff880423ca2000 ffff880423ca2000 ffff880423ca2898 ffff8802f67a1c18 ffffffffa00a4170 ffff88042a60c1f8 ffff88042a60c1f8 ffff8802f67a1c48 ffffffffa00b3180 ffff88042a60c1f8 ffff88042a60c280 Call Trace: [<ffffffffa00a414f>] ? btrfs_add_dead_root+0x1f/0x60 [btrfs] [<ffffffffa00a4170>] btrfs_add_dead_root+0x40/0x60 [btrfs] [<ffffffffa00b3180>] btrfs_destroy_inode+0x1d0/0x2d0 [btrfs] [<ffffffff811b5d17>] destroy_inode+0x37/0x60 [<ffffffff811b5e4d>] evict+0x10d/0x1a0 [<ffffffff811b65f5>] iput+0x105/0x190 [<ffffffffa009bd68>] free_fs_root+0x18/0x90 [btrfs] [<ffffffffa009f1ab>] btrfs_free_fs_root+0x7b/0x90 [btrfs] [<ffffffffa009f26f>] del_fs_roots+0xaf/0xf0 [btrfs] [<ffffffffa00a0bc6>] close_ctree+0x1c6/0x300 [btrfs] [<ffffffff811b6a7c>] ? evict_inodes+0xec/0x100 [<ffffffffa00763a4>] btrfs_put_super+0x14/0x20 [btrfs] [<ffffffff8119dfcc>] generic_shutdown_super+0x5c/0xe0 [<ffffffff8119e0e1>] kill_anon_super+0x11/0x20 [<ffffffffa007a3a5>] btrfs_kill_super+0x15/0x90 [btrfs] [<ffffffff8119f111>] ? deactivate_super+0x41/0x70 [<ffffffff8119e4dd>] deactivate_locked_super+0x3d/0x70 [<ffffffff8119f119>] deactivate_super+0x49/0x70 [<ffffffff811ba772>] mntput_no_expire+0xd2/0x130 [<ffffffff811bb621>] sys_umount+0x71/0x390 [<ffffffff81983012>] system_call_fastpath+0x16/0x1b Code: 48 83 c4 08 5b 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 20 48 89 5d e8 4c 89 65 f0 48 89 fb 4c 89 6d f8 <4c> 8b 42 08 49 89 f5 49 89 d4 49 39 f0 75 31 4d 8b 45 00 4d 39 RIP [<ffffffff814532b7>] __list_add+0x17/0xd0 RSP <ffff8802f67a1bd8> CR2: ffff88042503b830 ---[ end trace 5e44f1afc74751aa ]--- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 25, 2013 at 08:55:58AM -0600, Mitch Harder wrote:> On Wed, Jan 23, 2013 at 6:52 PM, Liu Bo <bo.li.liu@oracle.com> wrote: > > On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: > >> On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > >> > On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: > >> >> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder > >> >> <mitch.harder@sabayonlinux.org> wrote: > >> >> > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > >> >> >> This comes from one of btrfs''s project ideas, > >> >> >> As we defragment files, we break any sharing from other snapshots. > >> >> >> The balancing code will preserve the sharing, and defrag needs to grow this > >> >> >> as well. > >> > [...] > >> >> > > >> >> > I''ve been testing this patch on a 3.7.2 kernel merged with the > >> >> > for-linus branch for the 3.8_rc kernels, and I''m seeing the following > >> >> > error: > >> >> > > >> >> > >> >> I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows > >> >> some problem with an entry in the list. > >> >> > >> >> [59312.260441] ------------[ cut here ]------------ > >> >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > >> >> [59312.260458] Hardware name: OptiPlex 745 > >> >> [59312.260461] list_del corruption. next->prev should be > >> >> ffff88006511c438, but was dead000000200200 > >> > > >> > LIST_POISON2 -> (000000200200) > >> > So we can know that the next one is deleted from the list even _earlier_ > >> > than the current one is. > >> > > >> > Any other messages before this warning complains? > >> > > >> > >> Just some normal feedback from a metadata balance I had run. > > > > Well, these do fit my expectation, since balance also involves with playing with > > root_list, which may lead to the bad situation. 
> > > >> > >> [14057.193343] device fsid 28c688c5-7dbd-4071-b271-1bf6726d8835 devid > >> 1 transid 4 /dev/sda7 > >> [14057.194438] btrfs: force lzo compression > >> [14057.194446] btrfs: enabling auto defrag > >> [14057.194449] btrfs: disk space caching is enabled > >> [14057.194452] btrfs flagging fs with big metadata feature > >> [14057.194455] btrfs: lzo incompat flag set. > >> [57508.799193] btrfs: relocating block group 14516486144 flags 4 > >> [57632.178797] btrfs: found 6775 extents > >> [57633.214701] btrfs: relocating block group 11832131584 flags 4 > >> [57776.400102] btrfs: found 6480 extents > >> [57777.021175] btrfs: relocating block group 10489954304 flags 4 > >> [57949.182725] btrfs: found 6681 extents > >> [59312.260441] ------------[ cut here ]------------ > >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > >> [59312.260458] Hardware name: OptiPlex 745 > >> ... > >> > >> I''m going to try to wrap some debugging around the section of code in > >> btrfs_clean_old_snapshots() where the dead_roots list is spliced onto > >> the root list being processed. The double entry may be slipping in > >> here. > >> > >> 1764 spin_lock(&fs_info->trans_lock); > >> 1765 list_splice_init(&fs_info->dead_roots, &list); > >> 1766 spin_unlock(&fs_info->trans_lock); > > > > hmm, I don''t think there is anything wrong in this code. But you can > > give it a shot anyway :) > > > > I''ve changed up my reproducer to try some things that may hit the > issue quicker and more reliably. > > It gave me a slightly different set of warnings in dmesg, which seem > to suggest issues in the dead_root list.Great! Many thanks for nail it down, we really shouldn''t iput() after btrfs_iget(). Could you please try this(remove iput()) and see if it gets us rid of the trouble? 
thanks, liubo diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1683f48..c7a0fb7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2337,7 +2337,6 @@ out_free_path: out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, &cached, GFP_NOFS); - iput(inode); return ret; }> > [43925.656065] device fsid a8f6fadb-3022-4c01-b369-f1f3f638c052 devid > 1 transid 310 /dev/sda7 > [43925.658062] btrfs: force lzo compression > [43925.658072] btrfs: enabling auto defrag > [43925.658075] btrfs: disk space caching is enabled > [43925.658078] btrfs: lzo incompat flag set. > [44503.421293] btrfs: unlinked 1 orphans > [44898.287365] btrfs: unlinked 1 orphans > [45080.641383] btrfs: unlinked 1 orphans > [45250.063773] btrfs: unlinked 1 orphans > [46223.387355] btrfs: unlinked 1 orphans > [46476.473944] btrfs: unlinked 1 orphans > [46499.665615] btrfs: unlinked 1 orphans > [46769.785454] ------------[ cut here ]------------ > [46769.785471] WARNING: at lib/list_debug.c:36 __list_add+0x9d/0xba() > [46769.785474] Hardware name: OptiPlex 745 > [46769.785478] list_add double add: new=ffff880050c27c38, > prev=ffff880078f3e720, next=ffff880050c27c38. 
> [46769.785480] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer > tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support > lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd > lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc > reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd > hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [46769.785537] Pid: 18291, comm: btrfs-endio-wri Not tainted 3.7.4-sad-v1+ #3 > [46769.785539] Call Trace: > [46769.785549] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [46769.785553] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [46769.785558] [<ffffffff8120987b>] __list_add+0x9d/0xba > [46769.785586] [<ffffffffa0179dd6>] btrfs_add_dead_root+0x42/0x56 [btrfs] > [46769.785603] [<ffffffffa0187b67>] btrfs_destroy_inode+0x227/0x25b [btrfs] > [46769.785611] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 > [46769.785615] [<ffffffff81113a9c>] evict+0x149/0x151 > [46769.785619] [<ffffffff81114322>] iput+0x12c/0x135 > [46769.785636] [<ffffffffa018455f>] relink_extent_backref+0x669/0x6af [btrfs] > [46769.785642] [<ffffffff815e9849>] ? __slab_free+0x17c/0x21b > [46769.785658] [<ffffffffa0184d15>] ? > btrfs_finish_ordered_io+0x770/0x827 [btrfs] > [46769.785674] [<ffffffffa0184ce5>] btrfs_finish_ordered_io+0x740/0x827 [btrfs] > [46769.785691] [<ffffffffa0184de1>] finish_ordered_fn+0x15/0x17 [btrfs] > [46769.785706] [<ffffffffa019e5c9>] worker_loop+0x14c/0x493 [btrfs] > [46769.785722] [<ffffffffa019e47d>] ? btrfs_queue_worker+0x258/0x258 [btrfs] > [46769.785728] [<ffffffff8104c750>] kthread+0xba/0xc2 > [46769.785732] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46769.785737] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [46769.785741] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [46769.785745] ---[ end trace 7528086f91b151b5 ]--- > [46799.053062] ------------[ cut here ]------------ > [46799.053078] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() > [46799.053082] Hardware name: OptiPlex 745 > [46799.053087] list_del corruption. next->prev should be > ffff880050c27c38, but was ffff8800057fde38 > [46799.053090] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer > tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support > lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd > lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc > reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd > hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [46799.053163] Pid: 18210, comm: btrfs-cleaner Tainted: G W > 3.7.4-sad-v1+ #3 > [46799.053166] Call Trace: > [46799.053180] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [46799.053184] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [46799.053190] [<ffffffff810ab4e9>] ? __trace_bprintk+0x48/0x4a > [46799.053194] [<ffffffff812097a5>] __list_del_entry+0x8d/0x98 > [46799.053198] [<ffffffff812097be>] list_del+0xe/0x2e > [46799.053220] [<ffffffffa017b2f5>] > btrfs_clean_old_snapshots+0xed/0x150 [btrfs] > [46799.053235] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] > [46799.053249] [<ffffffffa0173d23>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [46799.053254] [<ffffffff8104c750>] kthread+0xba/0xc2 > [46799.053259] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46799.053264] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [46799.053269] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [46799.053272] ---[ end trace 7528086f91b151b6 ]--- > [46811.162649] ------------[ cut here ]------------ > [46811.162665] WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() > [46811.162669] Hardware name: OptiPlex 745 > [46811.162674] list_del corruption, ffff880050c27c38->next is > LIST_POISON1 (dead000000100100) > [46811.162678] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer > tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support > lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd > lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc > reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd > hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [46811.162750] Pid: 18210, comm: btrfs-cleaner Tainted: G W > 3.7.4-sad-v1+ #3 > [46811.162754] Call Trace: > [46811.162764] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [46811.162771] [<ffffffff81030641>] warn_slowpath_fmt+0x46/0x48 > [46811.162779] [<ffffffff810ab4e9>] ? __trace_bprintk+0x48/0x4a > [46811.162785] [<ffffffff812097a5>] __list_del_entry+0x8d/0x98 > [46811.162791] [<ffffffff812097be>] list_del+0xe/0x2e > [46811.162820] [<ffffffffa017b2f5>] > btrfs_clean_old_snapshots+0xed/0x150 [btrfs] > [46811.162841] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] > [46811.162862] [<ffffffffa0173d23>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [46811.162869] [<ffffffff8104c750>] kthread+0xba/0xc2 > [46811.162875] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46811.162882] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [46811.162888] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [46811.162892] ---[ end trace 7528086f91b151b7 ]--- > [46811.162904] BUG: unable to handle kernel paging request at 0000000047c5a000 > [46811.163003] IP: [<ffffffffa017b30b>] > btrfs_clean_old_snapshots+0x103/0x150 [btrfs] > [46811.163003] PGD 0 > [46811.163003] Oops: 0000 [#1] SMP > [46811.163003] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm snd_page_alloc snd_timer > tg3 sr_mod snd i2c_i801 ppdev parport_pc iTCO_wdt iTCO_vendor_support > lpc_ich pcspkr parport floppy serio_raw microcode ablk_helper cryptd > lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc > reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd > hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [46811.163003] CPU 0 > [46811.163003] Pid: 18210, comm: btrfs-cleaner Tainted: G W > 3.7.4-sad-v1+ #3 Dell Inc. OptiPlex 745 > /0WF810 > [46811.163003] RIP: 0010:[<ffffffffa017b30b>] [<ffffffffa017b30b>] > btrfs_clean_old_snapshots+0x103/0x150 [btrfs] > [46811.163003] RSP: 0018:ffff8800057fde38 EFLAGS: 00010296 > [46811.163003] RAX: 0000000047c5a000 RBX: ffff880050c27800 RCX: 0000000000000008 > [46811.163003] RDX: 0000000000000000 RSI: ffff8800057fdd70 RDI: ffff880050c27c70 > [46811.163003] RBP: ffff8800057fde78 R08: 0000000000000000 R09: 0000000000000283 > [46811.163003] R10: 0000000000000001 R11: 0000000000000000 R12: ffff880000000000 > [46811.163003] R13: 0000160000000000 R14: ffff8800057fde38 R15: ffff880050c27c38 > [46811.163003] FS: 0000000000000000(0000) GS:ffff88007f200000(0000) > knlGS:0000000000000000 > [46811.163003] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > [46811.163003] CR2: 0000000047c5a000 CR3: 000000003f270000 CR4: 00000000000007f0 > [46811.163003] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > [46811.163003] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > [46811.163003] Process btrfs-cleaner (pid: 18210, threadinfo 
> ffff8800057fc000, task ffff88007c030d40) > [46811.163003] Stack: > [46811.163003] ffff880050c27c38 ffff88001f488438 ffff8800057fde48 > ffff88002d15b800 > [46811.163003] ffff88007c030d40 ffff88007c030d40 ffff88007c030d40 > 0000000000000000 > [46811.163003] ffff8800057fdeb8 ffffffffa0173d7d ffff88002d15b800 > 0000000000000000 > [46811.163003] Call Trace: > [46811.163003] [<ffffffffa0173d7d>] cleaner_kthread+0x5a/0xe6 [btrfs] > [46811.163003] [<ffffffffa0173d23>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [46811.163003] [<ffffffff8104c750>] kthread+0xba/0xc2 > [46811.163003] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46811.163003] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [46811.163003] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [46811.163003] Code: c7 c7 d5 b2 17 a0 31 c0 e8 b4 01 f3 e0 4c 89 ff > e8 bb e4 08 e1 48 89 df e8 f2 f5 03 00 49 8b 87 c8 fb ff ff 48 8b 80 > 50 01 00 00 <48> 8b 00 4c 01 e8 48 c1 f8 06 48 c1 e0 0c 42 0f b6 44 20 > 3f 31 > [46811.163003] RIP [<ffffffffa017b30b>] > btrfs_clean_old_snapshots+0x103/0x150 [btrfs] > [46811.163003] RSP <ffff8800057fde38> > [46811.163003] CR2: 0000000047c5a000 > [46811.238512] ---[ end trace 7528086f91b151b8 ]----- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 25, 2013 at 9:42 AM, Liu Bo <bo.li.liu@oracle.com> wrote:> On Fri, Jan 25, 2013 at 08:55:58AM -0600, Mitch Harder wrote: >> On Wed, Jan 23, 2013 at 6:52 PM, Liu Bo <bo.li.liu@oracle.com> wrote: >> > On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: >> >> On Wed, Jan 23, 2013 at 1:51 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> >> > On Tue, Jan 22, 2013 at 11:41:19AM -0600, Mitch Harder wrote: >> >> >> On Thu, Jan 17, 2013 at 8:42 AM, Mitch Harder >> >> >> <mitch.harder@sabayonlinux.org> wrote: >> >> >> > On Wed, Jan 16, 2013 at 6:36 AM, Liu Bo <bo.li.liu@oracle.com> wrote: >> >> >> >> This comes from one of btrfs''s project ideas, >> >> >> >> As we defragment files, we break any sharing from other snapshots. >> >> >> >> The balancing code will preserve the sharing, and defrag needs to grow this >> >> >> >> as well. >> >> > [...] >> >> >> > >> >> >> > I''ve been testing this patch on a 3.7.2 kernel merged with the >> >> >> > for-linus branch for the 3.8_rc kernels, and I''m seeing the following >> >> >> > error: >> >> >> > >> >> >> >> >> >> I''ve reproduced the error with CONFIG_DEBUG_LIST enabled, which shows >> >> >> some problem with an entry in the list. >> >> >> >> >> >> [59312.260441] ------------[ cut here ]------------ >> >> >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() >> >> >> [59312.260458] Hardware name: OptiPlex 745 >> >> >> [59312.260461] list_del corruption. next->prev should be >> >> >> ffff88006511c438, but was dead000000200200 >> >> > >> >> > LIST_POISON2 -> (000000200200) >> >> > So we can know that the next one is deleted from the list even _earlier_ >> >> > than the current one is. >> >> > >> >> > Any other messages before this warning complains? >> >> > >> >> >> >> Just some normal feedback from a metadata balance I had run. >> > >> > Well, these do fit my expectation, since balance also involves with playing with >> > root_list, which may lead to the bad situation. 
>> > >> >> >> >> [14057.193343] device fsid 28c688c5-7dbd-4071-b271-1bf6726d8835 devid >> >> 1 transid 4 /dev/sda7 >> >> [14057.194438] btrfs: force lzo compression >> >> [14057.194446] btrfs: enabling auto defrag >> >> [14057.194449] btrfs: disk space caching is enabled >> >> [14057.194452] btrfs flagging fs with big metadata feature >> >> [14057.194455] btrfs: lzo incompat flag set. >> >> [57508.799193] btrfs: relocating block group 14516486144 flags 4 >> >> [57632.178797] btrfs: found 6775 extents >> >> [57633.214701] btrfs: relocating block group 11832131584 flags 4 >> >> [57776.400102] btrfs: found 6480 extents >> >> [57777.021175] btrfs: relocating block group 10489954304 flags 4 >> >> [57949.182725] btrfs: found 6681 extents >> >> [59312.260441] ------------[ cut here ]------------ >> >> [59312.260454] WARNING: at lib/list_debug.c:62 __list_del_entry+0x8d/0x98() >> >> [59312.260458] Hardware name: OptiPlex 745 >> >> ... >> >> >> >> I''m going to try to wrap some debugging around the section of code in >> >> btrfs_clean_old_snapshots() where the dead_roots list is spliced onto >> >> the root list being processed. The double entry may be slipping in >> >> here. >> >> >> >> 1764 spin_lock(&fs_info->trans_lock); >> >> 1765 list_splice_init(&fs_info->dead_roots, &list); >> >> 1766 spin_unlock(&fs_info->trans_lock); >> > >> > hmm, I don''t think there is anything wrong in this code. But you can >> > give it a shot anyway :) >> > >> >> I''ve changed up my reproducer to try some things that may hit the >> issue quicker and more reliably. >> >> It gave me a slightly different set of warnings in dmesg, which seem >> to suggest issues in the dead_root list. > > Great! Many thanks for nail it down, we really shouldn''t iput() > after btrfs_iget(). > > Could you please try this(remove iput()) and see if it gets us rid of > the trouble? 
> > thanks, > liubo > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 1683f48..c7a0fb7 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -2337,7 +2337,6 @@ out_free_path: > out_unlock: > unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, > lock_end, > &cached, GFP_NOFS); > - iput(inode); > return ret; > } >With this patch, the cleaner never runs to delete the old roots. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 25, 2013 at 12:16:29PM -0600, Mitch Harder wrote: [...]> >> > >> I've changed up my reproducer to try some things that may hit the > >> issue quicker and more reliably. > >> > >> It gave me a slightly different set of warnings in dmesg, which seem > >> to suggest issues in the dead_root list. > > > > Great! Many thanks for nail it down, we really shouldn't iput() > > after btrfs_iget(). > > > > Could you please try this(remove iput()) and see if it gets us rid of > > the trouble? > > > > thanks, > > liubo > > > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > > index 1683f48..c7a0fb7 100644 > > --- a/fs/btrfs/inode.c > > +++ b/fs/btrfs/inode.c > > @@ -2337,7 +2337,6 @@ out_free_path: > > out_unlock: > > unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, > > lock_end, > > &cached, GFP_NOFS); > > - iput(inode); > > return ret; > > } > > > > With this patch, the cleaner never runs to delete the old roots. Hi Mitch, Many thanks for testing it! Well, after some debugging, I finally figured out why: (1) btrfs_ioctl_snap_destroy() will free the inode of the snapshot and set the root's refs to zero (btrfs_set_root_refs()); if this inode happens to be the only one in the rbtree of the snapshot's root at this moment, we add this root to the dead_root list. (2) Unfortunately, after (1), our snapshot-aware defrag work may read another inode in this snapshot into memory during the 'relink' stage, and later, after we finish the relink work, the iput() will force us to add the snapshot's root to the dead_root list again. So that's why we get the double list_add and the list_del corruption. And IMO, it can also take place without snapshot-aware defrag, but it's a rare case. So could you please try this?
thanks, liubo diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f154946..d4ee66b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -885,7 +885,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); + if (!list_empty(&root->root_list)) { + struct btrfs_root *tmp; + list_for_each_entry(tmp, &root->fs_info->dead_roots, root_list) + if (tmp == root) + goto unlock; + } + list_add(&root->root_list, &root->fs_info->dead_roots); +unlock: spin_unlock(&root->fs_info->trans_lock); return 0; } -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 25, 2013 at 04:40:28PM +0100, Stefan Behrens wrote:> On Fri, 25 Jan 2013 08:55:58 -0600, Mitch Harder wrote: > > On Wed, Jan 23, 2013 at 6:52 PM, Liu Bo <bo.li.liu@oracle.com> wrote: > >> On Wed, Jan 23, 2013 at 10:05:04AM -0600, Mitch Harder wrote: >[...]> Well, the issue that I had reported on IRC some days ago which looks similar (the top part of the call trace is similar: iput -> evict -> destroy_inode -> btrfs_destroy_inode -> btrfs_add_dead_root -> list_add which warns in list_add in your case and crashes in my case) was without Liu Bo's "snapshot-aware defrag" patch. A 3.8.0-rc4 kernel and nothing else. > > The reproducer was to create and destroy subvolumes and snapshots. I used btrfs-receive to fill them with data. The crash happened on umount. Every time. > > del_fs_roots() is attempting to empty the dead_roots list, and via btrfs_destroy_inode() deeper in the call stack they are added back to the dead_roots list. >Hi Stefan, I assume that you're mounting with the 'inode_cache' option, since the iput() here refers to the one in static void free_fs_root(struct btrfs_root *root) { iput(root->cache_inode); ... } If my assumption is right, what about the following patch?
thanks, liubo diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65f0367..01a601b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3220,6 +3220,13 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *gang[8]; int i; + list_for_each_entry(gang[0], &fs_info->dead_roots, root_list) { + if (gang[0]->in_radix) { + iput(root->cache_inode); + root->cache_inode = NULL; + } + } + while (!list_empty(&fs_info->dead_roots)) { gang[0] = list_entry(fs_info->dead_roots.next, struct btrfs_root, root_list);> BUG: unable to handle kernel paging request at ffff88042503b830 > IP: [<ffffffff814532b7>] __list_add+0x17/0xd0 > PGD 1e0c063 PUD bf58e067 PMD bf6b7067 PTE 800000042503b160 > Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > Modules linked in: btrfs bonding raid1 mpt2sas scsi_transport_sas raid_class > CPU 2 > Pid: 10259, comm: umount Not tainted 3.8.0-rc4+ #16 Supermicro X8SIL/X8SIL > RIP: 0010:[<ffffffff814532b7>] [<ffffffff814532b7>] __list_add+0x17/0xd0 > RSP: 0018:ffff8802f67a1bd8 EFLAGS: 00010286 > RAX: ffff880425b7c560 RBX: ffff880423ca2828 RCX: 0000000000000001 > RDX: ffff88042503b828 RSI: ffff8804257794c0 RDI: ffff880423ca2828 > RBP: ffff8802f67a1bf8 R08: 0000000000077850 R09: 0000000000000000 > R10: 0000000000000000 R11: 0000000000000001 R12: ffff880423ca2000 > R13: ffff880423ca2898 R14: 0000000000000000 R15: ffff8802f67a1d30 > FS: 00007f6e89bba740(0000) GS:ffff88042ea00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > CR2: ffff88042503b830 CR3: 000000029a56c000 CR4: 00000000000007e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process umount (pid: 10259, threadinfo ffff8802f67a0000, task ffff880425b7c560) > Stack: > ffffffffa00a414f ffff880423ca2000 ffff880423ca2000 ffff880423ca2898 > ffff8802f67a1c18 ffffffffa00a4170 ffff88042a60c1f8 ffff88042a60c1f8 > ffff8802f67a1c48 ffffffffa00b3180 
ffff88042a60c1f8 ffff88042a60c280 > Call Trace: > [<ffffffffa00a414f>] ? btrfs_add_dead_root+0x1f/0x60 [btrfs] > [<ffffffffa00a4170>] btrfs_add_dead_root+0x40/0x60 [btrfs] > [<ffffffffa00b3180>] btrfs_destroy_inode+0x1d0/0x2d0 [btrfs] > [<ffffffff811b5d17>] destroy_inode+0x37/0x60 > [<ffffffff811b5e4d>] evict+0x10d/0x1a0 > [<ffffffff811b65f5>] iput+0x105/0x190 > [<ffffffffa009bd68>] free_fs_root+0x18/0x90 [btrfs] > [<ffffffffa009f1ab>] btrfs_free_fs_root+0x7b/0x90 [btrfs] > [<ffffffffa009f26f>] del_fs_roots+0xaf/0xf0 [btrfs] > [<ffffffffa00a0bc6>] close_ctree+0x1c6/0x300 [btrfs] > [<ffffffff811b6a7c>] ? evict_inodes+0xec/0x100 > [<ffffffffa00763a4>] btrfs_put_super+0x14/0x20 [btrfs] > [<ffffffff8119dfcc>] generic_shutdown_super+0x5c/0xe0 > [<ffffffff8119e0e1>] kill_anon_super+0x11/0x20 > [<ffffffffa007a3a5>] btrfs_kill_super+0x15/0x90 [btrfs] > [<ffffffff8119f111>] ? deactivate_super+0x41/0x70 > [<ffffffff8119e4dd>] deactivate_locked_super+0x3d/0x70 > [<ffffffff8119f119>] deactivate_super+0x49/0x70 > [<ffffffff811ba772>] mntput_no_expire+0xd2/0x130 > [<ffffffff811bb621>] sys_umount+0x71/0x390 > [<ffffffff81983012>] system_call_fastpath+0x16/0x1b > Code: 48 83 c4 08 5b 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 20 48 89 5d e8 4c 89 65 f0 48 89 fb 4c 89 6d f8 <4c> 8b 42 08 49 89 f5 49 89 d4 49 39 f0 75 31 4d 8b 45 00 4d 39 > RIP [<ffffffff814532b7>] __list_add+0x17/0xd0 > RSP <ffff8802f67a1bd8> > CR2: ffff88042503b830 > ---[ end trace 5e44f1afc74751aa ]--- >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jan 27, 2013 at 6:41 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > > Hi Mitch, > > Many thanks for testing it! > > Well, after some debugging, I finally figure out the whys: > > (1) btrfs_ioctl_snap_destroy() will free the inode of snapshot and set > root's refs to zero(btrfs_set_root_refs()), if this inode happens to > be the only one in the rbtree of the snapshot's root at this moment, > we add this root to the dead_root list. > > (2) Unfortunately, after (1), our snapshot-aware defrag work may read > another inode in this snapshot into memory during 'relink' stage, and > later after we finish relink work and iput() will force us to add the > snapshot's root to the dead_root list again. > > So that's why we get double list_add and list_del corruption. > > And IMO, it can also take place without snapshot-aware defrag, but it's a > rare case. I'm seeing a smattering of reports that resemble list corruption on the M/L, so that is possible. > > So could you please try this? > > thanks, > liubo > > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index f154946..d4ee66b 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -885,7 +885,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, > int btrfs_add_dead_root(struct btrfs_root *root) > { > spin_lock(&root->fs_info->trans_lock); > + if (!list_empty(&root->root_list)) { > + struct btrfs_root *tmp; > + list_for_each_entry(tmp, &root->fs_info->dead_roots, root_list) > + if (tmp == root) > + goto unlock; > + } > + > list_add(&root->root_list, &root->fs_info->dead_roots); > +unlock: > spin_unlock(&root->fs_info->trans_lock); > return 0; > } > It feels like we're correcting the problem after-the-fact with this method, instead of addressing the root problem. But I was able to successfully run with this patch.
I slightly modified your patch as follows by introducing a WARN_ON in order to get a backtrace, and also to give me a positive confirmation that I was triggering the problem. diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d6b17fa..0c1066e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -885,7 +885,18 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); + if (!list_empty(&root->root_list)) { + struct btrfs_root *tmp; + list_for_each_entry(tmp, &root->fs_info->dead_roots, root_list) + if (tmp == root) { + printk(KERN_ERR "btrfs: Duplicate dead root entry.\n"); + WARN_ON(1); + goto unlock; + } + } + list_add(&root->root_list, &root->fs_info->dead_roots); +unlock: spin_unlock(&root->fs_info->trans_lock); return 0; } -- I was able to trigger the problem several times (16 separate times according to dmesg) without killing the cleaner process, and everything appears to have continued successfully after encountering a duplicate list entry. My test partition passes btrfsck afterwards. 13 out of the 16 backtraces seem to support your hypothesis as passing through the iput in your patch: [ 4367.314806] btrfs: Duplicate dead root entry.
[ 4367.314809] ------------[ cut here ]------------ [ 4367.314834] WARNING: at fs/btrfs/transaction.c:893 btrfs_add_dead_root+0x73/0xbc [btrfs]() [ 4367.314836] Hardware name: OptiPlex 745 [ 4367.314841] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm tg3 snd_page_alloc snd_timer snd iTCO_wdt iTCO_vendor_support ppdev parport_pc microcode i2c_i801 floppy parport sr_mod lpc_ich serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [ 4367.314887] Pid: 4463, comm: btrfs-endio-wri Tainted: G W 3.7.4-sad-v2+ #1 [ 4367.314889] Call Trace: [ 4367.314895] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [ 4367.314899] [<ffffffff810305b8>] warn_slowpath_null+0x1a/0x1c [ 4367.314915] [<ffffffffa0179e0b>] btrfs_add_dead_root+0x73/0xbc [btrfs] [ 4367.314931] [<ffffffffa0187bef>] btrfs_destroy_inode+0x227/0x25b [btrfs] [ 4367.314936] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 [ 4367.314940] [<ffffffff81113a9c>] evict+0x149/0x151 [ 4367.314944] [<ffffffff81114322>] iput+0x12c/0x135 [ 4367.314959] [<ffffffffa01845e7>] relink_extent_backref+0x669/0x6af [btrfs] [ 4367.314964] [<ffffffff815e9849>] ? __slab_free+0x17c/0x21b [ 4367.314980] [<ffffffffa0184d9d>] ? btrfs_finish_ordered_io+0x770/0x827 [btrfs] [ 4367.314995] [<ffffffffa0184d6d>] btrfs_finish_ordered_io+0x740/0x827 [btrfs] [ 4367.315011] [<ffffffffa0184e69>] finish_ordered_fn+0x15/0x17 [btrfs] [ 4367.315034] [<ffffffffa019e7a1>] worker_loop+0x14c/0x493 [btrfs] [ 4367.315051] [<ffffffffa019e655>] ? btrfs_queue_worker+0x258/0x258 [btrfs] [ 4367.315055] [<ffffffff8104c750>] kthread+0xba/0xc2 [ 4367.315059] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [ 4367.315062] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [ 4367.315066] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 [ 4367.315069] ---[ end trace b71b586e95cb7ba0 ]--- gdb resolves the (relink_extent_backref+0x669) reference back to just after the iput. (gdb) l *(relink_extent_backref+0x669) 0x335e7 is in relink_extent_backref (fs/btrfs/inode.c:2342). 2337 out_unlock: 2338 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, 2339 &cached, GFP_NOFS); 2340 iput(inode); 2341 return ret; 2342 } 2343 2344 static void relink_file_extents(struct new_sa_defrag_extent *new) 2345 { 2346 struct btrfs_path *path; The other 3 backtraces came down a different path: [14857.072378] btrfs: Duplicate dead root entry. [14857.072385] ------------[ cut here ]------------ [14857.072423] WARNING: at fs/btrfs/transaction.c:893 btrfs_add_dead_root+0x73/0xbc [btrfs]() [14857.072427] Hardware name: OptiPlex 745 [14857.072430] Modules linked in: ipv6 snd_hda_codec_analog snd_hda_intel snd_hda_codec snd_hwdep snd_pcm tg3 snd_page_alloc snd_timer snd iTCO_wdt iTCO_vendor_support ppdev parport_pc microcode i2c_i801 floppy parport sr_mod lpc_ich serio_raw pcspkr ablk_helper cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd [14857.072496] Pid: 4301, comm: btrfs-cleaner Tainted: G W 3.7.4-sad-v2+ #1 [14857.072499] Call Trace: [14857.072512] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b [14857.072518] [<ffffffff810305b8>] warn_slowpath_null+0x1a/0x1c [14857.072540] [<ffffffffa0179e0b>] btrfs_add_dead_root+0x73/0xbc [btrfs] [14857.072564] [<ffffffffa0187bef>] btrfs_destroy_inode+0x227/0x25b [btrfs] [14857.072573] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 [14857.072578] [<ffffffff81113a9c>] evict+0x149/0x151 [14857.072585] [<ffffffff81114322>] iput+0x12c/0x135 [14857.072607] [<ffffffffa01a21c8>] ? 
btrfs_defrag_file+0xa5b/0xaa1 [btrfs] [14857.072630] [<ffffffffa0189433>] btrfs_run_defrag_inodes+0x256/0x2c0 [btrfs] [14857.072651] [<ffffffffa0173da0>] cleaner_kthread+0x79/0xe6 [btrfs] [14857.072671] [<ffffffffa0173d27>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] [14857.072678] [<ffffffff8104c750>] kthread+0xba/0xc2 [14857.072684] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [14857.072691] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 [14857.072696] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 [14857.072701] ---[ end trace b71b586e95cb7bac ]--- The (btrfs_run_defrag_inodes+0x256) resolves back to the iput in __btrfs_run_defrag_inode() (gdb) l *(btrfs_run_defrag_inodes+0x256) 0x38433 is in btrfs_run_defrag_inodes (fs/btrfs/file.c:347). 342 btrfs_requeue_inode_defrag(inode, defrag); 343 } else { 344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 345 } 346 347 iput(inode); 348 return 0; 349 } 350 351 /* -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jan 27, 2013 at 11:20:41PM -0600, Mitch Harder wrote:> On Sun, Jan 27, 2013 at 6:41 AM, Liu Bo <bo.li.liu@oracle.com> wrote: > > > > Hi Mitch, > > > > Many thanks for testing it! > > > > Well, after some debugging, I finally figure out the whys: > > > > (1) btrfs_ioctl_snap_destroy() will free the inode of snapshot and set > > root''s refs to zero(btrfs_set_root_refs()), if this inode happens to > > be the only one in the rbtree of the snapshot''s root at this moment, > > we add this root to the dead_root list. > > > > (2) Unfortunately, after (1), our snapshot-aware defrag work may read > > another inode in this snapshot into memory during ''relink'' stage, and > > later after we finish relink work and iput() will force us to add the > > snapshot''s root to the dead_root list again. > > > > So that''s why we get double list_add and list_del corruption. > > > > And IMO, it can also take place without snapshot-aware defrag, but it''s a > > rare case. > > I''m seeing a smattering of reports that resemble list corruption on > the M/L, so that is possible. > > > > > So could you please try this? > > > > thanks, > > liubo > > > > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > > index f154946..d4ee66b 100644 > > --- a/fs/btrfs/transaction.c > > +++ b/fs/btrfs/transaction.c > > @@ -885,7 +885,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, > > int btrfs_add_dead_root(struct btrfs_root *root) > > { > > spin_lock(&root->fs_info->trans_lock); > > + if (!list_empty(&root->root_list)) { > > + struct btrfs_root *tmp; > > + list_for_each_entry(tmp, &root->fs_info->dead_roots, root_list) > > + if (tmp == root) > > + goto unlock; > > + } > > + > > list_add(&root->root_list, &root->fs_info->dead_roots); > > +unlock: > > spin_unlock(&root->fs_info->trans_lock); > > return 0; > > } > > > > It feels like we''re correcting the problem after-the-fact with this > method, instead of addressing the root problem. 
But I was able to > successfully run with this patch.I agree on this :)> > I slightly modified your patch as follows by introducing a WARN_ON in > order to get a back trace, and also to give me a positive confirmation > that I was triggering the problem.Yeah, I find that this snapshot-aware defrag patch lacks of the subvol srcu lock protection: index = srcu_read_lock(&fs_info->subvol_srcu); srcu_read_unlock(&fs_info->subvol_srcu, index); And so does btrfs_run_defrag_inodes(). This lock pair is designed to avoid the race between snapshot deletion and dead root list operations. I''m testing the following patch for about 2 hours already and seems it works fine ;) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 841cfe3..93ed89d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,21 +293,34 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return PTR_ERR(inode_root); } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + printk("%s: root %llu refs is 0\n", __func__, inode_root->root_key.objectid); + return -ENOENT; + } key.objectid = defrag->ino; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); + + srcu_read_unlock(&fs_info->subvol_srcu, index); if (IS_ERR(inode)) { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); return PTR_ERR(inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c335190..b833189 100644 --- 
a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2176,6 +2176,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, u64 lock_start; u64 lock_end; bool merge = false; + int index; if (prev && prev->root_id == backref->root_id && prev->inum == backref->inum && @@ -2188,12 +2189,21 @@ static noinline int relink_extent_backref(struct btrfs_path *path, key.offset = (u64)-1; fs_info = BTRFS_I(src_inode)->root->fs_info; + index = srcu_read_lock(&fs_info->subvol_srcu); + root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); if (PTR_ERR(root) == -ENOENT) return 0; return PTR_ERR(root); } + if (btrfs_root_refs(&root->root_item) == 0) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* parse ENOENT to 0 */ + printk("root %llu refs is 0, bail out\n", root->root_key.objectid); + return 0; + } /* step 2: get inode */ key.objectid = backref->inum; @@ -2201,12 +2211,13 @@ static noinline int relink_extent_backref(struct btrfs_path *path, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, root, NULL); - if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) { - if (inode && !IS_ERR(inode)) - iput(inode); + if (IS_ERR(inode)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); return 0; } + srcu_read_unlock(&fs_info->subvol_srcu, index); + /* step 3: relink backref */ lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1;> > diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c > index d6b17fa..0c1066e 100644 > --- a/fs/btrfs/transaction.c > +++ b/fs/btrfs/transaction.c > @@ -885,7 +885,18 @@ static noinline int commit_cowonly_roots(struct > btrfs_trans_handle *trans, > int btrfs_add_dead_root(struct btrfs_root *root) > { > spin_lock(&root->fs_info->trans_lock); > + if (!list_empty(&root->root_list)) { > + struct btrfs_root *tmp; > + list_for_each_entry(tmp, &root->fs_info->dead_roots, root_list) > + if (tmp == root) { > + printk(KERN_ERR "btrfs: Duplicate dead 
root entry.\n"); > + WARN_ON(1); > + goto unlock; > + } > + } > + > list_add(&root->root_list, &root->fs_info->dead_roots); > +unlock: > spin_unlock(&root->fs_info->trans_lock); > return 0; > } > -- > > I was able to trigger the problem several times (16 separate times > according to dmesg) without killing the cleaner process, and > everything appears to have continued successfully after encountering a > duplicate list entry. My test partition passes btrfsck afterwards.Same here. thanks, liubo> > 13 out of the 16 backtraces seem support your hypothesis as passing > through the iput in your patch: > > [ 4367.314806] btrfs: Duplicate dead root entry. > [ 4367.314809] ------------[ cut here ]------------ > [ 4367.314834] WARNING: at fs/btrfs/transaction.c:893 > btrfs_add_dead_root+0x73/0xbc [btrfs]() > [ 4367.314836] Hardware name: OptiPlex 745 > [ 4367.314841] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm tg3 snd_page_alloc > snd_timer snd iTCO_wdt iTCO_vendor_support ppdev parport_pc microcode > i2c_i801 floppy parport sr_mod lpc_ich serio_raw pcspkr ablk_helper > cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd > sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache > sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [ 4367.314887] Pid: 4463, comm: btrfs-endio-wri Tainted: G W > 3.7.4-sad-v2+ #1 > [ 4367.314889] Call Trace: > [ 4367.314895] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [ 4367.314899] [<ffffffff810305b8>] warn_slowpath_null+0x1a/0x1c > [ 4367.314915] [<ffffffffa0179e0b>] btrfs_add_dead_root+0x73/0xbc [btrfs] > [ 4367.314931] [<ffffffffa0187bef>] btrfs_destroy_inode+0x227/0x25b [btrfs] > [ 4367.314936] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 > [ 4367.314940] [<ffffffff81113a9c>] evict+0x149/0x151 > [ 4367.314944] [<ffffffff81114322>] iput+0x12c/0x135 > [ 4367.314959] [<ffffffffa01845e7>] relink_extent_backref+0x669/0x6af [btrfs] > [ 4367.314964] 
[<ffffffff815e9849>] ? __slab_free+0x17c/0x21b > [ 4367.314980] [<ffffffffa0184d9d>] ? > btrfs_finish_ordered_io+0x770/0x827 [btrfs] > [ 4367.314995] [<ffffffffa0184d6d>] btrfs_finish_ordered_io+0x740/0x827 [btrfs] > [ 4367.315011] [<ffffffffa0184e69>] finish_ordered_fn+0x15/0x17 [btrfs] > [ 4367.315034] [<ffffffffa019e7a1>] worker_loop+0x14c/0x493 [btrfs] > [ 4367.315051] [<ffffffffa019e655>] ? btrfs_queue_worker+0x258/0x258 [btrfs] > [ 4367.315055] [<ffffffff8104c750>] kthread+0xba/0xc2 > [ 4367.315059] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [ 4367.315062] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [ 4367.315066] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [ 4367.315069] ---[ end trace b71b586e95cb7ba0 ]--- > > gdb resolves the (relink_extent_backref+0x669) reference back to just > after the iput. > > (gdb) l *(relink_extent_backref+0x669) > 0x335e7 is in relink_extent_backref (fs/btrfs/inode.c:2342). > 2337 out_unlock: > 2338 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, > 2339 &cached, GFP_NOFS); > 2340 iput(inode); > 2341 return ret; > 2342 } > 2343 > 2344 static void relink_file_extents(struct new_sa_defrag_extent *new) > 2345 { > 2346 struct btrfs_path *path; > > The other 3 backtraces came down a different path: > > [14857.072378] btrfs: Duplicate dead root entry. 
> [14857.072385] ------------[ cut here ]------------ > [14857.072423] WARNING: at fs/btrfs/transaction.c:893 > btrfs_add_dead_root+0x73/0xbc [btrfs]() > [14857.072427] Hardware name: OptiPlex 745 > [14857.072430] Modules linked in: ipv6 snd_hda_codec_analog > snd_hda_intel snd_hda_codec snd_hwdep snd_pcm tg3 snd_page_alloc > snd_timer snd iTCO_wdt iTCO_vendor_support ppdev parport_pc microcode > i2c_i801 floppy parport sr_mod lpc_ich serio_raw pcspkr ablk_helper > cryptd lrw xts gf128mul aes_x86_64 sha256_generic fuse xfs nfs lockd > sunrpc reiserfs btrfs zlib_deflate ext4 jbd2 ext3 jbd ext2 mbcache > sl811_hcd hid_generic xhci_hcd ohci_hcd uhci_hcd ehci_hcd > [14857.072496] Pid: 4301, comm: btrfs-cleaner Tainted: G W > 3.7.4-sad-v2+ #1 > [14857.072499] Call Trace: > [14857.072512] [<ffffffff81030586>] warn_slowpath_common+0x83/0x9b > [14857.072518] [<ffffffff810305b8>] warn_slowpath_null+0x1a/0x1c > [14857.072540] [<ffffffffa0179e0b>] btrfs_add_dead_root+0x73/0xbc [btrfs] > [14857.072564] [<ffffffffa0187bef>] btrfs_destroy_inode+0x227/0x25b [btrfs] > [14857.072573] [<ffffffff8111393a>] destroy_inode+0x3b/0x54 > [14857.072578] [<ffffffff81113a9c>] evict+0x149/0x151 > [14857.072585] [<ffffffff81114322>] iput+0x12c/0x135 > [14857.072607] [<ffffffffa01a21c8>] ? btrfs_defrag_file+0xa5b/0xaa1 [btrfs] > [14857.072630] [<ffffffffa0189433>] btrfs_run_defrag_inodes+0x256/0x2c0 [btrfs] > [14857.072651] [<ffffffffa0173da0>] cleaner_kthread+0x79/0xe6 [btrfs] > [14857.072671] [<ffffffffa0173d27>] ? transaction_kthread+0x1a0/0x1a0 [btrfs] > [14857.072678] [<ffffffff8104c750>] kthread+0xba/0xc2 > [14857.072684] [<ffffffff8104c696>] ? kthread_freezable_should_stop+0x52/0x52 > [14857.072691] [<ffffffff815f301c>] ret_from_fork+0x7c/0xb0 > [14857.072696] [<ffffffff8104c696>] ? 
kthread_freezable_should_stop+0x52/0x52 > [14857.072701] ---[ end trace b71b586e95cb7bac ]--- > > The (btrfs_run_defrag_inodes+0x256) resolves back to the iput in > __btrfs_run_defrag_inode() > > (gdb) l *(btrfs_run_defrag_inodes+0x256) > 0x38433 is in btrfs_run_defrag_inodes (fs/btrfs/file.c:347). > 342 btrfs_requeue_inode_defrag(inode, defrag); > 343 } else { > 344 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); > 345 } > 346 > 347 iput(inode); > 348 return 0; > 349 } > 350 > 351 /*-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[CC list reduced (my initial statement was that such dead_list corruptions happen without the snapshot-aware defrag patch, by now the content is not related to the snapshot-aware defrag patch anymore)] On Sun, 27 Jan 2013 21:19:53 +0800, Liu Bo wrote: > On Fri, Jan 25, 2013 at 04:40:28PM +0100, Stefan Behrens wrote: >> Well, the issue that I had reported on IRC some days ago which looks similar (the top part of the call trace is similar: iput -> evict -> destroy_inode -> btrfs_destroy_inode -> btrfs_add_dead_root -> list_add which warns in list_add in your case and crashes in my case) was without Liu Bo's "snapshot-aware defrag" patch. A 3.8.0-rc4 kernel and nothing else. >> >> The reproducer was to create and destroy subvolumes and snapshots. I used btrfs-receive to fill them with data. The crash happened on umount. Every time. >> >> del_fs_roots() is attempting to empty the dead_roots list, and via btrfs_destroy_inode() deeper in the call stack they are added back to the dead_roots list. >> > > Hi Stefan, > > I assume that you're with 'inode_cache' option, since the iput() here > refers to > static void free_fs_root(struct btrfs_root *root) > { > iput(root->cache_inode); > ... > } Hi Liu Bo, Yes, inode_cache is enabled. > If my assumption is right, what about the following patch?
> > thanks, > liubo > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index 65f0367..01a601b 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -3220,6 +3220,13 @@ static void del_fs_roots(struct btrfs_fs_info > *fs_info) > struct btrfs_root *gang[8]; > int i; > > + list_for_each_entry(gang[0], &fs_info->dead_roots, root_list) { > + if (gang[0]->in_radix) { > + iput(root->cache_inode); > + root->cache_inode = NULL; > + } > + } > + > while (!list_empty(&fs_info->dead_roots)) { > gang[0] = list_entry(fs_info->dead_roots.next, > struct btrfs_root, root_list);No, this did not fix the problem (and I changed the patch and replaced "root" with "gang[0]" for the compiler''s satisfaction). Same stack trace as before. This happens without scrub or defrag running in parallel. The mount options are compress=lzo,space_cache,inode_cache. I mount the filesystem, create about 1000 subvols and snapshots, fill some data in the subvolumes, delete all subvolumes, wait until "btrfs subvol list ... | wc -l" prints 0, then immediately unmount the filesystem and then it crashs. Disabling the inode_cache mount option eliminates the crash. BTW, when I reproduced this crash with 6600 outstanding subvolume deletions, the next mount command took 40 minutes to return back to user mode. The btrfs-cleaner thread was executing btrfs_clean_old_snapshots() and was writing the superblocks everytime I looked on its stack. 
The mount process was executing btrfs_find_orphan_roots() the first half of the time and afterwards btrfs_orphan_cleanup() for the rest of the 40 minutes.>> BUG: unable to handle kernel paging request at ffff88042503b830 >> IP: [<ffffffff814532b7>] __list_add+0x17/0xd0 >> PGD 1e0c063 PUD bf58e067 PMD bf6b7067 PTE 800000042503b160 >> Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC >> Modules linked in: btrfs bonding raid1 mpt2sas scsi_transport_sas raid_class >> CPU 2 >> Pid: 10259, comm: umount Not tainted 3.8.0-rc4+ #16 Supermicro X8SIL/X8SIL >> RIP: 0010:[<ffffffff814532b7>] [<ffffffff814532b7>] __list_add+0x17/0xd0 >> RSP: 0018:ffff8802f67a1bd8 EFLAGS: 00010286 >> RAX: ffff880425b7c560 RBX: ffff880423ca2828 RCX: 0000000000000001 >> RDX: ffff88042503b828 RSI: ffff8804257794c0 RDI: ffff880423ca2828 >> RBP: ffff8802f67a1bf8 R08: 0000000000077850 R09: 0000000000000000 >> R10: 0000000000000000 R11: 0000000000000001 R12: ffff880423ca2000 >> R13: ffff880423ca2898 R14: 0000000000000000 R15: ffff8802f67a1d30 >> FS: 00007f6e89bba740(0000) GS:ffff88042ea00000(0000) knlGS:0000000000000000 >> CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b >> CR2: ffff88042503b830 CR3: 000000029a56c000 CR4: 00000000000007e0 >> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 >> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 >> Process umount (pid: 10259, threadinfo ffff8802f67a0000, task ffff880425b7c560) >> Stack: >> ffffffffa00a414f ffff880423ca2000 ffff880423ca2000 ffff880423ca2898 >> ffff8802f67a1c18 ffffffffa00a4170 ffff88042a60c1f8 ffff88042a60c1f8 >> ffff8802f67a1c48 ffffffffa00b3180 ffff88042a60c1f8 ffff88042a60c280 >> Call Trace: >> [<ffffffffa00a414f>] ? 
btrfs_add_dead_root+0x1f/0x60 [btrfs] >> [<ffffffffa00a4170>] btrfs_add_dead_root+0x40/0x60 [btrfs] >> [<ffffffffa00b3180>] btrfs_destroy_inode+0x1d0/0x2d0 [btrfs] >> [<ffffffff811b5d17>] destroy_inode+0x37/0x60 >> [<ffffffff811b5e4d>] evict+0x10d/0x1a0 >> [<ffffffff811b65f5>] iput+0x105/0x190 >> [<ffffffffa009bd68>] free_fs_root+0x18/0x90 [btrfs] >> [<ffffffffa009f1ab>] btrfs_free_fs_root+0x7b/0x90 [btrfs] >> [<ffffffffa009f26f>] del_fs_roots+0xaf/0xf0 [btrfs] >> [<ffffffffa00a0bc6>] close_ctree+0x1c6/0x300 [btrfs] >> [<ffffffff811b6a7c>] ? evict_inodes+0xec/0x100 >> [<ffffffffa00763a4>] btrfs_put_super+0x14/0x20 [btrfs] >> [<ffffffff8119dfcc>] generic_shutdown_super+0x5c/0xe0 >> [<ffffffff8119e0e1>] kill_anon_super+0x11/0x20 >> [<ffffffffa007a3a5>] btrfs_kill_super+0x15/0x90 [btrfs] >> [<ffffffff8119f111>] ? deactivate_super+0x41/0x70 >> [<ffffffff8119e4dd>] deactivate_locked_super+0x3d/0x70 >> [<ffffffff8119f119>] deactivate_super+0x49/0x70 >> [<ffffffff811ba772>] mntput_no_expire+0xd2/0x130 >> [<ffffffff811bb621>] sys_umount+0x71/0x390 >> [<ffffffff81983012>] system_call_fastpath+0x16/0x1b >> Code: 48 83 c4 08 5b 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 20 48 89 5d e8 4c 89 65 f0 48 89 fb 4c 89 6d f8 <4c> 8b 42 08 49 89 f5 49 89 d4 49 39 f0 75 31 4d 8b 45 00 4d 39 >> RIP [<ffffffff814532b7>] __list_add+0x17/0xd0 >> RSP <ffff8802f67a1bd8> >> CR2: ffff88042503b830 >> ---[ end trace 5e44f1afc74751aa ]----- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jan 28, 2013 at 05:55:57PM +0100, Stefan Behrens wrote:> [CC list reduced (my initial statement was that such dead_list > corruptions happen without the snapshot-aware defrag patch, by now the > contents is not related to the snapshot-aware defrag patch anymore)] >[...]> > No, this did not fix the problem (and I changed the patch and replaced > "root" with "gang[0]" for the compiler''s satisfaction). Same stack trace > as before. > > This happens without scrub or defrag running in parallel. The mount > options are compress=lzo,space_cache,inode_cache. I mount the > filesystem, create about 1000 subvols and snapshots, fill some data in > the subvolumes, delete all subvolumes, wait until "btrfs subvol list ... > | wc -l" prints 0, then immediately unmount the filesystem and then it > crashs. > > Disabling the inode_cache mount option eliminates the crash.Hi Stefan, What about this patch(UNTESTED)? thanks, liubo diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca7ace7..dac9d4b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4142,9 +4142,14 @@ static void inode_tree_del(struct inode *inode) * root_refs of 0, so this could end up dropping the tree root as a * snapshot, so we need the extra !root->fs_info->tree_root check to * make sure we don''t drop it. + * + * Inode cache''s inodes may be iput and add root back to dead roots + * list during killing super, which leads to use-after-free, so + * we need to check fs_info->closing to keep us from use-after-free. */ if (empty && btrfs_root_refs(&root->root_item) == 0 && - root != root->fs_info->tree_root) { + root != root->fs_info->tree_root && + btrfs_fs_closing(root->fs_info) > 1) { synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree);> > BTW, when I reproduced this crash with 6600 outstanding subvolume > deletions, the next mount command took 40 minutes to return back to user > mode. 
The btrfs-cleaner thread was executing btrfs_clean_old_snapshots() > and was writing the superblocks everytime I looked on its stack. The > mount process was executing btrfs_find_orphan_roots() the first half of > the time and afterwards btrfs_orphan_cleanup() for the rest of the 40 > minutes. > > > >> BUG: unable to handle kernel paging request at ffff88042503b830 > >> IP: [<ffffffff814532b7>] __list_add+0x17/0xd0 > >> PGD 1e0c063 PUD bf58e067 PMD bf6b7067 PTE 800000042503b160 > >> Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > >> Modules linked in: btrfs bonding raid1 mpt2sas scsi_transport_sas raid_class > >> CPU 2 > >> Pid: 10259, comm: umount Not tainted 3.8.0-rc4+ #16 Supermicro X8SIL/X8SIL > >> RIP: 0010:[<ffffffff814532b7>] [<ffffffff814532b7>] __list_add+0x17/0xd0 > >> RSP: 0018:ffff8802f67a1bd8 EFLAGS: 00010286 > >> RAX: ffff880425b7c560 RBX: ffff880423ca2828 RCX: 0000000000000001 > >> RDX: ffff88042503b828 RSI: ffff8804257794c0 RDI: ffff880423ca2828 > >> RBP: ffff8802f67a1bf8 R08: 0000000000077850 R09: 0000000000000000 > >> R10: 0000000000000000 R11: 0000000000000001 R12: ffff880423ca2000 > >> R13: ffff880423ca2898 R14: 0000000000000000 R15: ffff8802f67a1d30 > >> FS: 00007f6e89bba740(0000) GS:ffff88042ea00000(0000) knlGS:0000000000000000 > >> CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > >> CR2: ffff88042503b830 CR3: 000000029a56c000 CR4: 00000000000007e0 > >> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > >> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > >> Process umount (pid: 10259, threadinfo ffff8802f67a0000, task ffff880425b7c560) > >> Stack: > >> ffffffffa00a414f ffff880423ca2000 ffff880423ca2000 ffff880423ca2898 > >> ffff8802f67a1c18 ffffffffa00a4170 ffff88042a60c1f8 ffff88042a60c1f8 > >> ffff8802f67a1c48 ffffffffa00b3180 ffff88042a60c1f8 ffff88042a60c280 > >> Call Trace: > >> [<ffffffffa00a414f>] ? 
btrfs_add_dead_root+0x1f/0x60 [btrfs] > >> [<ffffffffa00a4170>] btrfs_add_dead_root+0x40/0x60 [btrfs] > >> [<ffffffffa00b3180>] btrfs_destroy_inode+0x1d0/0x2d0 [btrfs] > >> [<ffffffff811b5d17>] destroy_inode+0x37/0x60 > >> [<ffffffff811b5e4d>] evict+0x10d/0x1a0 > >> [<ffffffff811b65f5>] iput+0x105/0x190 > >> [<ffffffffa009bd68>] free_fs_root+0x18/0x90 [btrfs] > >> [<ffffffffa009f1ab>] btrfs_free_fs_root+0x7b/0x90 [btrfs] > >> [<ffffffffa009f26f>] del_fs_roots+0xaf/0xf0 [btrfs] > >> [<ffffffffa00a0bc6>] close_ctree+0x1c6/0x300 [btrfs] > >> [<ffffffff811b6a7c>] ? evict_inodes+0xec/0x100 > >> [<ffffffffa00763a4>] btrfs_put_super+0x14/0x20 [btrfs] > >> [<ffffffff8119dfcc>] generic_shutdown_super+0x5c/0xe0 > >> [<ffffffff8119e0e1>] kill_anon_super+0x11/0x20 > >> [<ffffffffa007a3a5>] btrfs_kill_super+0x15/0x90 [btrfs] > >> [<ffffffff8119f111>] ? deactivate_super+0x41/0x70 > >> [<ffffffff8119e4dd>] deactivate_locked_super+0x3d/0x70 > >> [<ffffffff8119f119>] deactivate_super+0x49/0x70 > >> [<ffffffff811ba772>] mntput_no_expire+0xd2/0x130 > >> [<ffffffff811bb621>] sys_umount+0x71/0x390 > >> [<ffffffff81983012>] system_call_fastpath+0x16/0x1b > >> Code: 48 83 c4 08 5b 5d c3 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec 20 48 89 5d e8 4c 89 65 f0 48 89 fb 4c 89 6d f8 <4c> 8b 42 08 49 89 f5 49 89 d4 49 39 f0 75 31 4d 8b 45 00 4d 39 > >> RIP [<ffffffff814532b7>] __list_add+0x17/0xd0 > >> RSP <ffff8802f67a1bd8> > >> CR2: ffff88042503b830 > >> ---[ end trace 5e44f1afc74751aa ]--- >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, 16 Feb 2013 14:47:45 +0800, Liu Bo wrote: > What about this patch(UNTESTED)? > > thanks, > liubo > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index ca7ace7..dac9d4b 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -4142,9 +4142,14 @@ static void inode_tree_del(struct inode *inode) > * root_refs of 0, so this could end up dropping the tree root as a > * snapshot, so we need the extra !root->fs_info->tree_root check to > * make sure we don't drop it. > + * > + * Inode cache's inodes may be iput and add root back to dead roots > + * list during killing super, which leads to use-after-free, so > + * we need to check fs_info->closing to keep us from use-after-free. > */ > if (empty && btrfs_root_refs(&root->root_item) == 0 && > - root != root->fs_info->tree_root) { > + root != root->fs_info->tree_root && > + btrfs_fs_closing(root->fs_info) > 1) { > synchronize_srcu(&root->fs_info->subvol_srcu); > spin_lock(&root->inode_lock); > empty = RB_EMPTY_ROOT(&root->inode_tree); No improvement with this patch. The inode_cache causes a crash in __list_add. I tested it on the latest cmason/for-linus with and without your patch. This script is a 100% reproducer on my test box: mkfs.btrfs -d single -m raid1 /dev/sdc /dev/sdj /dev/sds /dev/sdt /dev/sdu /dev/sdv mount /dev/sdc /mnt -o compress=lzo,space_cache,inode_cache btrfs subv create /mnt/src (cd ~/git/btrfs/fs/btrfs && tar cf - .) | (cd /mnt/src && tar xf -) for i in `seq 2000`; do btrfs subv create /mnt/${i}; (cd /mnt/src && tar cf - .) 
| (cd /mnt/${i} && tar xf -); done for i in /mnt/[0-9]*; do btrfs subv dele ${i}; done sleep 45 umount /mnt BUG: unable to handle kernel paging request at ffff88023517d830 IP: [<ffffffff814415f7>] __list_add+0x17/0xd0 PGD 1e0c063 PUD bf58e067 PMD bf737067 PTE 800000023517d160 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: btrfs raid1 mpt2sas scsi_transport_sas raid_class CPU 2 Pid: 18503, comm: umount Not tainted 3.7.0+ #44 Supermicro X8SIL/X8SIL RIP: 0010:[<ffffffff814415f7>] [<ffffffff814415f7>] __list_add+0x17/0xd0 RSP: 0018:ffff88019e1abbd8 EFLAGS: 00010286 RAX: ffff8802353aa290 RBX: ffff880229e38828 RCX: 0000000000000001 RDX: ffff88023517d828 RSI: ffff8802327214c0 RDI: ffff880229e38828 RBP: ffff88019e1abbf8 R08: 000000000006e130 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff880229e38000 R13: ffff880229e38898 R14: 0000000000000000 R15: ffff88019e1abd30 FS: 00007f75eabc4740(0000) GS:ffff880236a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff88023517d830 CR3: 000000019e17e000 CR4: 00000000000007e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process umount (pid: 18503, threadinfo ffff88019e1aa000, task ffff8802353aa290) Stack: ffffffffa008619f ffff880229e38000 ffff880229e38000 ffff880229e38898 ffff88019e1abc18 ffffffffa00861c0 ffff88012760dc38 ffff88012760dc38 ffff88019e1abc48 ffffffffa0095358 ffff88012760dc38 ffff88012760dcc0 Call Trace: [<ffffffffa008619f>] ? 
btrfs_add_dead_root+0x1f/0x60 [btrfs] [<ffffffffa00861c0>] btrfs_add_dead_root+0x40/0x60 [btrfs] [<ffffffffa0095358>] btrfs_destroy_inode+0x1d8/0x2d0 [btrfs] [<ffffffff811af9c7>] destroy_inode+0x37/0x60 [<ffffffff811afafd>] evict+0x10d/0x1a0 [<ffffffff811b02a5>] iput+0x105/0x190 [<ffffffffa007dda8>] free_fs_root+0x18/0x90 [btrfs] [<ffffffffa00811eb>] btrfs_free_fs_root+0x7b/0x90 [btrfs] [<ffffffffa00812af>] del_fs_roots+0xaf/0xf0 [btrfs] [<ffffffffa0082c16>] close_ctree+0x1c6/0x300 [btrfs] [<ffffffff811b072c>] ? evict_inodes+0xec/0x100 [<ffffffffa00583a4>] btrfs_put_super+0x14/0x20 [btrfs] [<ffffffff8119805c>] generic_shutdown_super+0x5c/0xe0 [<ffffffff81198171>] kill_anon_super+0x11/0x20 [<ffffffffa005c3a5>] btrfs_kill_super+0x15/0x90 [btrfs] [<ffffffff811991a1>] ? deactivate_super+0x41/0x70 [<ffffffff8119856d>] deactivate_locked_super+0x3d/0x70 [<ffffffff811991a9>] deactivate_super+0x49/0x70 [<ffffffff811b4332>] mntput_no_expire+0xd2/0x130 [<ffffffff811b52e1>] sys_umount+0x71/0x390 [<ffffffff81956992>] system_call_fastpath+0x16/0x1b -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Feb 18, 2013 at 05:53:50PM +0100, Stefan Behrens wrote: > On Sat, 16 Feb 2013 14:47:45 +0800, Liu Bo wrote: > > What about this patch(UNTESTED)? > > > > thanks, > > liubo > > > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > > index ca7ace7..dac9d4b 100644 > > --- a/fs/btrfs/inode.c > > +++ b/fs/btrfs/inode.c > > @@ -4142,9 +4142,14 @@ static void inode_tree_del(struct inode *inode) > > * root_refs of 0, so this could end up dropping the tree root as a > > * snapshot, so we need the extra !root->fs_info->tree_root check to > > * make sure we don't drop it. > > + * > > + * Inode cache's inodes may be iput and add root back to dead roots > > + * list during killing super, which leads to use-after-free, so > > + * we need to check fs_info->closing to keep us from use-after-free. > > */ > > if (empty && btrfs_root_refs(&root->root_item) == 0 && > > - root != root->fs_info->tree_root) { > > + root != root->fs_info->tree_root && > > + btrfs_fs_closing(root->fs_info) > 1) { > > synchronize_srcu(&root->fs_info->subvol_srcu); > > spin_lock(&root->inode_lock); > > empty = RB_EMPTY_ROOT(&root->inode_tree); > > No improvement with this patch. The inode_cache causes a crash in __list_add. > I tested it on the latest cmason/for-linus with and without your patch. Ahh, I think I made a finger error, + btrfs_fs_closing(root->fs_info) > 1) { SHOULD be + btrfs_fs_closing(root->fs_info) < 2) { > > This script is an 100% reproducer on my test box: > mkfs.btrfs -d single -m raid1 /dev/sdc /dev/sdj /dev/sds /dev/sdt /dev/sdu /dev/sdv > mount /dev/sdc /mnt -o compress=lzo,space_cache,inode_cache > btrfs subv create /mnt/src > (cd ~/git/btrfs/fs/btrfs && tar cf - .) | (cd /mnt/src && tar xf -) > for i in `seq 2000`; do btrfs subv create /mnt/${i}; (cd /mnt/src && tar cf - .) 
| (cd /mnt/${i} && tar xf -); done > for i in /mnt/[0-9]*; do btrfs subv dele ${i}; done > sleep 45 > umount /mnt With the latest cmason/for-linus (commit 6f60cbd3ae442cb35861bb522f388db123d42ec1 btrfs: access superblock via pagecache in scan_one_device), I ran this script several times and all runs were fine; I used two 40G disks, everything else remains the same. I'm wondering which line does 'del_fs_roots+0xaf/0xf0 [btrfs]' refer to? thanks, liubo > > BUG: unable to handle kernel paging request at ffff88023517d830 > IP: [<ffffffff814415f7>] __list_add+0x17/0xd0 > PGD 1e0c063 PUD bf58e067 PMD bf737067 PTE 800000023517d160 > Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC > Modules linked in: btrfs raid1 mpt2sas scsi_transport_sas raid_class > CPU 2 > Pid: 18503, comm: umount Not tainted 3.7.0+ #44 Supermicro X8SIL/X8SIL > RIP: 0010:[<ffffffff814415f7>] [<ffffffff814415f7>] __list_add+0x17/0xd0 > RSP: 0018:ffff88019e1abbd8 EFLAGS: 00010286 > RAX: ffff8802353aa290 RBX: ffff880229e38828 RCX: 0000000000000001 > RDX: ffff88023517d828 RSI: ffff8802327214c0 RDI: ffff880229e38828 > RBP: ffff88019e1abbf8 R08: 000000000006e130 R09: 0000000000000000 > R10: 0000000000000000 R11: 0000000000000001 R12: ffff880229e38000 > R13: ffff880229e38898 R14: 0000000000000000 R15: ffff88019e1abd30 > FS: 00007f75eabc4740(0000) GS:ffff880236a00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b > CR2: ffff88023517d830 CR3: 000000019e17e000 CR4: 00000000000007e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process umount (pid: 18503, threadinfo ffff88019e1aa000, task ffff8802353aa290) > Stack: > ffffffffa008619f ffff880229e38000 ffff880229e38000 ffff880229e38898 > ffff88019e1abc18 ffffffffa00861c0 ffff88012760dc38 ffff88012760dc38 > ffff88019e1abc48 ffffffffa0095358 ffff88012760dc38 ffff88012760dcc0 > Call Trace: > [<ffffffffa008619f>] ? 
btrfs_add_dead_root+0x1f/0x60 [btrfs] > [<ffffffffa00861c0>] btrfs_add_dead_root+0x40/0x60 [btrfs] > [<ffffffffa0095358>] btrfs_destroy_inode+0x1d8/0x2d0 [btrfs] > [<ffffffff811af9c7>] destroy_inode+0x37/0x60 > [<ffffffff811afafd>] evict+0x10d/0x1a0 > [<ffffffff811b02a5>] iput+0x105/0x190 > [<ffffffffa007dda8>] free_fs_root+0x18/0x90 [btrfs] > [<ffffffffa00811eb>] btrfs_free_fs_root+0x7b/0x90 [btrfs] > [<ffffffffa00812af>] del_fs_roots+0xaf/0xf0 [btrfs] > [<ffffffffa0082c16>] close_ctree+0x1c6/0x300 [btrfs] > [<ffffffff811b072c>] ? evict_inodes+0xec/0x100 > [<ffffffffa00583a4>] btrfs_put_super+0x14/0x20 [btrfs] > [<ffffffff8119805c>] generic_shutdown_super+0x5c/0xe0 > [<ffffffff81198171>] kill_anon_super+0x11/0x20 > [<ffffffffa005c3a5>] btrfs_kill_super+0x15/0x90 [btrfs] > [<ffffffff811991a1>] ? deactivate_super+0x41/0x70 > [<ffffffff8119856d>] deactivate_locked_super+0x3d/0x70 > [<ffffffff811991a9>] deactivate_super+0x49/0x70 > [<ffffffff811b4332>] mntput_no_expire+0xd2/0x130 > [<ffffffff811b52e1>] sys_umount+0x71/0x390 > [<ffffffff81956992>] system_call_fastpath+0x16/0x1b > >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 19.02.2013 05:29, Liu Bo wrote: > On Mon, Feb 18, 2013 at 05:53:50PM +0100, Stefan Behrens wrote: >> On Sat, 16 Feb 2013 14:47:45 +0800, Liu Bo wrote: >>> What about this patch(UNTESTED)? >>> >>> thanks, >>> liubo >>> >>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c >>> index ca7ace7..dac9d4b 100644 >>> --- a/fs/btrfs/inode.c >>> +++ b/fs/btrfs/inode.c >>> @@ -4142,9 +4142,14 @@ static void inode_tree_del(struct inode *inode) >>> * root_refs of 0, so this could end up dropping the tree root as a >>> * snapshot, so we need the extra !root->fs_info->tree_root check to >>> * make sure we don't drop it. >>> + * >>> + * Inode cache's inodes may be iput and add root back to dead roots >>> + * list during killing super, which leads to use-after-free, so >>> + * we need to check fs_info->closing to keep us from use-after-free. >>> */ >>> if (empty && btrfs_root_refs(&root->root_item) == 0 && >>> - root != root->fs_info->tree_root) { >>> + root != root->fs_info->tree_root && >>> + btrfs_fs_closing(root->fs_info) > 1) { >>> synchronize_srcu(&root->fs_info->subvol_srcu); >>> spin_lock(&root->inode_lock); >>> empty = RB_EMPTY_ROOT(&root->inode_tree); >> >> No improvement with this patch. The inode_cache causes a crash in __list_add. >> I tested it on the latest cmason/for-linus with and without your patch. > > Ahh, I think I made a finger error, > > + btrfs_fs_closing(root->fs_info) > 1) { > SHOULD be > + btrfs_fs_closing(root->fs_info) < 2) { Yes, this eliminates the crash. Thanks! >> This script is an 100% reproducer on my test box: >> mkfs.btrfs -d single -m raid1 /dev/sdc /dev/sdj /dev/sds /dev/sdt /dev/sdu /dev/sdv >> mount /dev/sdc /mnt -o compress=lzo,space_cache,inode_cache >> btrfs subv create /mnt/src >> (cd ~/git/btrfs/fs/btrfs && tar cf - .) | (cd /mnt/src && tar xf -) >> for i in `seq 2000`; do btrfs subv create /mnt/${i}; (cd /mnt/src && tar cf - .) 
| (cd /mnt/${i} && tar xf -); done >> for i in /mnt/[0-9]*; do btrfs subv dele ${i}; done >> sleep 45 >> umount /mnt > > With the latest cmason/for-linus(commit 6f60cbd3ae442cb35861bb522f388db123d42ec1 > btrfs: access superblock via pagecache in scan_one_device), > I ran this script several times with all good, I used two 40G disks, > others remains same. 6f60cbd is my HEAD too. Maybe the DEBUG options in the .config make the difference although I seem to recall that the issue was also always there with a plain RHEL config. > I'm wondering which line does 'del_fs_roots+0xaf/0xf0 [btrfs]' refer to? del_fs_roots+0xaf/0xf0 [btrfs]: 0x2c2af is in del_fs_roots (fs/btrfs/disk-io.c:3243). 3238 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, 3239 (void **)gang, 0, 3240 ARRAY_SIZE(gang)); 3241 if (!ret) 3242 break; 3243 for (i = 0; i < ret; i++) 3244 btrfs_free_fs_root(fs_info, gang[i]); 3245 } 3246 } 3247 Thanks for the patch :) >> BUG: unable to handle kernel paging request at ffff88023517d830 >> IP: [<ffffffff814415f7>] __list_add+0x17/0xd0 >> PGD 1e0c063 PUD bf58e067 PMD bf737067 PTE 800000023517d160 >> Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC >> Modules linked in: btrfs raid1 mpt2sas scsi_transport_sas raid_class >> CPU 2 >> Pid: 18503, comm: umount Not tainted 3.7.0+ #44 Supermicro X8SIL/X8SIL >> RIP: 0010:[<ffffffff814415f7>] [<ffffffff814415f7>] __list_add+0x17/0xd0 >> RSP: 0018:ffff88019e1abbd8 EFLAGS: 00010286 >> RAX: ffff8802353aa290 RBX: ffff880229e38828 RCX: 0000000000000001 >> RDX: ffff88023517d828 RSI: ffff8802327214c0 RDI: ffff880229e38828 >> RBP: ffff88019e1abbf8 R08: 000000000006e130 R09: 0000000000000000 >> R10: 0000000000000000 R11: 0000000000000001 R12: ffff880229e38000 >> R13: ffff880229e38898 R14: 0000000000000000 R15: ffff88019e1abd30 >> FS: 00007f75eabc4740(0000) GS:ffff880236a00000(0000) knlGS:0000000000000000 >> CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b >> CR2: ffff88023517d830 CR3: 000000019e17e000 CR4: 00000000000007e0 >> DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 >> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 >> Process umount (pid: 18503, threadinfo ffff88019e1aa000, task ffff8802353aa290) >> Stack: >> ffffffffa008619f ffff880229e38000 ffff880229e38000 ffff880229e38898 >> ffff88019e1abc18 ffffffffa00861c0 ffff88012760dc38 ffff88012760dc38 >> ffff88019e1abc48 ffffffffa0095358 ffff88012760dc38 ffff88012760dcc0 >> Call Trace: >> [<ffffffffa008619f>] ? btrfs_add_dead_root+0x1f/0x60 [btrfs] >> [<ffffffffa00861c0>] btrfs_add_dead_root+0x40/0x60 [btrfs] >> [<ffffffffa0095358>] btrfs_destroy_inode+0x1d8/0x2d0 [btrfs] >> [<ffffffff811af9c7>] destroy_inode+0x37/0x60 >> [<ffffffff811afafd>] evict+0x10d/0x1a0 >> [<ffffffff811b02a5>] iput+0x105/0x190 >> [<ffffffffa007dda8>] free_fs_root+0x18/0x90 [btrfs] >> [<ffffffffa00811eb>] btrfs_free_fs_root+0x7b/0x90 [btrfs] >> [<ffffffffa00812af>] del_fs_roots+0xaf/0xf0 [btrfs] >> [<ffffffffa0082c16>] close_ctree+0x1c6/0x300 [btrfs] >> [<ffffffff811b072c>] ? evict_inodes+0xec/0x100 >> [<ffffffffa00583a4>] btrfs_put_super+0x14/0x20 [btrfs] >> [<ffffffff8119805c>] generic_shutdown_super+0x5c/0xe0 >> [<ffffffff81198171>] kill_anon_super+0x11/0x20 >> [<ffffffffa005c3a5>] btrfs_kill_super+0x15/0x90 [btrfs] >> [<ffffffff811991a1>] ? deactivate_super+0x41/0x70 >> [<ffffffff8119856d>] deactivate_locked_super+0x3d/0x70 >> [<ffffffff811991a9>] deactivate_super+0x49/0x70 >> [<ffffffff811b4332>] mntput_no_expire+0xd2/0x130 >> [<ffffffff811b52e1>] sys_umount+0x71/0x390 >> [<ffffffff81956992>] system_call_fastpath+0x16/0x1b >>-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html