Gui Hecheng
2014-Nov-10 07:36 UTC
[PATCH] btrfs: fix dead lock while running replace and defrag concurrently
This can be reproduced by fstests: btrfs/070 The scenario is like the following: replace worker thread defrag thread --------------------- ------------- copy_nocow_pages_worker btrfs_defrag_file copy_nocow_pages_for_inode ... btrfs_writepages |A| lock_extent_bits extent_write_cache_pages |B| lock_page __extent_writepage ... writepage_delalloc find_lock_delalloc_range |B| lock_extent_bits find_or_create_page pagecache_get_page |A| lock_page This leads to an ABBA pattern deadlock. To fix it, o we just change it to an AABB pattern which means to @unlock_extent_bits() before we @lock_page(), and in this way the @extent_read_full_page_nolock() is no longer in an locked context, so change it back to @extent_read_full_page() to regain protection. o Since we @unlock_extent_bits() earlier, then before @write_page_nocow(), the extent may not really point at the physical block we want, so we have to check it before write. Signed-off-by: Gui Hecheng <guihc.fnst@cn.fujitsu.com> --- fs/btrfs/scrub.c | 90 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efa0831..4325bb0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3310,6 +3310,50 @@ out: scrub_pending_trans_workers_dec(sctx); } +static int check_extent_to_block(struct inode *inode, u64 start, u64 len, + u64 logical) +{ + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree; + struct extent_map *em; + u64 lockstart = start, lockend = start + len - 1; + int ret = 0; + + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = 1; + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > logical || + em->block_start + em->block_len < logical + len) { + free_extent_map(em); + ret = 1; + goto out_unlock; + } + free_extent_map(em); + +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); + return ret; +} + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *nocow_ctx) { @@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct inode *inode; struct page *page; struct btrfs_root *local_root; - struct btrfs_ordered_extent *ordered; - struct extent_map *em; - struct extent_state *cached_state = NULL; struct extent_io_tree *io_tree; u64 physical_for_dev_replace; + u64 nocow_ctx_logical; u64 len = nocow_ctx->len; - u64 lockstart = offset, lockend = offset + len - 1; unsigned long index; int srcu_index; int ret = 0; @@ -3356,30 +3397,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; io_tree = &BTRFS_I(inode)->io_tree; + nocow_ctx_logical = nocow_ctx->logical; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, lockstart, len); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock; - } - - /* - * This extent does not actually cover the logical extent anymore, - * move on to the next inode. - */ - if (em->block_start > nocow_ctx->logical || - em->block_start + em->block_len < nocow_ctx->logical + len) { - free_extent_map(em); - goto out_unlock; + ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto out; } - free_extent_map(em); while (len >= PAGE_CACHE_SIZE) { index = offset >> PAGE_CACHE_SHIFT; @@ -3396,7 +3420,7 @@ again: goto next_page; } else { ClearPageError(page); - err = extent_read_full_page_nolock(io_tree, page, + err = extent_read_full_page(io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); if (err) { @@ -3421,6 +3445,14 @@ again: goto next_page; } } + + ret = check_extent_to_block(inode, offset, len, + nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto next_page; + } + err = write_page_nocow(nocow_ctx->sctx, physical_for_dev_replace, page); if (err) @@ -3434,12 +3466,10 @@ next_page: offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; + nocow_ctx_logical += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } ret = COPY_COMPLETE; -out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); out: mutex_unlock(&inode->i_mutex); iput(inode); -- 1.8.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html