thr3ads.net - Btrfs devel - [PATCH] Btrfs: fix very slow inode eviction and fs unmount [Nov 2013]

If this information is useful, please help other people find it:
Share via:

Filipe David Borba Manana

2013-Nov-19 22:29 UTC

[PATCH] Btrfs: fix very slow inode eviction and fs unmount

The inode eviction can be very slow, because during eviction we
tell the VFS to truncate all of the inode's pages. This results
in calls to btrfs_invalidatepage() which in turn does calls to
lock_extent_bits() and clear_extent_bit(). These calls result in
too many merges and splits of extent_state structures, which
consume a lot of time and cpu when the inode has many pages. In
some scenarios I have experienced umount times higher than 15
minutes, even when there's no pending IO (after a btrfs fs sync).

A quick way to reproduce this issue:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real	0m25.457s
user	0m0.000s
sys	0m0.092s
$ cd ..
$ time umount /mnt/btrfs

real	1m38.234s
user	0m0.000s
sys	1m25.760s

The same test on ext4 runs much faster:

$ mkfs.ext4 /dev/sdb3
$ mount /dev/sdb3 /mnt/ext4
$ cd /mnt/ext4
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ sync
$ cd ..
$ time umount /mnt/ext4

real	0m3.626s
user	0m0.004s
sys	0m3.012s

After this patch, the unmount (inode evictions) is much faster:

$ mkfs.btrfs -f /dev/sdb3
$ mount /dev/sdb3 /mnt/btrfs
$ cd /mnt/btrfs
$ sysbench --test=fileio --file-num=128 --file-total-size=16G \
    --file-test-mode=seqwr --num-threads=128 \
    --file-block-size=16384 --max-time=60 --max-requests=0 run
$ time btrfs fi sync .
FSSync '.'

real	0m26.774s
user	0m0.000s
sys	0m0.084s
$ cd ..
$ time umount /mnt/btrfs

real	0m1.811s
user	0m0.000s
sys	0m1.564s

Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
---
 fs/btrfs/inode.c |   98 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 84 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a5de36..e889779 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return err;
 }
 
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+	struct rb_node *node;
+
+	ASSERT(inode->i_state & I_FREEING);
+	truncate_inode_pages(&inode->i_data, 0);
+
+	write_lock(&map_tree->lock);
+	while (!RB_EMPTY_ROOT(&map_tree->map)) {
+		struct extent_map *em;
+
+		node = rb_first(&map_tree->map);
+		em = rb_entry(node, struct extent_map, rb_node);
+		remove_extent_mapping(map_tree, em);
+		free_extent_map(em);
+	}
+	write_unlock(&map_tree->lock);
+
+	spin_lock(&io_tree->lock);
+	while (!RB_EMPTY_ROOT(&io_tree->state)) {
+		struct extent_state *state;
+		struct extent_state *cached_state = NULL;
+
+		node = rb_first(&io_tree->state);
+		state = rb_entry(node, struct extent_state, rb_node);
+		atomic_inc(&state->refs);
+		spin_unlock(&io_tree->lock);
+
+		lock_extent_bits(io_tree, state->start, state->end,
+				 0, &cached_state);
+		clear_extent_bit(io_tree, state->start, state->end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+		free_extent_state(state);
+
+		spin_lock(&io_tree->lock);
+	}
+	spin_unlock(&io_tree->lock);
+}
+
 void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
@@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
 
 	trace_btrfs_inode_evict(inode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	evict_inode_truncate_pages(inode);
+
 	if (inode->i_nlink &&
 	    ((btrfs_root_refs(&root->root_item) != 0 &&
 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
@@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	int inode_evicting = inode->i_state & I_FREEING;
 
 	/*
 	 * we have the page locked, so new writeback can't start,
@@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+
+	if (!inode_evicting)
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
 		/*
 		 * IO on this page will never be started, so we need
 		 * to account for any ordered extents now
 		 */
-		clear_extent_bit(tree, page_start, page_end,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
-				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
+		if (!inode_evicting)
+			clear_extent_bit(tree, page_start, page_end,
+					 EXTENT_DIRTY | EXTENT_DELALLOC |
+					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+					 EXTENT_DEFRAG, 1, 0, &cached_state,
+					 GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 				btrfs_finish_ordered_io(ordered);
 		}
 		btrfs_put_ordered_extent(ordered);
-		cached_state = NULL;
-		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+		if (!inode_evicting) {
+			cached_state = NULL;
+			lock_extent_bits(tree, page_start, page_end, 0,
+					 &cached_state);
+		}
+	}
+
+	if (!inode_evicting) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_LOCKED | EXTENT_DIRTY |
+				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 1,
+				 &cached_state, GFP_NOFS);
+
+		__btrfs_releasepage(page, GFP_NOFS);
 	}
-	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
-		 &cached_state, GFP_NOFS);
-	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
 	if (PagePrivate(page)) {
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Liu Bo

2013-Dec-16 09:27 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David Borba Manana wrote:
> The inode eviction can be very slow, because during eviction we
> tell the VFS to truncate all of the inode's pages. This results
> in calls to btrfs_invalidatepage() which in turn does calls to
> lock_extent_bits() and clear_extent_bit(). These calls result in
> too many merges and splits of extent_state structures, which
> consume a lot of time and cpu when the inode has many pages. In
> some scenarios I have experienced umount times higher than 15
> minutes, even when there's no pending IO (after a btrfs fs sync).
> 
> A quick way to reproduce this issue:
> 
> $ mkfs.btrfs -f /dev/sdb3
> $ mount /dev/sdb3 /mnt/btrfs
> $ cd /mnt/btrfs
> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>     --file-test-mode=seqwr --num-threads=128 \
>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> $ time btrfs fi sync .
> FSSync '.'
> 
> real	0m25.457s
> user	0m0.000s
> sys	0m0.092s
> $ cd ..
> $ time umount /mnt/btrfs
> 
> real	1m38.234s
> user	0m0.000s
> sys	1m25.760s
> 
What about the time of umount after 'sync'?

The following ext4 test uses 'sync' while the btrfs test uses
'btrfs filesystem sync'.

I don't think they are the same thing.

-liubo
> The same test on ext4 runs much faster:
> 
> $ mkfs.ext4 /dev/sdb3
> $ mount /dev/sdb3 /mnt/ext4
> $ cd /mnt/ext4
> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>     --file-test-mode=seqwr --num-threads=128 \
>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> $ sync
> $ cd ..
> $ time umount /mnt/ext4
> 
> real	0m3.626s
> user	0m0.004s
> sys	0m3.012s
> 
> After this patch, the unmount (inode evictions) is much faster:
> 
> $ mkfs.btrfs -f /dev/sdb3
> $ mount /dev/sdb3 /mnt/btrfs
> $ cd /mnt/btrfs
> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>     --file-test-mode=seqwr --num-threads=128 \
>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> $ time btrfs fi sync .
> FSSync ''.''
> 
> real	0m26.774s
> user	0m0.000s
> sys	0m0.084s
> $ cd ..
> $ time umount /mnt/btrfs
> 
> real	0m1.811s
> user	0m0.000s
> sys	0m1.564s
> 
> Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
> ---
>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 84 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 5a5de36..e889779 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry,
struct iattr *attr)
>  	return err;
>  }
>  
> +/*
> + * While truncating the inode pages during eviction, we get the VFS
calling
> + * btrfs_invalidatepage() against each page of the inode. This is slow
because
> + * the calls to btrfs_invalidatepage() result in a huge amount of calls to
> + * lock_extent_bits() and clear_extent_bit(), which keep merging and
splitting
> + * extent_state structures over and over, wasting lots of time.
> + *
> + * Therefore if the inode is being evicted, let btrfs_invalidatepage()
skip all
> + * those expensive operations on a per page basis and do only the ordered
io
> + * finishing, while we release here the extent_map and extent_state
structures,
> + * without the excessive merging and splitting.
> + */
> +static void evict_inode_truncate_pages(struct inode *inode)
> +{
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
> +	struct rb_node *node;
> +
> +	ASSERT(inode->i_state & I_FREEING);
> +	truncate_inode_pages(&inode->i_data, 0);
> +
> +	write_lock(&map_tree->lock);
> +	while (!RB_EMPTY_ROOT(&map_tree->map)) {
> +		struct extent_map *em;
> +
> +		node = rb_first(&map_tree->map);
> +		em = rb_entry(node, struct extent_map, rb_node);
> +		remove_extent_mapping(map_tree, em);
> +		free_extent_map(em);
> +	}
> +	write_unlock(&map_tree->lock);
> +
> +	spin_lock(&io_tree->lock);
> +	while (!RB_EMPTY_ROOT(&io_tree->state)) {
> +		struct extent_state *state;
> +		struct extent_state *cached_state = NULL;
> +
> +		node = rb_first(&io_tree->state);
> +		state = rb_entry(node, struct extent_state, rb_node);
> +		atomic_inc(&state->refs);
> +		spin_unlock(&io_tree->lock);
> +
> +		lock_extent_bits(io_tree, state->start, state->end,
> +				 0, &cached_state);
> +		clear_extent_bit(io_tree, state->start, state->end,
> +				 EXTENT_LOCKED | EXTENT_DIRTY |
> +				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> +				 EXTENT_DEFRAG, 1, 1,
> +				 &cached_state, GFP_NOFS);
> +		free_extent_state(state);
> +
> +		spin_lock(&io_tree->lock);
> +	}
> +	spin_unlock(&io_tree->lock);
> +}
> +
>  void btrfs_evict_inode(struct inode *inode)
>  {
>  	struct btrfs_trans_handle *trans;
> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
>  
>  	trace_btrfs_inode_evict(inode);
>  
> -	truncate_inode_pages(&inode->i_data, 0);
> +	evict_inode_truncate_pages(inode);
> +
>  	if (inode->i_nlink &&
>  	    ((btrfs_root_refs(&root->root_item) != 0 &&
>  	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
> @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page *page,
unsigned int offset,
>  	struct extent_state *cached_state = NULL;
>  	u64 page_start = page_offset(page);
>  	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
> +	int inode_evicting = inode->i_state & I_FREEING;
>  
>  	/*
>  	 * we have the page locked, so new writeback can''t start,
> @@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page *page,
unsigned int offset,
>  		btrfs_releasepage(page, GFP_NOFS);
>  		return;
>  	}
> -	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
> -	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
> +
> +	if (!inode_evicting)
> +		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
> +	ordered = btrfs_lookup_ordered_extent(inode, page_start);
>  	if (ordered) {
>  		/*
>  		 * IO on this page will never be started, so we need
>  		 * to account for any ordered extents now
>  		 */
> -		clear_extent_bit(tree, page_start, page_end,
> -				 EXTENT_DIRTY | EXTENT_DELALLOC |
> -				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
> -				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
> +		if (!inode_evicting)
> +			clear_extent_bit(tree, page_start, page_end,
> +					 EXTENT_DIRTY | EXTENT_DELALLOC |
> +					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
> +					 EXTENT_DEFRAG, 1, 0, &cached_state,
> +					 GFP_NOFS);
>  		/*
>  		 * whoever cleared the private bit is responsible
>  		 * for the finish_ordered_io
> @@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page *page,
unsigned int offset,
>  				btrfs_finish_ordered_io(ordered);
>  		}
>  		btrfs_put_ordered_extent(ordered);
> -		cached_state = NULL;
> -		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
> +		if (!inode_evicting) {
> +			cached_state = NULL;
> +			lock_extent_bits(tree, page_start, page_end, 0,
> +					 &cached_state);
> +		}
> +	}
> +
> +	if (!inode_evicting) {
> +		clear_extent_bit(tree, page_start, page_end,
> +				 EXTENT_LOCKED | EXTENT_DIRTY |
> +				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
> +				 EXTENT_DEFRAG, 1, 1,
> +				 &cached_state, GFP_NOFS);
> +
> +		__btrfs_releasepage(page, GFP_NOFS);
>  	}
> -	clear_extent_bit(tree, page_start, page_end,
> -		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
> -		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
> -		 &cached_state, GFP_NOFS);
> -	__btrfs_releasepage(page, GFP_NOFS);
>  
>  	ClearPageChecked(page);
>  	if (PagePrivate(page)) {
> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Filipe David Manana

2013-Dec-16 11:05 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Mon, Dec 16, 2013 at 9:27 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David Borba Manana wrote:
>> The inode eviction can be very slow, because during eviction we
>> tell the VFS to truncate all of the inode''s pages. This
results
>> in calls to btrfs_invalidatepage() which in turn does calls to
>> lock_extent_bits() and clear_extent_bit(). These calls result in
>> too many merges and splits of extent_state structures, which
>> consume a lot of time and cpu when the inode has many pages. In
>> some scenarios I have experienced umount times higher than 15
>> minutes, even when there''s no pending IO (after a btrfs fs
sync).
>>
>> A quick way to reproduce this issue:
>>
>> $ mkfs.btrfs -f /dev/sdb3
>> $ mount /dev/sdb3 /mnt/btrfs
>> $ cd /mnt/btrfs
>> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>>     --file-test-mode=seqwr --num-threads=128 \
>>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> $ time btrfs fi sync .
>> FSSync ''.''
>>
>> real  0m25.457s
>> user  0m0.000s
>> sys   0m0.092s
>> $ cd ..
>> $ time umount /mnt/btrfs
>>
>> real  1m38.234s
>> user  0m0.000s
>> sys   1m25.760s
>>
>
> What about the time of umount after 'sync'?

Same huge difference.
Thanks.

>
> The following ext4 uses sync while btrfs uses 'btrfs filesystem sync'.
>
> I don't think they are the same thing.
>
> -liubo
>
>> The same test on ext4 runs much faster:
>>
>> $ mkfs.ext4 /dev/sdb3
>> $ mount /dev/sdb3 /mnt/ext4
>> $ cd /mnt/ext4
>> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>>     --file-test-mode=seqwr --num-threads=128 \
>>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> $ sync
>> $ cd ..
>> $ time umount /mnt/ext4
>>
>> real  0m3.626s
>> user  0m0.004s
>> sys   0m3.012s
>>
>> After this patch, the unmount (inode evictions) is much faster:
>>
>> $ mkfs.btrfs -f /dev/sdb3
>> $ mount /dev/sdb3 /mnt/btrfs
>> $ cd /mnt/btrfs
>> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
>>     --file-test-mode=seqwr --num-threads=128 \
>>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> $ time btrfs fi sync .
>> FSSync ''.''
>>
>> real  0m26.774s
>> user  0m0.000s
>> sys   0m0.084s
>> $ cd ..
>> $ time umount /mnt/btrfs
>>
>> real  0m1.811s
>> user  0m0.000s
>> sys   0m1.564s
>
>>
>> Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
>> ---
>>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 84 insertions(+), 14 deletions(-)
>>
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 5a5de36..e889779 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry *dentry,
struct iattr *attr)
>>       return err;
>>  }
>>
>> +/*
>> + * While truncating the inode pages during eviction, we get the VFS
calling
>> + * btrfs_invalidatepage() against each page of the inode. This is slow
because
>> + * the calls to btrfs_invalidatepage() result in a huge amount of
calls to
>> + * lock_extent_bits() and clear_extent_bit(), which keep merging and
splitting
>> + * extent_state structures over and over, wasting lots of time.
>> + *
>> + * Therefore if the inode is being evicted, let btrfs_invalidatepage()
skip all
>> + * those expensive operations on a per page basis and do only the
ordered io
>> + * finishing, while we release here the extent_map and extent_state
structures,
>> + * without the excessive merging and splitting.
>> + */
>> +static void evict_inode_truncate_pages(struct inode *inode)
>> +{
>> +     struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
>> +     struct extent_map_tree *map_tree =
&BTRFS_I(inode)->extent_tree;
>> +     struct rb_node *node;
>> +
>> +     ASSERT(inode->i_state & I_FREEING);
>> +     truncate_inode_pages(&inode->i_data, 0);
>> +
>> +     write_lock(&map_tree->lock);
>> +     while (!RB_EMPTY_ROOT(&map_tree->map)) {
>> +             struct extent_map *em;
>> +
>> +             node = rb_first(&map_tree->map);
>> +             em = rb_entry(node, struct extent_map, rb_node);
>> +             remove_extent_mapping(map_tree, em);
>> +             free_extent_map(em);
>> +     }
>> +     write_unlock(&map_tree->lock);
>> +
>> +     spin_lock(&io_tree->lock);
>> +     while (!RB_EMPTY_ROOT(&io_tree->state)) {
>> +             struct extent_state *state;
>> +             struct extent_state *cached_state = NULL;
>> +
>> +             node = rb_first(&io_tree->state);
>> +             state = rb_entry(node, struct extent_state, rb_node);
>> +             atomic_inc(&state->refs);
>> +             spin_unlock(&io_tree->lock);
>> +
>> +             lock_extent_bits(io_tree, state->start, state->end,
>> +                              0, &cached_state);
>> +             clear_extent_bit(io_tree, state->start, state->end,
>> +                              EXTENT_LOCKED | EXTENT_DIRTY |
>> +                              EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
>> +                              EXTENT_DEFRAG, 1, 1,
>> +                              &cached_state, GFP_NOFS);
>> +             free_extent_state(state);
>> +
>> +             spin_lock(&io_tree->lock);
>> +     }
>> +     spin_unlock(&io_tree->lock);
>> +}
>> +
>>  void btrfs_evict_inode(struct inode *inode)
>>  {
>>       struct btrfs_trans_handle *trans;
>> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
>>
>>       trace_btrfs_inode_evict(inode);
>>
>> -     truncate_inode_pages(&inode->i_data, 0);
>> +     evict_inode_truncate_pages(inode);
>> +
>>       if (inode->i_nlink &&
>>           ((btrfs_root_refs(&root->root_item) != 0 &&
>>             root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
>> @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page
*page, unsigned int offset,
>>       struct extent_state *cached_state = NULL;
>>       u64 page_start = page_offset(page);
>>       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
>> +     int inode_evicting = inode->i_state & I_FREEING;
>>
>>       /*
>>        * we have the page locked, so new writeback can''t
start,
>> @@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct page
*page, unsigned int offset,
>>               btrfs_releasepage(page, GFP_NOFS);
>>               return;
>>       }
>> -     lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> -     ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
>> +
>> +     if (!inode_evicting)
>> +             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> +     ordered = btrfs_lookup_ordered_extent(inode, page_start);
>>       if (ordered) {
>>               /*
>>                * IO on this page will never be started, so we need
>>                * to account for any ordered extents now
>>                */
>> -             clear_extent_bit(tree, page_start, page_end,
>> -                              EXTENT_DIRTY | EXTENT_DELALLOC |
>> -                              EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
>> -                              EXTENT_DEFRAG, 1, 0, &cached_state,
GFP_NOFS);
>> +             if (!inode_evicting)
>> +                     clear_extent_bit(tree, page_start, page_end,
>> +                                      EXTENT_DIRTY | EXTENT_DELALLOC |
>> +                                      EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
>> +                                      EXTENT_DEFRAG, 1, 0,
&cached_state,
>> +                                      GFP_NOFS);
>>               /*
>>                * whoever cleared the private bit is responsible
>>                * for the finish_ordered_io
>> @@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct page
*page, unsigned int offset,
>>                               btrfs_finish_ordered_io(ordered);
>>               }
>>               btrfs_put_ordered_extent(ordered);
>> -             cached_state = NULL;
>> -             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> +             if (!inode_evicting) {
>> +                     cached_state = NULL;
>> +                     lock_extent_bits(tree, page_start, page_end, 0,
>> +                                      &cached_state);
>> +             }
>> +     }
>> +
>> +     if (!inode_evicting) {
>> +             clear_extent_bit(tree, page_start, page_end,
>> +                              EXTENT_LOCKED | EXTENT_DIRTY |
>> +                              EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
>> +                              EXTENT_DEFRAG, 1, 1,
>> +                              &cached_state, GFP_NOFS);
>> +
>> +             __btrfs_releasepage(page, GFP_NOFS);
>>       }
>> -     clear_extent_bit(tree, page_start, page_end,
>> -              EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
>> -              EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
>> -              &cached_state, GFP_NOFS);
>> -     __btrfs_releasepage(page, GFP_NOFS);
>>
>>       ClearPageChecked(page);
>>       if (PagePrivate(page)) {
>> --
>> 1.7.9.5
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe
linux-btrfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Liu Bo

2013-Dec-16 11:45 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Mon, Dec 16, 2013 at 11:05:31AM +0000, Filipe David Manana wrote:
> On Mon, Dec 16, 2013 at 9:27 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> > On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David Borba Manana wrote:
> >> The inode eviction can be very slow, because during eviction we
> >> tell the VFS to truncate all of the inode''s pages. This
results
> >> in calls to btrfs_invalidatepage() which in turn does calls to
> >> lock_extent_bits() and clear_extent_bit(). These calls result in
> >> too many merges and splits of extent_state structures, which
> >> consume a lot of time and cpu when the inode has many pages. In
> >> some scenarios I have experienced umount times higher than 15
> >> minutes, even when there''s no pending IO (after a btrfs
fs sync).
> >>
> >> A quick way to reproduce this issue:
> >>
> >> $ mkfs.btrfs -f /dev/sdb3
> >> $ mount /dev/sdb3 /mnt/btrfs
> >> $ cd /mnt/btrfs
> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
> >>     --file-test-mode=seqwr --num-threads=128 \
> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> >> $ time btrfs fi sync .
> >> FSSync ''.''
> >>
> >> real  0m25.457s
> >> user  0m0.000s
> >> sys   0m0.092s
> >> $ cd ..
> >> $ time umount /mnt/btrfs
> >>
> >> real  1m38.234s
> >> user  0m0.000s
> >> sys   1m25.760s
> >>
> >
> > What about the time of umount after 'sync'?
> 
> Same huge difference.
> Thanks.

I am not seeing such a huge difference with the latest btrfs, maybe because
your machine has considerably more memory.

time sync
FSSync '/mnt/btrfs'

real	0m17.006s
user	0m0.004s
sys	0m0.056s

time umount /mnt/btrfs

real	0m0.910s
user	0m0.003s
sys	0m0.715s

-liubo
> 
> >
> > The following ext4 uses sync while btrfs uses ''btrfs
filesystem sync''.
> >
> > I don''t think they are the same thing.
> >
> > -liubo
> >
> >> The same test on ext4 runs much faster:
> >>
> >> $ mkfs.ext4 /dev/sdb3
> >> $ mount /dev/sdb3 /mnt/ext4
> >> $ cd /mnt/ext4
> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
> >>     --file-test-mode=seqwr --num-threads=128 \
> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> >> $ sync
> >> $ cd ..
> >> $ time umount /mnt/ext4
> >>
> >> real  0m3.626s
> >> user  0m0.004s
> >> sys   0m3.012s
> >>
> >> After this patch, the unmount (inode evictions) is much faster:
> >>
> >> $ mkfs.btrfs -f /dev/sdb3
> >> $ mount /dev/sdb3 /mnt/btrfs
> >> $ cd /mnt/btrfs
> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G \
> >>     --file-test-mode=seqwr --num-threads=128 \
> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
> >> $ time btrfs fi sync .
> >> FSSync ''.''
> >>
> >> real  0m26.774s
> >> user  0m0.000s
> >> sys   0m0.084s
> >> $ cd ..
> >> $ time umount /mnt/btrfs
> >>
> >> real  0m1.811s
> >> user  0m0.000s
> >> sys   0m1.564s
> >
> >>
> >> Signed-off-by: Filipe David Borba Manana
<fdmanana@gmail.com>
> >> ---
> >>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
> >>  1 file changed, 84 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >> index 5a5de36..e889779 100644
> >> --- a/fs/btrfs/inode.c
> >> +++ b/fs/btrfs/inode.c
> >> @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry
*dentry, struct iattr *attr)
> >>       return err;
> >>  }
> >>
> >> +/*
> >> + * While truncating the inode pages during eviction, we get the
VFS calling
> >> + * btrfs_invalidatepage() against each page of the inode. This is
slow because
> >> + * the calls to btrfs_invalidatepage() result in a huge amount of
calls to
> >> + * lock_extent_bits() and clear_extent_bit(), which keep merging
and splitting
> >> + * extent_state structures over and over, wasting lots of time.
> >> + *
> >> + * Therefore if the inode is being evicted, let
btrfs_invalidatepage() skip all
> >> + * those expensive operations on a per page basis and do only the
ordered io
> >> + * finishing, while we release here the extent_map and
extent_state structures,
> >> + * without the excessive merging and splitting.
> >> + */
> >> +static void evict_inode_truncate_pages(struct inode *inode)
> >> +{
> >> +     struct extent_io_tree *io_tree =
&BTRFS_I(inode)->io_tree;
> >> +     struct extent_map_tree *map_tree =
&BTRFS_I(inode)->extent_tree;
> >> +     struct rb_node *node;
> >> +
> >> +     ASSERT(inode->i_state & I_FREEING);
> >> +     truncate_inode_pages(&inode->i_data, 0);
> >> +
> >> +     write_lock(&map_tree->lock);
> >> +     while (!RB_EMPTY_ROOT(&map_tree->map)) {
> >> +             struct extent_map *em;
> >> +
> >> +             node = rb_first(&map_tree->map);
> >> +             em = rb_entry(node, struct extent_map, rb_node);
> >> +             remove_extent_mapping(map_tree, em);
> >> +             free_extent_map(em);
> >> +     }
> >> +     write_unlock(&map_tree->lock);
> >> +
> >> +     spin_lock(&io_tree->lock);
> >> +     while (!RB_EMPTY_ROOT(&io_tree->state)) {
> >> +             struct extent_state *state;
> >> +             struct extent_state *cached_state = NULL;
> >> +
> >> +             node = rb_first(&io_tree->state);
> >> +             state = rb_entry(node, struct extent_state,
rb_node);
> >> +             atomic_inc(&state->refs);
> >> +             spin_unlock(&io_tree->lock);
> >> +
> >> +             lock_extent_bits(io_tree, state->start,
state->end,
> >> +                              0, &cached_state);
> >> +             clear_extent_bit(io_tree, state->start,
state->end,
> >> +                              EXTENT_LOCKED | EXTENT_DIRTY |
> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
> >> +                              EXTENT_DEFRAG, 1, 1,
> >> +                              &cached_state, GFP_NOFS);
> >> +             free_extent_state(state);
> >> +
> >> +             spin_lock(&io_tree->lock);
> >> +     }
> >> +     spin_unlock(&io_tree->lock);
> >> +}
> >> +
> >>  void btrfs_evict_inode(struct inode *inode)
> >>  {
> >>       struct btrfs_trans_handle *trans;
> >> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode *inode)
> >>
> >>       trace_btrfs_inode_evict(inode);
> >>
> >> -     truncate_inode_pages(&inode->i_data, 0);
> >> +     evict_inode_truncate_pages(inode);
> >> +
> >>       if (inode->i_nlink &&
> >>           ((btrfs_root_refs(&root->root_item) != 0
&&
> >>             root->root_key.objectid !=
BTRFS_ROOT_TREE_OBJECTID) ||
> >> @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct page
*page, unsigned int offset,
> >>       struct extent_state *cached_state = NULL;
> >>       u64 page_start = page_offset(page);
> >>       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
> >> +     int inode_evicting = inode->i_state & I_FREEING;
> >>
> >>       /*
> >>        * we have the page locked, so new writeback can''t
start,
> >> @@ -7394,17 +7452,21 @@ static void btrfs_invalidatepage(struct
page *page, unsigned int offset,
> >>               btrfs_releasepage(page, GFP_NOFS);
> >>               return;
> >>       }
> >> -     lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
> >> -     ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
> >> +
> >> +     if (!inode_evicting)
> >> +             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
> >> +     ordered = btrfs_lookup_ordered_extent(inode, page_start);
> >>       if (ordered) {
> >>               /*
> >>                * IO on this page will never be started, so we need
> >>                * to account for any ordered extents now
> >>                */
> >> -             clear_extent_bit(tree, page_start, page_end,
> >> -                              EXTENT_DIRTY | EXTENT_DELALLOC |
> >> -                              EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
> >> -                              EXTENT_DEFRAG, 1, 0,
&cached_state, GFP_NOFS);
> >> +             if (!inode_evicting)
> >> +                     clear_extent_bit(tree, page_start, page_end,
> >> +                                      EXTENT_DIRTY |
EXTENT_DELALLOC |
> >> +                                      EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
> >> +                                      EXTENT_DEFRAG, 1, 0,
&cached_state,
> >> +                                      GFP_NOFS);
> >>               /*
> >>                * whoever cleared the private bit is responsible
> >>                * for the finish_ordered_io
> >> @@ -7428,14 +7490,22 @@ static void btrfs_invalidatepage(struct
page *page, unsigned int offset,
> >>                               btrfs_finish_ordered_io(ordered);
> >>               }
> >>               btrfs_put_ordered_extent(ordered);
> >> -             cached_state = NULL;
> >> -             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
> >> +             if (!inode_evicting) {
> >> +                     cached_state = NULL;
> >> +                     lock_extent_bits(tree, page_start, page_end,
0,
> >> +                                      &cached_state);
> >> +             }
> >> +     }
> >> +
> >> +     if (!inode_evicting) {
> >> +             clear_extent_bit(tree, page_start, page_end,
> >> +                              EXTENT_LOCKED | EXTENT_DIRTY |
> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
> >> +                              EXTENT_DEFRAG, 1, 1,
> >> +                              &cached_state, GFP_NOFS);
> >> +
> >> +             __btrfs_releasepage(page, GFP_NOFS);
> >>       }
> >> -     clear_extent_bit(tree, page_start, page_end,
> >> -              EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
> >> -              EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
> >> -              &cached_state, GFP_NOFS);
> >> -     __btrfs_releasepage(page, GFP_NOFS);
> >>
> >>       ClearPageChecked(page);
> >>       if (PagePrivate(page)) {
> >> --
> >> 1.7.9.5
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe
linux-btrfs" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 
> -- 
> Filipe David Manana,
> 
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Filipe David Manana

2013-Dec-16 11:48 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Mon, Dec 16, 2013 at 11:45 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> On Mon, Dec 16, 2013 at 11:05:31AM +0000, Filipe David Manana wrote:
>> On Mon, Dec 16, 2013 at 9:27 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
>> > On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David Borba
Manana wrote:
>> >> The inode eviction can be very slow, because during eviction
we
>> >> tell the VFS to truncate all of the inode''s pages.
This results
>> >> in calls to btrfs_invalidatepage() which in turn does calls to
>> >> lock_extent_bits() and clear_extent_bit(). These calls result
in
>> >> too many merges and splits of extent_state structures, which
>> >> consume a lot of time and cpu when the inode has many pages.
In
>> >> some scenarios I have experienced umount times higher than 15
>> >> minutes, even when there''s no pending IO (after a
btrfs fs sync).
>> >>
>> >> A quick way to reproduce this issue:
>> >>
>> >> $ mkfs.btrfs -f /dev/sdb3
>> >> $ mount /dev/sdb3 /mnt/btrfs
>> >> $ cd /mnt/btrfs
>> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G
\
>> >>     --file-test-mode=seqwr --num-threads=128 \
>> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> >> $ time btrfs fi sync .
>> >> FSSync ''.''
>> >>
>> >> real  0m25.457s
>> >> user  0m0.000s
>> >> sys   0m0.092s
>> >> $ cd ..
>> >> $ time umount /mnt/btrfs
>> >>
>> >> real  1m38.234s
>> >> user  0m0.000s
>> >> sys   1m25.760s
>> >>
>> >
>> > What about the time of umount after ''sync''?
>>
>> Same huge difference.
>> Thanks.
>
> Not seeing that huge one with the latest btrfs, maybe because your memory is
> rather larger.

Not sure if I understand you.
Latest btrfs-next has this change integrated. Was the test below run with
it integrated? You would have to compare it with a build without this
change.

Thanks.
>
> time sync
> FSSync ''/mnt/btrfs''
>
> real    0m17.006s
> user    0m0.004s
> sys     0m0.056s
>
> time umount /mnt/btrfs
>
> real    0m0.910s
> user    0m0.003s
> sys     0m0.715s
>
> -liubo
>
>>
>> >
>> > The following ext4 uses sync while btrfs uses ''btrfs
filesystem sync''.
>> >
>> > I don''t think they are the same thing.
>> >
>> > -liubo
>> >
>> >> The same test on ext4 runs much faster:
>> >>
>> >> $ mkfs.ext4 /dev/sdb3
>> >> $ mount /dev/sdb3 /mnt/ext4
>> >> $ cd /mnt/ext4
>> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G
\
>> >>     --file-test-mode=seqwr --num-threads=128 \
>> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> >> $ sync
>> >> $ cd ..
>> >> $ time umount /mnt/ext4
>> >>
>> >> real  0m3.626s
>> >> user  0m0.004s
>> >> sys   0m3.012s
>> >>
>> >> After this patch, the unmount (inode evictions) is much
faster:
>> >>
>> >> $ mkfs.btrfs -f /dev/sdb3
>> >> $ mount /dev/sdb3 /mnt/btrfs
>> >> $ cd /mnt/btrfs
>> >> $ sysbench --test=fileio --file-num=128 --file-total-size=16G
\
>> >>     --file-test-mode=seqwr --num-threads=128 \
>> >>     --file-block-size=16384 --max-time=60 --max-requests=0 run
>> >> $ time btrfs fi sync .
>> >> FSSync ''.''
>> >>
>> >> real  0m26.774s
>> >> user  0m0.000s
>> >> sys   0m0.084s
>> >> $ cd ..
>> >> $ time umount /mnt/btrfs
>> >>
>> >> real  0m1.811s
>> >> user  0m0.000s
>> >> sys   0m1.564s
>> >
>> >>
>> >> Signed-off-by: Filipe David Borba Manana
<fdmanana@gmail.com>
>> >> ---
>> >>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
>> >>  1 file changed, 84 insertions(+), 14 deletions(-)
>> >>
>> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> >> index 5a5de36..e889779 100644
>> >> --- a/fs/btrfs/inode.c
>> >> +++ b/fs/btrfs/inode.c
>> >> @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct dentry
*dentry, struct iattr *attr)
>> >>       return err;
>> >>  }
>> >>
>> >> +/*
>> >> + * While truncating the inode pages during eviction, we get
the VFS calling
>> >> + * btrfs_invalidatepage() against each page of the inode.
This is slow because
>> >> + * the calls to btrfs_invalidatepage() result in a huge
amount of calls to
>> >> + * lock_extent_bits() and clear_extent_bit(), which keep
merging and splitting
>> >> + * extent_state structures over and over, wasting lots of
time.
>> >> + *
>> >> + * Therefore if the inode is being evicted, let
btrfs_invalidatepage() skip all
>> >> + * those expensive operations on a per page basis and do only
the ordered io
>> >> + * finishing, while we release here the extent_map and
extent_state structures,
>> >> + * without the excessive merging and splitting.
>> >> + */
>> >> +static void evict_inode_truncate_pages(struct inode *inode)
>> >> +{
>> >> +     struct extent_io_tree *io_tree =
&BTRFS_I(inode)->io_tree;
>> >> +     struct extent_map_tree *map_tree =
&BTRFS_I(inode)->extent_tree;
>> >> +     struct rb_node *node;
>> >> +
>> >> +     ASSERT(inode->i_state & I_FREEING);
>> >> +     truncate_inode_pages(&inode->i_data, 0);
>> >> +
>> >> +     write_lock(&map_tree->lock);
>> >> +     while (!RB_EMPTY_ROOT(&map_tree->map)) {
>> >> +             struct extent_map *em;
>> >> +
>> >> +             node = rb_first(&map_tree->map);
>> >> +             em = rb_entry(node, struct extent_map, rb_node);
>> >> +             remove_extent_mapping(map_tree, em);
>> >> +             free_extent_map(em);
>> >> +     }
>> >> +     write_unlock(&map_tree->lock);
>> >> +
>> >> +     spin_lock(&io_tree->lock);
>> >> +     while (!RB_EMPTY_ROOT(&io_tree->state)) {
>> >> +             struct extent_state *state;
>> >> +             struct extent_state *cached_state = NULL;
>> >> +
>> >> +             node = rb_first(&io_tree->state);
>> >> +             state = rb_entry(node, struct extent_state,
rb_node);
>> >> +             atomic_inc(&state->refs);
>> >> +             spin_unlock(&io_tree->lock);
>> >> +
>> >> +             lock_extent_bits(io_tree, state->start,
state->end,
>> >> +                              0, &cached_state);
>> >> +             clear_extent_bit(io_tree, state->start,
state->end,
>> >> +                              EXTENT_LOCKED | EXTENT_DIRTY |
>> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
>> >> +                              EXTENT_DEFRAG, 1, 1,
>> >> +                              &cached_state, GFP_NOFS);
>> >> +             free_extent_state(state);
>> >> +
>> >> +             spin_lock(&io_tree->lock);
>> >> +     }
>> >> +     spin_unlock(&io_tree->lock);
>> >> +}
>> >> +
>> >>  void btrfs_evict_inode(struct inode *inode)
>> >>  {
>> >>       struct btrfs_trans_handle *trans;
>> >> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode
*inode)
>> >>
>> >>       trace_btrfs_inode_evict(inode);
>> >>
>> >> -     truncate_inode_pages(&inode->i_data, 0);
>> >> +     evict_inode_truncate_pages(inode);
>> >> +
>> >>       if (inode->i_nlink &&
>> >>           ((btrfs_root_refs(&root->root_item) != 0
&&
>> >>             root->root_key.objectid !=
BTRFS_ROOT_TREE_OBJECTID) ||
>> >> @@ -7379,6 +7436,7 @@ static void btrfs_invalidatepage(struct
page *page, unsigned int offset,
>> >>       struct extent_state *cached_state = NULL;
>> >>       u64 page_start = page_offset(page);
>> >>       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
>> >> +     int inode_evicting = inode->i_state & I_FREEING;
>> >>
>> >>       /*
>> >>        * we have the page locked, so new writeback
can''t start,
>> >> @@ -7394,17 +7452,21 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
>> >>               btrfs_releasepage(page, GFP_NOFS);
>> >>               return;
>> >>       }
>> >> -     lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> >> -     ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
>> >> +
>> >> +     if (!inode_evicting)
>> >> +             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> >> +     ordered = btrfs_lookup_ordered_extent(inode,
page_start);
>> >>       if (ordered) {
>> >>               /*
>> >>                * IO on this page will never be started, so we
need
>> >>                * to account for any ordered extents now
>> >>                */
>> >> -             clear_extent_bit(tree, page_start, page_end,
>> >> -                              EXTENT_DIRTY | EXTENT_DELALLOC
|
>> >> -                              EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
>> >> -                              EXTENT_DEFRAG, 1, 0,
&cached_state, GFP_NOFS);
>> >> +             if (!inode_evicting)
>> >> +                     clear_extent_bit(tree, page_start,
page_end,
>> >> +                                      EXTENT_DIRTY |
EXTENT_DELALLOC |
>> >> +                                      EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
>> >> +                                      EXTENT_DEFRAG, 1, 0,
&cached_state,
>> >> +                                      GFP_NOFS);
>> >>               /*
>> >>                * whoever cleared the private bit is
responsible
>> >>                * for the finish_ordered_io
>> >> @@ -7428,14 +7490,22 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
>> >>                              
btrfs_finish_ordered_io(ordered);
>> >>               }
>> >>               btrfs_put_ordered_extent(ordered);
>> >> -             cached_state = NULL;
>> >> -             lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> >> +             if (!inode_evicting) {
>> >> +                     cached_state = NULL;
>> >> +                     lock_extent_bits(tree, page_start,
page_end, 0,
>> >> +                                      &cached_state);
>> >> +             }
>> >> +     }
>> >> +
>> >> +     if (!inode_evicting) {
>> >> +             clear_extent_bit(tree, page_start, page_end,
>> >> +                              EXTENT_LOCKED | EXTENT_DIRTY |
>> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
>> >> +                              EXTENT_DEFRAG, 1, 1,
>> >> +                              &cached_state, GFP_NOFS);
>> >> +
>> >> +             __btrfs_releasepage(page, GFP_NOFS);
>> >>       }
>> >> -     clear_extent_bit(tree, page_start, page_end,
>> >> -              EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC
|
>> >> -              EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
>> >> -              &cached_state, GFP_NOFS);
>> >> -     __btrfs_releasepage(page, GFP_NOFS);
>> >>
>> >>       ClearPageChecked(page);
>> >>       if (PagePrivate(page)) {
>> >> --
>> >> 1.7.9.5
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe
linux-btrfs" in
>> >> the body of a message to majordomo@vger.kernel.org
>> >> More majordomo info at 
http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>> --
>> Filipe David Manana,
>>
>> "Reasonable men adapt themselves to the world.
>>  Unreasonable men adapt the world to themselves.
>>  That''s why all progress depends on unreasonable men."


-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Liu Bo

2013-Dec-16 11:57 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Mon, Dec 16, 2013 at 11:48:08AM +0000, Filipe David Manana wrote:
> On Mon, Dec 16, 2013 at 11:45 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> > On Mon, Dec 16, 2013 at 11:05:31AM +0000, Filipe David Manana wrote:
> >> On Mon, Dec 16, 2013 at 9:27 AM, Liu Bo
<bo.li.liu@oracle.com> wrote:
> >> > On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David Borba
Manana wrote:
> >> >> The inode eviction can be very slow, because during
eviction we
> >> >> tell the VFS to truncate all of the inode''s
pages. This results
> >> >> in calls to btrfs_invalidatepage() which in turn does
calls to
> >> >> lock_extent_bits() and clear_extent_bit(). These calls
result in
> >> >> too many merges and splits of extent_state structures,
which
> >> >> consume a lot of time and cpu when the inode has many
pages. In
> >> >> some scenarios I have experienced umount times higher
than 15
> >> >> minutes, even when there''s no pending IO (after
a btrfs fs sync).
> >> >>
> >> >> A quick way to reproduce this issue:
> >> >>
> >> >> $ mkfs.btrfs -f /dev/sdb3
> >> >> $ mount /dev/sdb3 /mnt/btrfs
> >> >> $ cd /mnt/btrfs
> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
> >> >>     --file-test-mode=seqwr --num-threads=128 \
> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
> >> >> $ time btrfs fi sync .
> >> >> FSSync ''.''
> >> >>
> >> >> real  0m25.457s
> >> >> user  0m0.000s
> >> >> sys   0m0.092s
> >> >> $ cd ..
> >> >> $ time umount /mnt/btrfs
> >> >>
> >> >> real  1m38.234s
> >> >> user  0m0.000s
> >> >> sys   1m25.760s
> >> >>
> >> >
> >> > What about the time of umount after ''sync''?
> >>
> >> Same huge difference.
> >> Thanks.
> >
> > Not seeing that huge one with the latest btrfs, maybe because your
memory is
> > rather larger.
> 
> Not sure if I understand you.
> Latest btrfs-next has this change integrated. Was the test below with
> it integrated? You would have to compare it with a build without this
> change.
I'm testing the script with Chris's upstream repo, not btrfs-next, and umount
is normal.

It's possible that some patches merged in btrfs-next make umount's latency
longer than expected.

thanks,
-liubo
> 
> Thanks.
> 
> >
> > time sync
> > FSSync ''/mnt/btrfs''
> >
> > real    0m17.006s
> > user    0m0.004s
> > sys     0m0.056s
> >
> > time umount /mnt/btrfs
> >
> > real    0m0.910s
> > user    0m0.003s
> > sys     0m0.715s
> >
> > -liubo
> >
> >>
> >> >
> >> > The following ext4 uses sync while btrfs uses ''btrfs
filesystem sync''.
> >> >
> >> > I don''t think they are the same thing.
> >> >
> >> > -liubo
> >> >
> >> >> The same test on ext4 runs much faster:
> >> >>
> >> >> $ mkfs.ext4 /dev/sdb3
> >> >> $ mount /dev/sdb3 /mnt/ext4
> >> >> $ cd /mnt/ext4
> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
> >> >>     --file-test-mode=seqwr --num-threads=128 \
> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
> >> >> $ sync
> >> >> $ cd ..
> >> >> $ time umount /mnt/ext4
> >> >>
> >> >> real  0m3.626s
> >> >> user  0m0.004s
> >> >> sys   0m3.012s
> >> >>
> >> >> After this patch, the unmount (inode evictions) is much
faster:
> >> >>
> >> >> $ mkfs.btrfs -f /dev/sdb3
> >> >> $ mount /dev/sdb3 /mnt/btrfs
> >> >> $ cd /mnt/btrfs
> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
> >> >>     --file-test-mode=seqwr --num-threads=128 \
> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
> >> >> $ time btrfs fi sync .
> >> >> FSSync ''.''
> >> >>
> >> >> real  0m26.774s
> >> >> user  0m0.000s
> >> >> sys   0m0.084s
> >> >> $ cd ..
> >> >> $ time umount /mnt/btrfs
> >> >>
> >> >> real  0m1.811s
> >> >> user  0m0.000s
> >> >> sys   0m1.564s
> >> >
> >> >>
> >> >> Signed-off-by: Filipe David Borba Manana
<fdmanana@gmail.com>
> >> >> ---
> >> >>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
> >> >>  1 file changed, 84 insertions(+), 14 deletions(-)
> >> >>
> >> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >> >> index 5a5de36..e889779 100644
> >> >> --- a/fs/btrfs/inode.c
> >> >> +++ b/fs/btrfs/inode.c
> >> >> @@ -4488,6 +4488,62 @@ static int btrfs_setattr(struct
dentry *dentry, struct iattr *attr)
> >> >>       return err;
> >> >>  }
> >> >>
> >> >> +/*
> >> >> + * While truncating the inode pages during eviction, we
get the VFS calling
> >> >> + * btrfs_invalidatepage() against each page of the
inode. This is slow because
> >> >> + * the calls to btrfs_invalidatepage() result in a huge
amount of calls to
> >> >> + * lock_extent_bits() and clear_extent_bit(), which keep
merging and splitting
> >> >> + * extent_state structures over and over, wasting lots
of time.
> >> >> + *
> >> >> + * Therefore if the inode is being evicted, let
btrfs_invalidatepage() skip all
> >> >> + * those expensive operations on a per page basis and do
only the ordered io
> >> >> + * finishing, while we release here the extent_map and
extent_state structures,
> >> >> + * without the excessive merging and splitting.
> >> >> + */
> >> >> +static void evict_inode_truncate_pages(struct inode
*inode)
> >> >> +{
> >> >> +     struct extent_io_tree *io_tree =
&BTRFS_I(inode)->io_tree;
> >> >> +     struct extent_map_tree *map_tree =
&BTRFS_I(inode)->extent_tree;
> >> >> +     struct rb_node *node;
> >> >> +
> >> >> +     ASSERT(inode->i_state & I_FREEING);
> >> >> +     truncate_inode_pages(&inode->i_data, 0);
> >> >> +
> >> >> +     write_lock(&map_tree->lock);
> >> >> +     while (!RB_EMPTY_ROOT(&map_tree->map)) {
> >> >> +             struct extent_map *em;
> >> >> +
> >> >> +             node = rb_first(&map_tree->map);
> >> >> +             em = rb_entry(node, struct extent_map,
rb_node);
> >> >> +             remove_extent_mapping(map_tree, em);
> >> >> +             free_extent_map(em);
> >> >> +     }
> >> >> +     write_unlock(&map_tree->lock);
> >> >> +
> >> >> +     spin_lock(&io_tree->lock);
> >> >> +     while (!RB_EMPTY_ROOT(&io_tree->state)) {
> >> >> +             struct extent_state *state;
> >> >> +             struct extent_state *cached_state = NULL;
> >> >> +
> >> >> +             node = rb_first(&io_tree->state);
> >> >> +             state = rb_entry(node, struct extent_state,
rb_node);
> >> >> +             atomic_inc(&state->refs);
> >> >> +             spin_unlock(&io_tree->lock);
> >> >> +
> >> >> +             lock_extent_bits(io_tree, state->start,
state->end,
> >> >> +                              0, &cached_state);
> >> >> +             clear_extent_bit(io_tree, state->start,
state->end,
> >> >> +                              EXTENT_LOCKED |
EXTENT_DIRTY |
> >> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
> >> >> +                              EXTENT_DEFRAG, 1, 1,
> >> >> +                              &cached_state,
GFP_NOFS);
> >> >> +             free_extent_state(state);
> >> >> +
> >> >> +             spin_lock(&io_tree->lock);
> >> >> +     }
> >> >> +     spin_unlock(&io_tree->lock);
> >> >> +}
> >> >> +
> >> >>  void btrfs_evict_inode(struct inode *inode)
> >> >>  {
> >> >>       struct btrfs_trans_handle *trans;
> >> >> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct inode
*inode)
> >> >>
> >> >>       trace_btrfs_inode_evict(inode);
> >> >>
> >> >> -     truncate_inode_pages(&inode->i_data, 0);
> >> >> +     evict_inode_truncate_pages(inode);
> >> >> +
> >> >>       if (inode->i_nlink &&
> >> >>           ((btrfs_root_refs(&root->root_item) != 0
&&
> >> >>             root->root_key.objectid !=
BTRFS_ROOT_TREE_OBJECTID) ||
> >> >> @@ -7379,6 +7436,7 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
> >> >>       struct extent_state *cached_state = NULL;
> >> >>       u64 page_start = page_offset(page);
> >> >>       u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
> >> >> +     int inode_evicting = inode->i_state &
I_FREEING;
> >> >>
> >> >>       /*
> >> >>        * we have the page locked, so new writeback
can''t start,
> >> >> @@ -7394,17 +7452,21 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
> >> >>               btrfs_releasepage(page, GFP_NOFS);
> >> >>               return;
> >> >>       }
> >> >> -     lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
> >> >> -     ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
> >> >> +
> >> >> +     if (!inode_evicting)
> >> >> +             lock_extent_bits(tree, page_start,
page_end, 0, &cached_state);
> >> >> +     ordered = btrfs_lookup_ordered_extent(inode,
page_start);
> >> >>       if (ordered) {
> >> >>               /*
> >> >>                * IO on this page will never be started,
so we need
> >> >>                * to account for any ordered extents now
> >> >>                */
> >> >> -             clear_extent_bit(tree, page_start,
page_end,
> >> >> -                              EXTENT_DIRTY |
EXTENT_DELALLOC |
> >> >> -                              EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
> >> >> -                              EXTENT_DEFRAG, 1, 0,
&cached_state, GFP_NOFS);
> >> >> +             if (!inode_evicting)
> >> >> +                     clear_extent_bit(tree, page_start,
page_end,
> >> >> +                                      EXTENT_DIRTY |
EXTENT_DELALLOC |
> >> >> +                                      EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
> >> >> +                                      EXTENT_DEFRAG, 1,
0, &cached_state,
> >> >> +                                      GFP_NOFS);
> >> >>               /*
> >> >>                * whoever cleared the private bit is
responsible
> >> >>                * for the finish_ordered_io
> >> >> @@ -7428,14 +7490,22 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
> >> >>                              
btrfs_finish_ordered_io(ordered);
> >> >>               }
> >> >>               btrfs_put_ordered_extent(ordered);
> >> >> -             cached_state = NULL;
> >> >> -             lock_extent_bits(tree, page_start,
page_end, 0, &cached_state);
> >> >> +             if (!inode_evicting) {
> >> >> +                     cached_state = NULL;
> >> >> +                     lock_extent_bits(tree, page_start,
page_end, 0,
> >> >> +                                     
&cached_state);
> >> >> +             }
> >> >> +     }
> >> >> +
> >> >> +     if (!inode_evicting) {
> >> >> +             clear_extent_bit(tree, page_start,
page_end,
> >> >> +                              EXTENT_LOCKED |
EXTENT_DIRTY |
> >> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
> >> >> +                              EXTENT_DEFRAG, 1, 1,
> >> >> +                              &cached_state,
GFP_NOFS);
> >> >> +
> >> >> +             __btrfs_releasepage(page, GFP_NOFS);
> >> >>       }
> >> >> -     clear_extent_bit(tree, page_start, page_end,
> >> >> -              EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC |
> >> >> -              EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
1,
> >> >> -              &cached_state, GFP_NOFS);
> >> >> -     __btrfs_releasepage(page, GFP_NOFS);
> >> >>
> >> >>       ClearPageChecked(page);
> >> >>       if (PagePrivate(page)) {
> >> >> --
> >> >> 1.7.9.5
> >> >>
> >> >> --
> >> >> To unsubscribe from this list: send the line
"unsubscribe linux-btrfs" in
> >> >> the body of a message to majordomo@vger.kernel.org
> >> >> More majordomo info at 
http://vger.kernel.org/majordomo-info.html
> >>
> >>
> >>
> >> --
> >> Filipe David Manana,
> >>
> >> "Reasonable men adapt themselves to the world.
> >>  Unreasonable men adapt the world to themselves.
> >>  That''s why all progress depends on unreasonable
men."
> 
> 
> 
> -- 
> Filipe David Manana,
> 
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Filipe David Manana

2013-Dec-16 12:16 UTC

head link

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

On Mon, Dec 16, 2013 at 11:57 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> On Mon, Dec 16, 2013 at 11:48:08AM +0000, Filipe David Manana wrote:
>> On Mon, Dec 16, 2013 at 11:45 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
>> > On Mon, Dec 16, 2013 at 11:05:31AM +0000, Filipe David Manana
wrote:
>> >> On Mon, Dec 16, 2013 at 9:27 AM, Liu Bo
<bo.li.liu@oracle.com> wrote:
>> >> > On Tue, Nov 19, 2013 at 10:29:35PM +0000, Filipe David
Borba Manana wrote:
>> >> >> The inode eviction can be very slow, because during
eviction we
>> >> >> tell the VFS to truncate all of the inode''s
pages. This results
>> >> >> in calls to btrfs_invalidatepage() which in turn does
calls to
>> >> >> lock_extent_bits() and clear_extent_bit(). These
calls result in
>> >> >> too many merges and splits of extent_state
structures, which
>> >> >> consume a lot of time and cpu when the inode has many
pages. In
>> >> >> some scenarios I have experienced umount times higher
than 15
>> >> >> minutes, even when there''s no pending IO
(after a btrfs fs sync).
>> >> >>
>> >> >> A quick way to reproduce this issue:
>> >> >>
>> >> >> $ mkfs.btrfs -f /dev/sdb3
>> >> >> $ mount /dev/sdb3 /mnt/btrfs
>> >> >> $ cd /mnt/btrfs
>> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
>> >> >>     --file-test-mode=seqwr --num-threads=128 \
>> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
>> >> >> $ time btrfs fi sync .
>> >> >> FSSync ''.''
>> >> >>
>> >> >> real  0m25.457s
>> >> >> user  0m0.000s
>> >> >> sys   0m0.092s
>> >> >> $ cd ..
>> >> >> $ time umount /mnt/btrfs
>> >> >>
>> >> >> real  1m38.234s
>> >> >> user  0m0.000s
>> >> >> sys   1m25.760s
>> >> >>
>> >> >
>> >> > What about the time of umount after
''sync''?
>> >>
>> >> Same huge difference.
>> >> Thanks.
>> >
>> > Not seeing that huge one with the latest btrfs, maybe because your
memory is
>> > rather larger.
>>
>> Not sure if I understand you.
>> Latest btrfs-next has this change integrated. Was the test below with
>> it integrated? You would have to compare it with a build without this
>> change.
>
> I'm testing the script with Chris's upstream repo, not btrfs-next, and umount
> is normal.
>
> It's possible that some patches merged in btrfs-next make umount's latency
> longer than expected.

The umount example was just a simple way to show that inode eviction was
taking a long time while not waiting for or doing IO.
And yes, my test was performed on a machine with a large amount of RAM
(32 GB) compared to that test's total file size.

thanks
>
> thanks,
> -liubo
>
>>
>> Thanks.
>>
>> >
>> > time sync
>> > FSSync ''/mnt/btrfs''
>> >
>> > real    0m17.006s
>> > user    0m0.004s
>> > sys     0m0.056s
>> >
>> > time umount /mnt/btrfs
>> >
>> > real    0m0.910s
>> > user    0m0.003s
>> > sys     0m0.715s
>> >
>> > -liubo
>> >
>> >>
>> >> >
>> >> > The following ext4 uses sync while btrfs uses
''btrfs filesystem sync''.
>> >> >
>> >> > I don''t think they are the same thing.
>> >> >
>> >> > -liubo
>> >> >
>> >> >> The same test on ext4 runs much faster:
>> >> >>
>> >> >> $ mkfs.ext4 /dev/sdb3
>> >> >> $ mount /dev/sdb3 /mnt/ext4
>> >> >> $ cd /mnt/ext4
>> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
>> >> >>     --file-test-mode=seqwr --num-threads=128 \
>> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
>> >> >> $ sync
>> >> >> $ cd ..
>> >> >> $ time umount /mnt/ext4
>> >> >>
>> >> >> real  0m3.626s
>> >> >> user  0m0.004s
>> >> >> sys   0m3.012s
>> >> >>
>> >> >> After this patch, the unmount (inode evictions) is
much faster:
>> >> >>
>> >> >> $ mkfs.btrfs -f /dev/sdb3
>> >> >> $ mount /dev/sdb3 /mnt/btrfs
>> >> >> $ cd /mnt/btrfs
>> >> >> $ sysbench --test=fileio --file-num=128
--file-total-size=16G \
>> >> >>     --file-test-mode=seqwr --num-threads=128 \
>> >> >>     --file-block-size=16384 --max-time=60
--max-requests=0 run
>> >> >> $ time btrfs fi sync .
>> >> >> FSSync ''.''
>> >> >>
>> >> >> real  0m26.774s
>> >> >> user  0m0.000s
>> >> >> sys   0m0.084s
>> >> >> $ cd ..
>> >> >> $ time umount /mnt/btrfs
>> >> >>
>> >> >> real  0m1.811s
>> >> >> user  0m0.000s
>> >> >> sys   0m1.564s
>> >> >
>> >> >>
>> >> >> Signed-off-by: Filipe David Borba Manana
<fdmanana@gmail.com>
>> >> >> ---
>> >> >>  fs/btrfs/inode.c |   98
++++++++++++++++++++++++++++++++++++++++++++++--------
>> >> >>  1 file changed, 84 insertions(+), 14 deletions(-)
>> >> >>
>> >> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> >> >> index 5a5de36..e889779 100644
>> >> >> --- a/fs/btrfs/inode.c
>> >> >> +++ b/fs/btrfs/inode.c
>> >> >> @@ -4488,6 +4488,62 @@ static int
btrfs_setattr(struct dentry *dentry, struct iattr *attr)
>> >> >>       return err;
>> >> >>  }
>> >> >>
>> >> >> +/*
>> >> >> + * While truncating the inode pages during eviction,
we get the VFS calling
>> >> >> + * btrfs_invalidatepage() against each page of the
inode. This is slow because
>> >> >> + * the calls to btrfs_invalidatepage() result in a
huge amount of calls to
>> >> >> + * lock_extent_bits() and clear_extent_bit(), which
keep merging and splitting
>> >> >> + * extent_state structures over and over, wasting
lots of time.
>> >> >> + *
>> >> >> + * Therefore if the inode is being evicted, let
btrfs_invalidatepage() skip all
>> >> >> + * those expensive operations on a per page basis
and do only the ordered io
>> >> >> + * finishing, while we release here the extent_map
and extent_state structures,
>> >> >> + * without the excessive merging and splitting.
>> >> >> + */
>> >> >> +static void evict_inode_truncate_pages(struct inode
*inode)
>> >> >> +{
>> >> >> +     struct extent_io_tree *io_tree =
&BTRFS_I(inode)->io_tree;
>> >> >> +     struct extent_map_tree *map_tree =
&BTRFS_I(inode)->extent_tree;
>> >> >> +     struct rb_node *node;
>> >> >> +
>> >> >> +     ASSERT(inode->i_state & I_FREEING);
>> >> >> +     truncate_inode_pages(&inode->i_data, 0);
>> >> >> +
>> >> >> +     write_lock(&map_tree->lock);
>> >> >> +     while (!RB_EMPTY_ROOT(&map_tree->map)) {
>> >> >> +             struct extent_map *em;
>> >> >> +
>> >> >> +             node = rb_first(&map_tree->map);
>> >> >> +             em = rb_entry(node, struct extent_map,
rb_node);
>> >> >> +             remove_extent_mapping(map_tree, em);
>> >> >> +             free_extent_map(em);
>> >> >> +     }
>> >> >> +     write_unlock(&map_tree->lock);
>> >> >> +
>> >> >> +     spin_lock(&io_tree->lock);
>> >> >> +     while (!RB_EMPTY_ROOT(&io_tree->state))
{
>> >> >> +             struct extent_state *state;
>> >> >> +             struct extent_state *cached_state =
NULL;
>> >> >> +
>> >> >> +             node =
rb_first(&io_tree->state);
>> >> >> +             state = rb_entry(node, struct
extent_state, rb_node);
>> >> >> +             atomic_inc(&state->refs);
>> >> >> +             spin_unlock(&io_tree->lock);
>> >> >> +
>> >> >> +             lock_extent_bits(io_tree,
state->start, state->end,
>> >> >> +                              0, &cached_state);
>> >> >> +             clear_extent_bit(io_tree,
state->start, state->end,
>> >> >> +                              EXTENT_LOCKED |
EXTENT_DIRTY |
>> >> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
>> >> >> +                              EXTENT_DEFRAG, 1, 1,
>> >> >> +                              &cached_state,
GFP_NOFS);
>> >> >> +             free_extent_state(state);
>> >> >> +
>> >> >> +             spin_lock(&io_tree->lock);
>> >> >> +     }
>> >> >> +     spin_unlock(&io_tree->lock);
>> >> >> +}
>> >> >> +
>> >> >>  void btrfs_evict_inode(struct inode *inode)
>> >> >>  {
>> >> >>       struct btrfs_trans_handle *trans;
>> >> >> @@ -4498,7 +4554,8 @@ void btrfs_evict_inode(struct
inode *inode)
>> >> >>
>> >> >>       trace_btrfs_inode_evict(inode);
>> >> >>
>> >> >> -     truncate_inode_pages(&inode->i_data, 0);
>> >> >> +     evict_inode_truncate_pages(inode);
>> >> >> +
>> >> >>       if (inode->i_nlink &&
>> >> >>           ((btrfs_root_refs(&root->root_item)
!= 0 &&
>> >> >>             root->root_key.objectid !=
BTRFS_ROOT_TREE_OBJECTID) ||
>> >> >> @@ -7379,6 +7436,7 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
>> >> >>       struct extent_state *cached_state = NULL;
>> >> >>       u64 page_start = page_offset(page);
>> >> >>       u64 page_end = page_start + PAGE_CACHE_SIZE -
1;
>> >> >> +     int inode_evicting = inode->i_state &
I_FREEING;
>> >> >>
>> >> >>       /*
>> >> >>        * we have the page locked, so new writeback can't start,
>> >> >> @@ -7394,17 +7452,21 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
>> >> >>               btrfs_releasepage(page, GFP_NOFS);
>> >> >>               return;
>> >> >>       }
>> >> >> -     lock_extent_bits(tree, page_start, page_end, 0,
&cached_state);
>> >> >> -     ordered = btrfs_lookup_ordered_extent(inode,
page_offset(page));
>> >> >> +
>> >> >> +     if (!inode_evicting)
>> >> >> +             lock_extent_bits(tree, page_start,
page_end, 0, &cached_state);
>> >> >> +     ordered = btrfs_lookup_ordered_extent(inode,
page_start);
>> >> >>       if (ordered) {
>> >> >>               /*
>> >> >>                * IO on this page will never be
started, so we need
>> >> >>                * to account for any ordered extents
now
>> >> >>                */
>> >> >> -             clear_extent_bit(tree, page_start,
page_end,
>> >> >> -                              EXTENT_DIRTY |
EXTENT_DELALLOC |
>> >> >> -                              EXTENT_LOCKED |
EXTENT_DO_ACCOUNTING |
>> >> >> -                              EXTENT_DEFRAG, 1, 0,
&cached_state, GFP_NOFS);
>> >> >> +             if (!inode_evicting)
>> >> >> +                     clear_extent_bit(tree,
page_start, page_end,
>> >> >> +                                      EXTENT_DIRTY |
EXTENT_DELALLOC |
>> >> >> +                                      EXTENT_LOCKED
| EXTENT_DO_ACCOUNTING |
>> >> >> +                                      EXTENT_DEFRAG,
1, 0, &cached_state,
>> >> >> +                                      GFP_NOFS);
>> >> >>               /*
>> >> >>                * whoever cleared the private bit is
responsible
>> >> >>                * for the finish_ordered_io
>> >> >> @@ -7428,14 +7490,22 @@ static void
btrfs_invalidatepage(struct page *page, unsigned int offset,
>> >> >>                              
btrfs_finish_ordered_io(ordered);
>> >> >>               }
>> >> >>               btrfs_put_ordered_extent(ordered);
>> >> >> -             cached_state = NULL;
>> >> >> -             lock_extent_bits(tree, page_start,
page_end, 0, &cached_state);
>> >> >> +             if (!inode_evicting) {
>> >> >> +                     cached_state = NULL;
>> >> >> +                     lock_extent_bits(tree,
page_start, page_end, 0,
>> >> >> +                                     
&cached_state);
>> >> >> +             }
>> >> >> +     }
>> >> >> +
>> >> >> +     if (!inode_evicting) {
>> >> >> +             clear_extent_bit(tree, page_start,
page_end,
>> >> >> +                              EXTENT_LOCKED |
EXTENT_DIRTY |
>> >> >> +                              EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING |
>> >> >> +                              EXTENT_DEFRAG, 1, 1,
>> >> >> +                              &cached_state,
GFP_NOFS);
>> >> >> +
>> >> >> +             __btrfs_releasepage(page, GFP_NOFS);
>> >> >>       }
>> >> >> -     clear_extent_bit(tree, page_start, page_end,
>> >> >> -              EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC |
>> >> >> -              EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1, 1,
>> >> >> -              &cached_state, GFP_NOFS);
>> >> >> -     __btrfs_releasepage(page, GFP_NOFS);
>> >> >>
>> >> >>       ClearPageChecked(page);
>> >> >>       if (PagePrivate(page)) {
>> >> >> --
>> >> >> 1.7.9.5
>> >> >>
>> >> >> --
>> >> >> To unsubscribe from this list: send the line
"unsubscribe linux-btrfs" in
>> >> >> the body of a message to majordomo@vger.kernel.org
>> >> >> More majordomo info at 
http://vger.kernel.org/majordomo-info.html
>> >>
>> >>
>> >>
>> >> --
>> >> Filipe David Manana,
>> >>
>> >> "Reasonable men adapt themselves to the world.
>> >>  Unreasonable men adapt the world to themselves.
>> >>  That's why all progress depends on unreasonable men."
>>
>>
>>
>> --
>> Filipe David Manana,
>>
>> "Reasonable men adapt themselves to the world.
>>  Unreasonable men adapt the world to themselves.
>>  That's why all progress depends on unreasonable men."


-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Apparently Analogous Threads

Search for more reasonably related threads

Btrfs devel - Nov 2013 - [PATCH] Btrfs: fix very slow inode eviction and fs unmount

[PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Re: [PATCH] Btrfs: fix very slow inode eviction and fs unmount

Apparently Analogous Threads