thr3ads.net - Btrfs devel - [PATCH 1/2] Btrfs: cleanup extents after we finish logging inode [Aug 2012]

If this information is useful, please help other people find it:
Share via:

Liu Bo

2012-Aug-27 16:52 UTC

[PATCH 1/2] Btrfs: cleanup extents after we finish logging inode

This is based on Josef''s "Btrfs: turbo charge fsync".

We should cleanup those extents after we''ve finished logging inode,
otherwise we may do redundant work on them.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/tree-log.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5190cd6..e7365d7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3167,6 +3167,12 @@ next_slot:
 			err = ret;
 			goto out_unlock;
 		}
+	} else {
+		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_map *em, *n;
+
+		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+			list_del_init(&em->list);
 	}
 
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Liu Bo

2012-Aug-27 16:52 UTC

head link

[PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

This is based on Josef''s "Btrfs: turbo charge fsync".

The above Josef''s patch performs very good in random sync write test,
because we won''t have too much extents to merge.

However, it does not performs good on the test:
dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync

The reason is when we do sequencial sync write, we need to merge the
current extent just with the previous one, so that we can get accumulated
extents to log:

A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...

So we''ll have to flush more and more checksum into log tree, which is
the
bottleneck according to my tests.

But we can avoid this by telling fsync the real extents that are needed
to be logged.

With this, I did the above dd sync write test (size=50m),

         w/o (orig)   w/ (josef''s)   w/ (this)
SATA      104KB/s       109KB/s       121KB/s
ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/extent_map.c |   20 ++++++++++++++++++++
 fs/btrfs/extent_map.h |    2 ++
 fs/btrfs/inode.c      |    1 +
 fs/btrfs/tree-log.c   |    6 +++---
 4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1fe82cf..ac606f0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree,
struct extent_map *em)
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			if (merge->generation > em->generation) {
+				em->mod_start = em->start;
+				em->mod_len = em->len;
 				em->generation = merge->generation;
 				list_move(&em->list, &tree->modified_extents);
 			}
@@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree,
struct extent_map *em)
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		if (merge->generation > em->generation) {
+			em->mod_len = em->len;
 			em->generation = merge->generation;
 			list_move(&em->list, &tree->modified_extents);
 		}
@@ -247,6 +250,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64
start, u64 len,
 {
 	int ret = 0;
 	struct extent_map *em;
+	bool prealloc = false;
 
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
@@ -259,8 +263,21 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64
start, u64 len,
 	list_move(&em->list, &tree->modified_extents);
 	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		prealloc = true;
+		clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+	}
 
 	try_merge_map(tree, em);
+
+	if (prealloc) {
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+	}
+
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -298,6 +315,9 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	}
 	atomic_inc(&em->refs);
 
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
 	try_merge_map(tree, em);
 out:
 	return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 2388a60..8e6294b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -20,6 +20,8 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 mod_start;
+	u64 mod_len;
 	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e887d10..8879e46e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1308,6 +1308,7 @@ out_check:
 			em->block_start = disk_bytenr;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 			while (1) {
 				write_lock(&em_tree->lock);
 				ret = add_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e7365d7..e70cdad 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2833,8 +2833,8 @@ static int log_one_extent(struct btrfs_trans_handle
*trans,
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
-	u64 start = em->start;
-	u64 len = em->len;
+	u64 start = em->mod_start;
+	u64 len = em->mod_len;
 	u64 num_bytes;
 	int nritems;
 	int ret;
@@ -2970,7 +2970,7 @@ static int btrfs_log_changed_extents(struct
btrfs_trans_handle *trans,
 		 * sequential then we need to copy the items we have and redo
 		 * our search
 		 */
-		if (args.nr && em->start != args.next_offset) {
+		if (args.nr && em->mod_start != args.next_offset) {
 			ret = copy_items(trans, log, dst_path, args.src,
 					 args.start_slot, args.nr,
 					 LOG_INODE_ALL);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2012-Aug-27 17:05 UTC

head link

Re: [PATCH 1/2] Btrfs: cleanup extents after we finish logging inode

On Mon, Aug 27, 2012 at 10:52:19AM -0600, Liu Bo wrote:> This is based on Josef''s "Btrfs: turbo charge fsync".
> 
> We should cleanup those extents after we''ve finished logging
inode,
> otherwise we may do redundant work on them.
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
>  fs/btrfs/tree-log.c |    6 ++++++
>  1 files changed, 6 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 5190cd6..e7365d7 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -3167,6 +3167,12 @@ next_slot:
>  			err = ret;
>  			goto out_unlock;
>  		}
> +	} else {
> +		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
> +		struct extent_map *em, *n;
> +
> +		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
> +			list_del_init(&em->list);
>  	}
>  
>  	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
> -- 
Ah thanks I had been meaning to do this but I kept forgetting.  I''ll
add it to
btrfs-next.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2012-Aug-27 17:12 UTC

head link

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

On Mon, Aug 27, 2012 at 10:52:20AM -0600, Liu Bo wrote:> This is based on Josef''s "Btrfs: turbo charge fsync".
> 
> The above Josef''s patch performs very good in random sync write
test,
> because we won''t have too much extents to merge.
> 
> However, it does not performs good on the test:
> dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync
> 
> The reason is when we do sequencial sync write, we need to merge the
> current extent just with the previous one, so that we can get accumulated
> extents to log:
> 
> A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...
> 
> So we''ll have to flush more and more checksum into log tree, which
is the
> bottleneck according to my tests.
> 
> But we can avoid this by telling fsync the real extents that are needed
> to be logged.
> 
> With this, I did the above dd sync write test (size=50m),
> 
>          w/o (orig)   w/ (josef''s)   w/ (this)
> SATA      104KB/s       109KB/s       121KB/s
> ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
>  fs/btrfs/extent_map.c |   20 ++++++++++++++++++++
>  fs/btrfs/extent_map.h |    2 ++
>  fs/btrfs/inode.c      |    1 +
>  fs/btrfs/tree-log.c   |    6 +++---
>  4 files changed, 26 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> index 1fe82cf..ac606f0 100644
> --- a/fs/btrfs/extent_map.c
> +++ b/fs/btrfs/extent_map.c
> @@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree *tree,
struct extent_map *em)
>  			em->block_start = merge->block_start;
>  			merge->in_tree = 0;
>  			if (merge->generation > em->generation) {
> +				em->mod_start = em->start;
> +				em->mod_len = em->len;
Shouldn''t this be

em->mod_start = merge->start;
em->mod_len += merge_len;
>  				em->generation = merge->generation;
>  				list_move(&em->list, &tree->modified_extents);
>  			}
> @@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree *tree,
struct extent_map *em)
>  		rb_erase(&merge->rb_node, &tree->map);
>  		merge->in_tree = 0;
>  		if (merge->generation > em->generation) {
> +			em->mod_len = em->len;
And this should be em->mod_len += em->len?

Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Liu Bo

2012-Aug-28 00:22 UTC

head link

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

On 08/28/2012 01:12 AM, Josef Bacik wrote:> On Mon, Aug 27, 2012 at 10:52:20AM -0600, Liu Bo wrote:
>> This is based on Josef''s "Btrfs: turbo charge
fsync".
>>
>> The above Josef''s patch performs very good in random sync
write test,
>> because we won''t have too much extents to merge.
>>
>> However, it does not performs good on the test:
>> dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync
>>
>> The reason is when we do sequencial sync write, we need to merge the
>> current extent just with the previous one, so that we can get
accumulated
>> extents to log:
>>
>> A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...
>>
>> So we''ll have to flush more and more checksum into log tree,
which is the
>> bottleneck according to my tests.
>>
>> But we can avoid this by telling fsync the real extents that are needed
>> to be logged.
>>
>> With this, I did the above dd sync write test (size=50m),
>>
>>          w/o (orig)   w/ (josef''s)   w/ (this)
>> SATA      104KB/s       109KB/s       121KB/s
>> ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)
>>
>> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
>> ---
>>  fs/btrfs/extent_map.c |   20 ++++++++++++++++++++
>>  fs/btrfs/extent_map.h |    2 ++
>>  fs/btrfs/inode.c      |    1 +
>>  fs/btrfs/tree-log.c   |    6 +++---
>>  4 files changed, 26 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
>> index 1fe82cf..ac606f0 100644
>> --- a/fs/btrfs/extent_map.c
>> +++ b/fs/btrfs/extent_map.c
>> @@ -203,6 +203,8 @@ static void try_merge_map(struct extent_map_tree
*tree, struct extent_map *em)
>>  			em->block_start = merge->block_start;
>>  			merge->in_tree = 0;
>>  			if (merge->generation > em->generation) {
>> +				em->mod_start = em->start;
>> +				em->mod_len = em->len;
> 
> Shouldn''t this be
> 
> em->mod_start = merge->start;
> em->mod_len += merge_len;
> 
They just do the same thing.

There is already a 
em->start = merge->start;
em->len += merge_len
>>  				em->generation = merge->generation;
>>  				list_move(&em->list, &tree->modified_extents);
>>  			}
>> @@ -222,6 +224,7 @@ static void try_merge_map(struct extent_map_tree
*tree, struct extent_map *em)
>>  		rb_erase(&merge->rb_node, &tree->map);
>>  		merge->in_tree = 0;
>>  		if (merge->generation > em->generation) {
>> +			em->mod_len = em->len;
> 
> And this should be em->mod_len += em->len?
> 
No, em->len has already contained the merge''s len.

thanks,
liubo

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2012-Aug-28 17:54 UTC

head link

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

On Mon, Aug 27, 2012 at 06:22:02PM -0600, Liu Bo wrote:> On 08/28/2012 01:12 AM, Josef Bacik wrote:
> > On Mon, Aug 27, 2012 at 10:52:20AM -0600, Liu Bo wrote:
> >> This is based on Josef''s "Btrfs: turbo charge
fsync".
> >>
> >> The above Josef''s patch performs very good in random sync
write test,
> >> because we won''t have too much extents to merge.
> >>
> >> However, it does not performs good on the test:
> >> dd if=/dev/zero of=foobar bs=4k count=12500 oflag=sync
> >>
> >> The reason is when we do sequencial sync write, we need to merge
the
> >> current extent just with the previous one, so that we can get
accumulated
> >> extents to log:
> >>
> >> A(4k) --> AA(8k) --> AAA(12k) --> AAAA(16k) ...
> >>
> >> So we''ll have to flush more and more checksum into log
tree, which is the
> >> bottleneck according to my tests.
> >>
> >> But we can avoid this by telling fsync the real extents that are
needed
> >> to be logged.
> >>
> >> With this, I did the above dd sync write test (size=50m),
> >>
> >>          w/o (orig)   w/ (josef''s)   w/ (this)
> >> SATA      104KB/s       109KB/s       121KB/s
> >> ramdisk   1.5MB/s       1.5MB/s       10.7MB/s (613%)
> >>
> >> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> >> ---
> >>  fs/btrfs/extent_map.c |   20 ++++++++++++++++++++
> >>  fs/btrfs/extent_map.h |    2 ++
> >>  fs/btrfs/inode.c      |    1 +
> >>  fs/btrfs/tree-log.c   |    6 +++---
> >>  4 files changed, 26 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> >> index 1fe82cf..ac606f0 100644
> >> --- a/fs/btrfs/extent_map.c
> >> +++ b/fs/btrfs/extent_map.c
> >> @@ -203,6 +203,8 @@ static void try_merge_map(struct
extent_map_tree *tree, struct extent_map *em)
> >>  			em->block_start = merge->block_start;
> >>  			merge->in_tree = 0;
> >>  			if (merge->generation > em->generation) {
> >> +				em->mod_start = em->start;
> >> +				em->mod_len = em->len;
> > 
> > Shouldn''t this be
> > 
> > em->mod_start = merge->start;
> > em->mod_len += merge_len;
> > 
> 
> They just do the same thing.
> 
> There is already a 
> em->start = merge->start;
> em->len += merge_len
> 
> >>  				em->generation = merge->generation;
> >>  				list_move(&em->list, &tree->modified_extents);
> >>  			}
> >> @@ -222,6 +224,7 @@ static void try_merge_map(struct
extent_map_tree *tree, struct extent_map *em)
> >>  		rb_erase(&merge->rb_node, &tree->map);
> >>  		merge->in_tree = 0;
> >>  		if (merge->generation > em->generation) {
> >> +			em->mod_len = em->len;
> > 
> > And this should be em->mod_len += em->len?
> > 
> 
> No, em->len has already contained the merge''s len.
>
Duh right sorry.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Btrfs devel - Aug 2012 - [PATCH 1/2] Btrfs: cleanup extents after we finish logging inode

[PATCH 1/2] Btrfs: cleanup extents after we finish logging inode

[PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

Re: [PATCH 1/2] Btrfs: cleanup extents after we finish logging inode

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want

Re: [PATCH 2/2] Btrfs: improve fsync by filtering extents that we want