This RFC patch is intended to make our extent state code more RCU friendly.
o Since IO-related extents in the tree will not be merged or changed,
we can safely clear the extents' lock flags at endio time.
o After clearing the flags, we merge adjacent extents whose states are
equal, which is where we may change the tree.
o If we manage to batch the merge part (for now we are conservative and do
one batch per bio), we reduce the write locking done in endio, which should
somewhat improve read performance.
o Since this is an RFC, it has only been run through xfstests; no further
testing has been done.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
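(Illustrative note, not part of the patch to be applied: below is a minimal
userspace sketch of the locking pattern the patch is aiming for. The endio
path only takes the tree lock for reading, clears/sets bits, and merely
records that a merge has become possible; one write-locked pass per bio then
does the actual merging. Everything in the sketch, including struct state,
clear_bits_ro() and process_merges(), is a made-up stand-in for illustration
and does not exist in btrfs.)

/*
 * Standalone model: pthread rwlock instead of the kernel rwlock_t, a flat
 * array instead of the rb-tree.  Compile with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>

#define BIT_LOCKED   (1 << 0)
#define BIT_UPTODATE (1 << 1)

struct state {
	unsigned long long start, end;	/* inclusive byte range */
	unsigned int bits;		/* LOCKED/UPTODATE flags */
	int live;			/* 0 once merged into a neighbour */
};

static pthread_rwlock_t tree_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct state tree[] = {
	{ 0,    4095,  BIT_LOCKED, 1 },
	{ 4096, 8191,  BIT_LOCKED, 1 },
	{ 8192, 12287, BIT_LOCKED, 1 },
};
static const int nr_states = 3;

/*
 * endio side: read lock only.  This leans on the same assumption the patch
 * makes: nobody merges or resizes a state that still has IO bits set, so
 * flipping its bits under the read lock is safe.
 */
static int clear_bits_ro(int i, unsigned int clear, unsigned int set)
{
	int merge = 0;

	pthread_rwlock_rdlock(&tree_lock);
	tree[i].bits = (tree[i].bits & ~clear) | set;
	/* only note that a merge became possible, do not touch the tree */
	if (i > 0 && tree[i - 1].live && tree[i - 1].bits == tree[i].bits)
		merge = 1;
	if (i + 1 < nr_states && tree[i + 1].live &&
	    tree[i + 1].bits == tree[i].bits)
		merge = 1;
	pthread_rwlock_unlock(&tree_lock);
	return merge;
}

/* one batched pass per bio: take the write lock once, merge neighbours */
static void process_merges(void)
{
	int i;

	pthread_rwlock_wrlock(&tree_lock);
	for (i = 1; i < nr_states; i++) {
		if (tree[i].live && tree[i - 1].live &&
		    tree[i].bits == tree[i - 1].bits &&
		    tree[i - 1].end + 1 == tree[i].start) {
			tree[i].start = tree[i - 1].start;
			tree[i - 1].live = 0;
		}
	}
	pthread_rwlock_unlock(&tree_lock);
}

int main(void)
{
	int i, need_merge = 0;

	/* "read completion" for each page range: unlock it, mark it uptodate */
	for (i = 0; i < nr_states; i++)
		need_merge |= clear_bits_ro(i, BIT_LOCKED, BIT_UPTODATE);

	if (need_merge)
		process_merges();

	for (i = 0; i < nr_states; i++)
		if (tree[i].live)
			printf("[%llu, %llu] bits=0x%x\n",
			       tree[i].start, tree[i].end, tree[i].bits);
	return 0;
}

The actual patch gets the same effect with set_extent_bit_ro() and
clear_extent_bit_ro() returning EXTENT_MERGE, and end_bio_extent_readpage()
calling process_merge_state() once per bio.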
fs/btrfs/extent_io.c | 434 +++++++++++++++++++++++++++++++++++++++++++-------
fs/btrfs/extent_io.h | 5 +-
2 files changed, 382 insertions(+), 57 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9d..0cced4d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -110,7 +110,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
tree->ops = NULL;
tree->dirty_bytes = 0;
- spin_lock_init(&tree->lock);
+ rwlock_init(&tree->lock);
spin_lock_init(&tree->buffer_lock);
tree->mapping = mapping;
}
@@ -391,17 +391,19 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
return 0;
}
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state,
- int *bits, int wake)
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
+#define RCU_FRIENDLY_CODE 1
+#if RCU_FRIENDLY_CODE
+static int __clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state, int *bits, int wake)
{
int bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret = state->state & bits_to_clear;
@@ -415,6 +417,12 @@ static int clear_state_bit(struct extent_io_tree *tree,
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
+ return ret;
+}
+
+static void try_free_or_merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
if (state->state == 0) {
if (state->tree) {
rb_erase(&state->rb_node, &tree->state);
@@ -426,19 +434,304 @@ static int clear_state_bit(struct extent_io_tree *tree,
} else {
merge_state(tree, state);
}
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up any one waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).
+ *
+ * If no bits are set on the state struct after clearing things, the
+ * struct is freed and removed from the tree
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+ struct extent_state *state, int *bits, int wake)
+{
+ int ret;
+
+ ret = __clear_state_bit(tree, state, bits, wake);
+ try_free_or_merge_state(tree, state);
+
return ret;
}
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
+static noinline void batch_merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
{
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
+ struct extent_state *other;
+ struct rb_node *other_node;
- return prealloc;
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ return;
+
+ while (1) {
+ other_node = rb_prev(&state->rb_node);
+ if (!other_node)
+ break;
+
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ state->start = other->start;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
+ } else {
+ break;
+ }
+ }
+
+ while (1) {
+ other_node = rb_next(&state->rb_node);
+ if (!other_node)
+ break;
+
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state) {
+ merge_cb(tree, state, other);
+ state->end = other->end;
+ other->tree = NULL;
+ rb_erase(&other->rb_node, &tree->state);
+ free_extent_state(other);
+ } else {
+ break;
+ }
+ }
}
/*
+ * return 1 if the caller should follow up with process_merge_state(),
+ * otherwise just go ahead.
+ */
+static int test_merge_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *other;
+ struct rb_node *other_node;
+
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ return 0;
+
+ other_node = rb_prev(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->end == state->start - 1 &&
+ other->state == state->state)
+ return 1;
+ }
+ other_node = rb_next(&state->rb_node);
+ if (other_node) {
+ other = rb_entry(other_node, struct extent_state, rb_node);
+ if (other->start == state->end + 1 &&
+ other->state == state->state)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void process_merge_state(struct extent_io_tree *tree, u64 start)
+{
+ struct extent_state *state = NULL;
+ struct rb_node *node = NULL;
+
+ if (!tree || start == (u64)-1) {
+ WARN_ON(1);
+ return;
+ }
+
+ write_lock(&tree->lock);
+ node = tree_search(tree, start);
+ if (!node) {
+ printk(KERN_INFO "write side: no states found"
+ " to merge %llu\n", start);
+ goto out;
+ }
+ state = rb_entry(node, struct extent_state, rb_node);
+ /* should merge all states around this one */
+ batch_merge_state(tree, state);
+out:
+ write_unlock(&tree->lock);
+}
+
+int clear_extent_bit_ro(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
+{
+ struct extent_state *state;
+ struct extent_state *cached;
+ struct rb_node *next_node;
+ struct rb_node *node;
+ u64 last_end;
+ int set = 0;
+ int clear = 0;
+ int merge = 0;
+
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+ bits |= EXTENT_FIRST_DELALLOC;
+
+ if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+ clear = 1;
+
+again:
+ read_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+
+ if (clear) {
+ *cached_state = NULL;
+ cached_state = NULL;
+ }
+
+ if (cached && cached->tree && cached->start <= start &&
+ cached->end > start) {
+ if (clear)
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ if (clear)
+ free_extent_state(cached);
+ }
+ /*
+ * this search will find the extents that end after
+ * our range starts
+ */
+ node = tree_search(tree, start);
+ if (!node)
+ goto out;
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ if (state->start > end)
+ goto out;
+ WARN_ON(state->start < start || state->end > end);
+ WARN_ON(state->end < start);
+ last_end = state->end;
+
+ if (state->end < end && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
+ set |= __clear_state_bit(tree, state, &bits, wake);
+
+ /* batch merge ones */
+ merge = test_merge_state(tree, state);
+
+ if (last_end == (u64)-1)
+ goto out;
+ start = last_end + 1;
+ if (start <= end && next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+
+out:
+ read_unlock(&tree->lock);
+
+ if (merge)
+ set |= EXTENT_MERGE;
+ return set;
+
+search_again:
+ if (start > end)
+ goto out;
+ read_unlock(&tree->lock);
+ if (mask & __GFP_WAIT)
+ cond_resched();
+ goto again;
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr);
+
+int set_extent_bit_ro(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ struct extent_state *state;
+ struct rb_node *node;
+ int err = 0;
+ u64 last_start;
+ u64 last_end;
+
+ bits |= EXTENT_FIRST_DELALLOC;
+ WARN_ON(!(mask & GFP_ATOMIC));
+again:
+ read_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start <= start && state->end > start &&
+ state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
+ /*
+ * this search will find all the extents that end after
+ * our range starts.
+ */
+ node = tree_search(tree, start);
+ BUG_ON(!node);
+ state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+ last_start = state->start;
+ last_end = state->end;
+
+ /*
+ * | ---- desired range ---- |
+ * | state |
+ *
+ * Just lock what we found and keep going
+ */
+ if (state->start == start && state->end <= end) {
+ struct rb_node *next_node;
+ if (state->state & exclusive_bits) {
+ *failed_start = state->start;
+ err = -EEXIST;
+ goto out;
+ }
+
+ set_state_bits(tree, state, &bits);
+
+ cache_state(state, cached_state);
+
+ if (last_end == (u64)-1)
+ goto out;
+
+ start = last_end + 1;
+ if (state->end < end)
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+ if (next_node && start < end) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ goto search_again;
+ }
+ WARN_ON(1);
+ goto search_again;
+out:
+ read_unlock(&tree->lock);
+ return err;
+search_again:
+ if (start > end)
+ goto out;
+ read_unlock(&tree->lock);
+ goto again;
+}
+#endif
+
+/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
* indicate which allocations or sleeping are allowed.
@@ -479,7 +772,7 @@ again:
return -ENOMEM;
}
- spin_lock(&tree->lock);
+ write_lock(&tree->lock);
if (cached_state) {
cached = *cached_state;
@@ -582,7 +875,7 @@ hit_next:
goto search_again;
out:
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
@@ -591,7 +884,7 @@ out:
search_again:
if (start > end)
goto out;
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
goto again;
@@ -604,9 +897,9 @@ static int wait_on_state(struct extent_io_tree *tree,
{
DEFINE_WAIT(wait);
prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
schedule();
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
finish_wait(&state->wq, &wait);
return 0;
}
@@ -621,7 +914,7 @@ int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
struct extent_state *state;
struct rb_node *node;
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
again:
while (1) {
/*
@@ -649,10 +942,12 @@ again:
if (start > end)
break;
- cond_resched_lock(&tree->lock);
+ read_unlock(&tree->lock);
+ cond_resched();
+ read_lock(&tree->lock);
}
out:
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return 0;
}
@@ -719,7 +1014,7 @@ again:
BUG_ON(!prealloc);
}
- spin_lock(&tree->lock);
+ write_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start <= start && state->end > start &&
@@ -880,7 +1175,7 @@ hit_next:
goto search_again;
out:
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
@@ -889,7 +1184,7 @@ out:
search_again:
if (start > end)
goto out;
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
goto again;
@@ -927,7 +1222,7 @@ again:
return -ENOMEM;
}
- spin_lock(&tree->lock);
+ write_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
@@ -1076,7 +1371,7 @@ hit_next:
goto search_again;
out:
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (prealloc)
free_extent_state(prealloc);
@@ -1085,7 +1380,7 @@ out:
search_again:
if (start > end)
goto out;
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
goto again;
@@ -1135,6 +1430,14 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
NULL, mask);
}
+int set_extent_uptodate_ro(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached_state, gfp_t mask)
+{
+ WARN_ON(!(mask & GFP_ATOMIC));
+ return set_extent_bit_ro(tree, start, end, EXTENT_UPTODATE, 0,
+ NULL, cached_state, mask);
+}
+
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
@@ -1196,6 +1499,13 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
return 1;
}
+int unlock_extent_cached_ro(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit_ro(tree, start, end, EXTENT_LOCKED, 1, 0,
+ cached, mask);
+}
+
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask)
{
@@ -1272,14 +1582,14 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
}
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return ret;
}
@@ -1299,7 +1609,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
u64 found = 0;
u64 total_bytes = 0;
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
/*
* this search will find all the extents that end after
@@ -1339,7 +1649,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
break;
}
out:
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return found;
}
@@ -1602,7 +1912,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
return 0;
}
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
if (cur_start == 0 && bits == EXTENT_DIRTY) {
total_bytes = tree->dirty_bytes;
goto out;
@@ -1639,7 +1949,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
break;
}
out:
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return total_bytes;
}
@@ -1653,7 +1963,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
struct extent_state *state;
int ret = 0;
- spin_lock(&tree->lock);
+ write_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
@@ -1670,7 +1980,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
}
state->private = private;
out:
- spin_unlock(&tree->lock);
+ write_unlock(&tree->lock);
return ret;
}
@@ -1680,7 +1990,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
struct extent_state *state;
int ret = 0;
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
/*
* this search will find all the extents that end after
* our range starts.
@@ -1697,7 +2007,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
}
*private = state->private;
out:
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return ret;
}
@@ -1714,7 +2024,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct rb_node *node;
int bitset = 0;
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
if (cached && cached->tree && cached->start <= start &&
cached->end > start)
node = &cached->rb_node;
@@ -1753,7 +2063,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
break;
}
}
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
return bitset;
}
@@ -1950,11 +2260,11 @@ static int clean_io_failure(u64 start, struct page *page)
goto out;
}
- spin_lock(&BTRFS_I(inode)->io_tree.lock);
+ read_lock(&BTRFS_I(inode)->io_tree.lock);
state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
failrec->start,
EXTENT_LOCKED);
- spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+ read_unlock(&BTRFS_I(inode)->io_tree.lock);
if (state && state->start == failrec->start) {
map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
@@ -2088,12 +2398,12 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
}
if (!state) {
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
state = find_first_extent_bit_state(tree, failrec->start,
EXTENT_LOCKED);
if (state && state->start != failrec->start)
state = NULL;
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
}
/*
@@ -2245,6 +2555,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
struct extent_io_tree *tree;
u64 start;
u64 end;
+ u64 merge_start = (u64)-1;
+ unsigned long merge = 0;
int whole_page;
int ret;
@@ -2273,7 +2585,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (++bvec <= bvec_end)
prefetchw(&bvec->bv_page->flags);
- spin_lock(&tree->lock);
+ read_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
if (state && state->start == start) {
/*
@@ -2282,7 +2594,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
*/
cache_state(state, &cached);
}
- spin_unlock(&tree->lock);
+ read_unlock(&tree->lock);
if (uptodate && tree->ops &&
tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
@@ -2326,10 +2638,17 @@ error_handled:
}
if (uptodate) {
- set_extent_uptodate(tree, start, end, &cached,
- GFP_ATOMIC);
+ /* for readonly */
+ set_extent_uptodate_ro(tree, start, end, &cached,
+ GFP_ATOMIC);
+ merge |= unlock_extent_cached_ro(tree, start, end,
+ &cached, GFP_ATOMIC);
+ if (merge & EXTENT_MERGE)
+ merge_start = min(start, merge_start);
+ } else {
+ unlock_extent_cached(tree, start, end, &cached,
+ GFP_ATOMIC);
}
- unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
if (whole_page) {
if (uptodate) {
@@ -2350,6 +2669,9 @@ error_handled:
}
} while (bvec <= bvec_end);
+ if (merge & EXTENT_MERGE)
+ process_merge_state(tree, merge_start);
+
bio_put(bio);
}
@@ -2552,8 +2874,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
kunmap_atomic(userpage, KM_USER0);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached_ro(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
em = get_extent(inode, page, pg_offset, cur,
@@ -2602,8 +2924,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ unlock_extent_cached_ro(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c30..f878aae 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -19,6 +19,7 @@
#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_NEED_WAIT (1 << 13)
#define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_MERGE (1 << 15)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -97,7 +98,7 @@ struct extent_io_tree {
struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
- spinlock_t lock;
+ rwlock_t lock;
spinlock_t buffer_lock;
struct extent_io_ops *ops;
};
@@ -186,6 +187,8 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
+int unlock_extent_cached_ro(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
--
1.6.5.2