Josef Bacik
2011-Jun-21 15:28 UTC
[PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2
When doing DIO tracing I noticed we were doing a ton of allocations, a lot of
the time for extent_states. Some of the time we don't even use the prealloc'ed
extent_state, it just gets freed up. So instead create a per-cpu cache like
the radix tree stuff. We will check to see if our per-cpu cache has a
prealloc'ed extent_state in it and if so we just continue, else we alloc a new
one and fill the cache. Then if we need to use a prealloc'ed extent_state we
can just take it out of our per-cpu cache. We will also refill the cache on
free to try and limit the number of times we have to ask the allocator for
caches. With this patch dbench 50 goes from ~210 mb/s to ~260 mb/s. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
V1->V2: Fix an uneven preempt_disable() when we use a prealloc but still have
to search again.

 fs/btrfs/extent_io.c |  167 +++++++++++++++++++++++++++++++++++++------------
 1 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11..9adc614 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -12,6 +12,7 @@
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
 #include <linux/cleancache.h>
+#include <linux/cpu.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -31,6 +32,8 @@ static DEFINE_SPINLOCK(leak_lock);
 
 #define BUFFER_LRU_MAX 64
 
+static DEFINE_PER_CPU(struct extent_state *, extent_state_preloads) = NULL;
+
 struct tree_entry {
 	u64 start;
 	u64 end;
@@ -71,10 +74,36 @@ free_state_cache:
 	return -ENOMEM;
 }
 
+static void __free_extent_state(struct extent_state *state)
+{
+	if (!state)
+		return;
+	if (atomic_dec_and_test(&state->refs)) {
+#if LEAK_DEBUG
+		unsigned long flags;
+#endif
+		WARN_ON(state->tree);
+#if LEAK_DEBUG
+		spin_lock_irqsave(&leak_lock, flags);
+		list_del(&state->leak_list);
+		spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
 void extent_io_exit(void)
 {
 	struct extent_state *state;
 	struct extent_buffer *eb;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		state = per_cpu(extent_state_preloads, cpu);
+		if (!state)
+			continue;
+		per_cpu(extent_state_preloads, cpu) = NULL;
+		__free_extent_state(state);
+	}
 
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
@@ -114,16 +143,11 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 	tree->mapping = mapping;
 }
 
-static struct extent_state *alloc_extent_state(gfp_t mask)
+static void init_extent_state(struct extent_state *state)
 {
-	struct extent_state *state;
 #if LEAK_DEBUG
 	unsigned long flags;
 #endif
-
-	state = kmem_cache_alloc(extent_state_cache, mask);
-	if (!state)
-		return state;
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
@@ -134,6 +158,16 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 #endif
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
+}
+
+static struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state)
+		return state;
+	init_extent_state(state);
 	return state;
 }
 
@@ -142,6 +176,7 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
+		struct extent_state *tmp;
 #if LEAK_DEBUG
 		unsigned long flags;
 #endif
@@ -151,10 +186,53 @@ void free_extent_state(struct extent_state *state)
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
 #endif
-		kmem_cache_free(extent_state_cache, state);
+		preempt_disable();
+		tmp = __get_cpu_var(extent_state_preloads);
+		if (!tmp) {
+			init_extent_state(state);
+			__get_cpu_var(extent_state_preloads) = state;
+		} else {
+			kmem_cache_free(extent_state_cache, state);
+		}
+		preempt_enable();
 	}
 }
 
+/*
+ * Pre-load an extent state.
+ */
+static int extent_state_preload(gfp_t mask)
+{
+	struct extent_state *state;
+
+	preempt_disable();
+	state = __get_cpu_var(extent_state_preloads);
+	if (!state) {
+		struct extent_state *tmp;
+		preempt_enable();
+		tmp = alloc_extent_state(mask);
+		if (!tmp)
+			return -ENOMEM;
+		preempt_disable();
+		state = __get_cpu_var(extent_state_preloads);
+		if (state)
+			free_extent_state(tmp);
+		else
+			__get_cpu_var(extent_state_preloads) = tmp;
+	}
+
+	return 0;
+}
+
+/*
+ * Just re-enable preemption, this is here to make patch reviewing easier, to
+ * make sure we match up extent_state_preloads with the end() properly.
+ */
+static void extent_state_preload_end(void)
+{
+	preempt_enable();
+}
+
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
@@ -441,13 +519,24 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	return ret;
 }
 
-static struct extent_state *
-alloc_extent_state_atomic(struct extent_state *prealloc)
+static struct extent_state *get_prealloc_state(void)
 {
-	if (!prealloc)
-		prealloc = alloc_extent_state(GFP_ATOMIC);
+	struct extent_state *state;
 
-	return prealloc;
+	state = __get_cpu_var(extent_state_preloads);
+	__get_cpu_var(extent_state_preloads) = NULL;
+	return state;
+}
+
+static struct extent_state *alloc_extent_state_atomic(void)
+{
+	struct extent_state *state;
+
+	state = get_prealloc_state();
+	if (!state)
+		state = alloc_extent_state(GFP_ATOMIC);
+
+	return state;
 }
 
 /*
@@ -477,6 +566,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int err;
 	int set = 0;
 	int clear = 0;
+	bool preload = !(mask & __GFP_WAIT);
 
 	if (delete)
 		bits |= ~EXTENT_CTLBITS;
@@ -485,9 +575,8 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 		clear = 1;
 again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		if (!prealloc)
+	if (preload) {
+		if (extent_state_preload(mask))
 			return -ENOMEM;
 	}
 
@@ -540,11 +629,10 @@ hit_next:
 	 */
 
 	if (state->start < start) {
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
 		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
 		if (err)
 			goto out;
 		if (state->end <= end) {
@@ -562,7 +650,7 @@ hit_next:
 	 * on the first half
 	 */
 	if (state->start <= end && state->end > end) {
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
@@ -571,11 +659,10 @@ hit_next:
 
 		set |= clear_state_bit(tree, prealloc, &bits, wake);
-		prealloc = NULL;
 		goto out;
 	}
 
-	if (state->end < end && prealloc && !need_resched())
+	if (state->end < end && !need_resched())
 		next_node = rb_next(&state->rb_node);
 	else
 		next_node = NULL;
 
@@ -594,8 +681,8 @@ hit_next:
 
 out:
 	spin_unlock(&tree->lock);
-	if (prealloc)
-		free_extent_state(prealloc);
+	if (preload)
+		extent_state_preload_end();
 
 	return set;
 
@@ -603,8 +690,10 @@ search_again:
 	if (start > end)
 		goto out;
 	spin_unlock(&tree->lock);
-	if (mask & __GFP_WAIT)
+	if (preload) {
+		extent_state_preload_end();
 		cond_resched();
+	}
 	goto again;
 }
 
@@ -731,12 +820,13 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int err = 0;
 	u64 last_start;
 	u64 last_end;
+	bool preload = !(mask & __GFP_WAIT);
 
 	bits |= EXTENT_FIRST_DELALLOC;
again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		BUG_ON(!prealloc);
+	if (preload) {
+		if (extent_state_preload(mask))
+			return -ENOMEM;
 	}
 
 	spin_lock(&tree->lock);
@@ -753,10 +843,9 @@ again:
 	 */
 	node = tree_search(tree, start);
 	if (!node) {
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 		err = insert_state(tree, prealloc, start, end, &bits);
-		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
 		goto out;
 	}
@@ -790,7 +879,7 @@ hit_next:
 			goto out;
 
 		start = last_end + 1;
-		if (next_node && start < end && prealloc && !need_resched()) {
+		if (next_node && start < end && !need_resched()) {
 			state = rb_entry(next_node, struct extent_state,
 					 rb_node);
 			if (state->start == start)
@@ -822,11 +911,10 @@ hit_next:
 			goto out;
 		}
 
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
 		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
 		if (err)
 			goto out;
 		if (state->end <= end) {
@@ -855,7 +943,7 @@ hit_next:
 		else
 			this_end = last_start - 1;
 
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 
 		/*
@@ -868,12 +956,10 @@ hit_next:
 		BUG_ON(err == -EEXIST);
 		if (err) {
 			free_extent_state(prealloc);
-			prealloc = NULL;
 			goto out;
 		}
 		cache_state(prealloc, cached_state);
 		free_extent_state(prealloc);
-		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
 	}
@@ -890,19 +976,16 @@ hit_next:
 			goto out;
 		}
 
-		prealloc = alloc_extent_state_atomic(prealloc);
+		prealloc = alloc_extent_state_atomic();
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
 		err = set_state_bits(tree, prealloc, &bits);
-		if (err) {
-			prealloc = NULL;
+		if (err)
 			goto out;
-		}
 		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
-		prealloc = NULL;
 		goto out;
 	}
 
@@ -910,8 +993,8 @@ hit_next:
 
 out:
 	spin_unlock(&tree->lock);
-	if (prealloc)
-		free_extent_state(prealloc);
+	if (preload)
+		extent_state_preload_end();
 
 	return err;
 
@@ -919,6 +1002,8 @@ search_again:
 	if (start > end)
 		goto out;
 	spin_unlock(&tree->lock);
+	if (preload)
+		extent_state_preload_end();
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
-- 
1.7.5.2
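The "per-cpu cache like the radix tree stuff" mentioned in the changelog refers to the kernel's radix_tree_preload()/radix_tree_preload_end() pair, which the extent_state_preload()/extent_state_preload_end() helpers above imitate. A rough sketch of that existing pattern, for comparison (example_insert(), mytree and mylock are placeholder names, not anything from the btrfs code): preload while sleeping is still allowed, then insert under the spinlock without touching the allocator.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

/*
 * Illustrative only: stock the per-CPU radix tree preload pool while we may
 * still sleep, then insert under the lock without calling the allocator.
 */
static int example_insert(struct radix_tree_root *mytree, spinlock_t *mylock,
			  unsigned long index, void *item)
{
	int ret;

	/* Fills this CPU's preload pool; returns with preemption disabled. */
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;	/* on failure preemption is left enabled */

	spin_lock(mylock);
	/* radix_tree_insert() can now take nodes from the preload pool. */
	ret = radix_tree_insert(mytree, index, item);
	spin_unlock(mylock);

	/* Re-enables preemption; pairs with radix_tree_preload(). */
	radix_tree_preload_end();

	return ret;
}

extent_state_preload() in the patch follows the same shape: it returns with preemption disabled so the preloaded extent_state cannot migrate to another CPU before alloc_extent_state_atomic() picks it up, and extent_state_preload_end() re-enables preemption just like radix_tree_preload_end().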
Andi Kleen
2011-Jun-21 20:20 UTC
Re: [PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2
Josef Bacik <josef@redhat.com> writes:

> When doing DIO tracing I noticed we were doing a ton of allocations, a lot of
> the time for extent_states. Some of the time we don't even use the prealloc'ed
> extent_state, it just gets freed up. So instead create a per-cpu cache like
> the radix tree stuff. We will check to see if our per-cpu cache has a
> prealloc'ed extent_state in it and if so we just continue, else we alloc a new
> one and fill the cache. Then if we need to use a prealloc'ed extent_state we
> can just take it out of our per-cpu cache. We will also refill the cache on
> free to try and limit the number of times we have to ask the allocator for
> caches. With this patch dbench 50 goes from ~210 mb/s to ~260 mb/s. Thanks,

You're just reimplementing a poor man's custom slab cache -- all of this is already
done in slab.

If the difference is really that big, better fix slab and have everyone
benefit?

Did you use slub or slab?
Did you analyze where the cycles are spent?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only
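The "already done in slab" point is that a dedicated kmem_cache (btrfs already registers extent_state_cache for these objects) serves kmem_cache_alloc()/kmem_cache_free() from per-CPU structures on the SLUB/SLAB fast path, at least when slab debugging is off. A minimal sketch of that conventional setup, using placeholder my_state/my_state_cache names rather than the btrfs identifiers:

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/init.h>

/* Placeholder object type; stands in for struct extent_state. */
struct my_state {
	u64 start;
	u64 end;
};

static struct kmem_cache *my_state_cache;

static int __init my_state_cache_init(void)
{
	/*
	 * One dedicated slab cache per object type; the allocator maintains
	 * its own per-CPU stores of free objects behind the alloc/free calls.
	 */
	my_state_cache = kmem_cache_create("my_state_cache",
					   sizeof(struct my_state), 0,
					   SLAB_MEM_SPREAD, NULL);
	return my_state_cache ? 0 : -ENOMEM;
}

static struct my_state *my_state_alloc(gfp_t mask)
{
	return kmem_cache_alloc(my_state_cache, mask);
}

static void my_state_free(struct my_state *state)
{
	kmem_cache_free(my_state_cache, state);
}

With SLUB debug options enabled (poisoning, red zoning, allocation tracking), allocations bypass that fast path, which would make a hand-rolled front-end cache look far better than it does on a production configuration; that appears to be what Josef's reply below refers to.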
Josef Bacik
2011-Jun-21 21:25 UTC
Re: [PATCH] Btrfs: save preloaded extent_state's in a percpu cache V2
On 06/21/2011 04:20 PM, Andi Kleen wrote:
> Josef Bacik <josef@redhat.com> writes:
>
>> When doing DIO tracing I noticed we were doing a ton of allocations, a lot of
>> the time for extent_states. Some of the time we don't even use the prealloc'ed
>> extent_state, it just gets freed up. So instead create a per-cpu cache like
>> the radix tree stuff. We will check to see if our per-cpu cache has a
>> prealloc'ed extent_state in it and if so we just continue, else we alloc a new
>> one and fill the cache. Then if we need to use a prealloc'ed extent_state we
>> can just take it out of our per-cpu cache. We will also refill the cache on
>> free to try and limit the number of times we have to ask the allocator for
>> caches. With this patch dbench 50 goes from ~210 mb/s to ~260 mb/s. Thanks,
>
> You're just reimplementing a poor man's custom slab cache -- all of this is already
> done in slab.
>
> If the difference is really that big, better fix slab and have everyone
> benefit?
>
> Did you use slub or slab?
> Did you analyze where the cycles are spent?
>

Ugh, slub debugging bites me again. Thanks,

Josef