We already have fs_info->chunk_mutex to avoid concurrent chunk creation.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>

---
diff -urp 2/fs/btrfs/ctree.h 3/fs/btrfs/ctree.h
--- 2/fs/btrfs/ctree.h	2010-04-18 08:12:22.086699485 +0800
+++ 3/fs/btrfs/ctree.h	2010-04-18 08:13:15.457699211 +0800
@@ -700,9 +700,7 @@ struct btrfs_space_info {
 	struct list_head list;
 
 	/* for controlling how we free up space for allocations */
-	wait_queue_head_t allocate_wait;
 	wait_queue_head_t flush_wait;
-	int allocating_chunk;
 	int flushing;
 
 	/* for block groups in our same type */
diff -urp 2/fs/btrfs/extent-tree.c 3/fs/btrfs/extent-tree.c
--- 2/fs/btrfs/extent-tree.c	2010-04-18 08:12:22.092698714 +0800
+++ 3/fs/btrfs/extent-tree.c	2010-04-18 08:13:15.463699138 +0800
@@ -70,6 +70,9 @@ static int find_next_key(struct btrfs_pa
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_space_info *sinfo, u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2687,7 +2690,6 @@ static int update_space_info(struct btrf
 		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	init_waitqueue_head(&found->flush_wait);
-	init_waitqueue_head(&found->allocate_wait);
 	spin_lock_init(&found->lock);
 	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
 				BTRFS_BLOCK_GROUP_SYSTEM |
@@ -3000,71 +3002,6 @@ flush:
 	wake_up(&info->flush_wait);
 }
 
-static int maybe_allocate_chunk(struct btrfs_root *root,
-				struct btrfs_space_info *info)
-{
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-	struct btrfs_trans_handle *trans;
-	bool wait = false;
-	int ret = 0;
-	u64 min_metadata;
-	u64 free_space;
-
-	free_space = btrfs_super_total_bytes(disk_super);
-	/*
-	 * we allow the metadata to grow to a max of either 10gb or 5% of the
-	 * space in the volume.
-	 */
-	min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-			   div64_u64(free_space * 5, 100));
-	if (info->total_bytes >= min_metadata) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (info->full) {
-		spin_unlock(&info->lock);
-		return 0;
-	}
-
-	if (!info->allocating_chunk) {
-		info->force_alloc = 1;
-		info->allocating_chunk = 1;
-	} else {
-		wait = true;
-	}
-
-	spin_unlock(&info->lock);
-
-	if (wait) {
-		wait_event(info->allocate_wait,
-			   !info->allocating_chunk);
-		return 1;
-	}
-
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     4096 + 2 * 1024 * 1024,
-			     info->flags, 0);
-	btrfs_end_transaction(trans, root);
-	if (ret)
-		goto out;
-out:
-	spin_lock(&info->lock);
-	info->allocating_chunk = 0;
-	spin_unlock(&info->lock);
-	wake_up(&info->allocate_wait);
-
-	if (ret)
-		return 0;
-	return 1;
-}
-
 /*
  * Reserve metadata space for delalloc.
  */
@@ -3105,7 +3042,8 @@ again:
 	flushed++;
 
 	if (flushed == 1) {
-		if (maybe_allocate_chunk(root, meta_sinfo))
+		if (maybe_allocate_chunk(NULL, root, meta_sinfo,
+					 num_bytes))
 			goto again;
 		flushed++;
 	} else {
@@ -3220,7 +3158,8 @@ again:
 	if (used > meta_sinfo->total_bytes) {
 		retries++;
 		if (retries == 1) {
-			if (maybe_allocate_chunk(root, meta_sinfo))
+			if (maybe_allocate_chunk(NULL, root, meta_sinfo,
+						 num_bytes))
 				goto again;
 			retries++;
 		} else {
@@ -3417,13 +3356,28 @@ static void force_metadata_allocation(st
 	rcu_read_unlock();
 }
 
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+			      u64 alloc_bytes)
+{
+	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved +
+	    alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+		return 0;
+
+	if (sinfo->bytes_used + sinfo->bytes_reserved +
+	    alloc_bytes < div_factor(num_bytes, 8))
+		return 0;
+
+	return 1;
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
-	u64 thresh;
 	int ret = 0;
 
 	mutex_lock(&fs_info->chunk_mutex);
@@ -3446,11 +3400,7 @@ static int do_chunk_alloc(struct btrfs_t
 		goto out;
 	}
 
-	thresh = space_info->total_bytes - space_info->bytes_readonly;
-	thresh = div_factor(thresh, 8);
-	if (!force &&
-	    (space_info->bytes_used + space_info->bytes_pinned +
-	     space_info->bytes_reserved + alloc_bytes) < thresh) {
+	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
@@ -3472,6 +3422,8 @@ static int do_chunk_alloc(struct btrfs_t
 	spin_lock(&space_info->lock);
 	if (ret)
 		space_info->full = 1;
+	else
+		ret = 1;
 	space_info->force_alloc = 0;
 	spin_unlock(&space_info->lock);
 out:
@@ -3479,6 +3431,38 @@ out:
 	return ret;
 }
 
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+	int ret;
+	int end_trans = 0;
+
+	if (sinfo->full)
+		return 0;
+
+	spin_lock(&sinfo->lock);
+	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+	spin_unlock(&sinfo->lock);
+	if (!ret)
+		return 0;
+
+	if (!trans) {
+		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
+		end_trans = 1;
+	}
+
+	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+			     num_bytes + 2 * 1024 * 1024,
+			     get_alloc_profile(root, sinfo->flags), 0);
+
+	if (end_trans)
+		btrfs_end_transaction(trans, root);
+
+	return ret == 1 ? 1 : 0;
+}
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
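To get a feel for the new heuristic, here is a minimal userspace sketch of
the should_alloc_chunk() logic: a chunk is allocated only when less than
256MB would remain free after the request AND the space info is at least
80% committed.  The div_factor() helper below assumes the kernel's
num * factor / 10 semantics, and the sample numbers are made up for
illustration; this is a sketch, not the kernel code itself.

#include <stdio.h>
#include <stdint.h>

/* assumed to mirror the kernel helper: num * factor / 10 */
static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

/* returns 1 when a new chunk should be allocated */
static int should_alloc_chunk(uint64_t total_bytes, uint64_t bytes_readonly,
			      uint64_t bytes_used, uint64_t bytes_reserved,
			      uint64_t alloc_bytes)
{
	uint64_t num_bytes = total_bytes - bytes_readonly;

	/* more than 256MB would still be free after this allocation */
	if (bytes_used + bytes_reserved + alloc_bytes +
	    256 * 1024 * 1024 < num_bytes)
		return 0;

	/* the space info is less than 80% committed */
	if (bytes_used + bytes_reserved + alloc_bytes <
	    div_factor(num_bytes, 8))
		return 0;

	return 1;
}

int main(void)
{
	const uint64_t MB = 1024 * 1024;

	/* 1GB total, 100MB used: plenty of headroom -> prints 0 */
	printf("%d\n", should_alloc_chunk(1024 * MB, 0, 100 * MB, 0, 4 * MB));
	/* 1GB total, 900MB used: tight and over 80% committed -> prints 1 */
	printf("%d\n", should_alloc_chunk(1024 * MB, 0, 900 * MB, 0, 4 * MB));
	return 0;
}

Both checks have to fail for the allocation to go ahead, which is why the
first call above declines and the second allocates.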
Josef Bacik
2010-Apr-19 13:57 UTC
Re: [PATCH 02/12] Btrfs: Kill allocate_wait in space_info
On Mon, Apr 19, 2010 at 06:45:44PM +0800, Yan, Zheng wrote:
> We already have fs_info->chunk_mutex to avoid concurrent
> chunk creation.
>
> Signed-off-by: Yan Zheng <zheng.yan@oracle.com>

[patch quote trimmed to the hunk under discussion]

> +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
> +				struct btrfs_root *root,
> +				struct btrfs_space_info *sinfo, u64 num_bytes)
> +{
> +	int ret;
> +	int end_trans = 0;
> +
> +	if (sinfo->full)
> +		return 0;
> +

maybe_allocate_chunk is called with the info->lock already held, this will
deadlock.

> +	spin_lock(&sinfo->lock);
> +	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
> +	spin_unlock(&sinfo->lock);
> +	if (!ret)
> +		return 0;
> +
> +	if (!trans) {
> +		trans = btrfs_join_transaction(root, 1);
> +		BUG_ON(IS_ERR(trans));
> +		end_trans = 1;
> +	}
> +
> +	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
> +			     num_bytes + 2 * 1024 * 1024,
> +			     get_alloc_profile(root, sinfo->flags), 0);
> +
> +	if (end_trans)
> +		btrfs_end_transaction(trans, root);
> +
> +	return ret == 1 ? 1 : 0;
> +}

The purpose of maybe_allocate_chunk was that there is no way to know if some
other CPU is currently trying to allocate a chunk for the given space info.
We could have two CPUs come into do_chunk_alloc at relatively the same time
and end up allocating twice the amount of space, which is why I did the
waitqueue thing.  It seems like this is still a possibility with your patch.

Thanks,

Josef
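Josef's deadlock point is easy to reproduce in userspace: a non-recursive
lock acquired again by code running under that same lock never makes
progress.  A minimal sketch with a pthread mutex standing in for the kernel
spinlock follows; the names and structure are illustrative, not btrfs code.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t lock;

/* plays the role of the patch's maybe_allocate_chunk() */
static void maybe_allocate_chunk(void)
{
	/* corresponds to spin_lock(&sinfo->lock) in the new code */
	int err = pthread_mutex_lock(&lock);
	if (err)
		printf("second acquire failed: %s\n", strerror(err));
	else
		pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	/* ERRORCHECK makes the self-deadlock report EDEADLK instead of hanging */
	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
	pthread_mutex_init(&lock, &attr);

	pthread_mutex_lock(&lock);	/* caller already holds info->lock... */
	maybe_allocate_chunk();		/* ...and the callee takes it again */
	pthread_mutex_unlock(&lock);
	return 0;
}

With the default mutex type the second acquire would simply hang, which is
the userspace analogue of what a kernel spinlock does here: the CPU spins
forever waiting for a lock it holds itself.

Thanks,

Josef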
Yan, Zheng
2010-Apr-19 14:46 UTC
Re: [PATCH 02/12] Btrfs: Kill allocate_wait in space_info
On Mon, Apr 19, 2010 at 9:57 PM, Josef Bacik <josef@redhat.com> wrote:
> The purpose of maybe_allocate_chunk was that there is no way to know if some
> other CPU is currently trying to allocate a chunk for the given space info.
> We could have two CPUs come into do_chunk_alloc at relatively the same time
> and end up allocating twice the amount of space, which is why I did the
> waitqueue thing.  It seems like this is still a possibility with your patch.

This is impossible because the very first thing do_chunk_alloc does is
lock the chunk_mutex.

Yan, Zheng
Josef Bacik
2010-Apr-19 14:48 UTC
Re: [PATCH 02/12] Btrfs: Kill allocate_wait in space_info
On Mon, Apr 19, 2010 at 10:46:12PM +0800, Yan, Zheng wrote:
> On Mon, Apr 19, 2010 at 9:57 PM, Josef Bacik <josef@redhat.com> wrote:
> > The purpose of maybe_allocate_chunk was that there is no way to know if
> > some other CPU is currently trying to allocate a chunk for the given
> > space info.  We could have two CPUs come into do_chunk_alloc at
> > relatively the same time and end up allocating twice the amount of
> > space, which is why I did the waitqueue thing.  It seems like this is
> > still a possibility with your patch.
>
> This is impossible because the very first thing do_chunk_alloc does is
> lock the chunk_mutex.

Sure, that just means we don't get two things creating chunks at the same
time; it doesn't stop them from creating chunks one right after another.
So CPUs 0 and 1 come into the free-space check, both realize they need to
allocate a chunk, and race to call do_chunk_alloc.  One of them wins, and
the other blocks on the chunk_mutex lock.  When the first finishes, the
other one is able to continue and do what it was originally going to do,
and then you get two chunks when you really only wanted one.

Thanks,

Josef
Yan, Zheng
2010-Apr-19 15:34 UTC
Re: [PATCH 02/12] Btrfs: Kill allocate_wait in space_info
On Mon, Apr 19, 2010 at 10:48 PM, Josef Bacik <josef@redhat.com> wrote:
> Sure, that just means we don't get two things creating chunks at the same
> time; it doesn't stop them from creating chunks one right after another.
> So CPUs 0 and 1 come into the free-space check, both realize they need to
> allocate a chunk, and race to call do_chunk_alloc.  One of them wins, and
> the other blocks on the chunk_mutex lock.  When the first finishes, the
> other one is able to continue and do what it was originally going to do,
> and then you get two chunks when you really only wanted one.

There is a check in do_chunk_alloc, so the later caller will do nothing if
the first call added enough space.

Yan Zheng
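The pattern Yan describes, re-checking the threshold after taking
chunk_mutex so the loser of the race backs off, can be shown with a small
userspace sketch.  The names and numbers are simplified stand-ins for the
kernel structures, not btrfs code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chunk_mutex = PTHREAD_MUTEX_INITIALIZER;

/* made-up numbers, in MB: 900MB of space, 800MB already committed */
static unsigned long long total_bytes = 900;
static unsigned long long used_bytes = 800;

/* stand-in for should_alloc_chunk(): under 256MB of headroom left */
static int should_alloc_chunk(void)
{
	return used_bytes + 256 > total_bytes;
}

static void *do_chunk_alloc(void *arg)
{
	long id = (long)arg;

	pthread_mutex_lock(&chunk_mutex);
	/*
	 * The threshold is re-evaluated under chunk_mutex: if the winner
	 * of the race already added space, the loser sees the new total
	 * and backs off instead of allocating a second chunk.
	 */
	if (should_alloc_chunk()) {
		total_bytes += 1024;	/* "allocate" a 1GB chunk */
		printf("thread %ld allocated a chunk\n", id);
	} else {
		printf("thread %ld: enough space now, doing nothing\n", id);
	}
	pthread_mutex_unlock(&chunk_mutex);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	/* both threads decide "we need a chunk" at the same time */
	pthread_create(&t1, NULL, do_chunk_alloc, (void *)1L);
	pthread_create(&t2, NULL, do_chunk_alloc, (void *)2L);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

Whichever thread wins the mutex allocates; the other re-evaluates the
threshold against the new total and skips, so the race Josef describes
costs at most one redundant check, not a redundant chunk.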