This patchset fixes some problems with the ENOSPC code for block groups, but more importantly it fixes a huge ENOSPC regression that''s occured. With my fs_mark test fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0 on a 2gb fs without these patches fs_mark would exit out with ENOSPC after writing around 50mb. With these patches I can now fill up the disk. Also the new ENOSPC code is super aggressive about allocating metadata chunks, to the point that even with the multi-writer regression fixed I was still only able to fill about 900mb with data on a 2gb fs. With all of these patches I can fill up the 2gb fs with about 1.9gb of data. This is much more reasonable. There doesn''t appear to be any performance regression, but I would appreciate testing to make sure this is actually the case. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2010-Oct-15 21:28 UTC
[PATCH 1/4] Btrfs: fix reservation code for mixed block groups
The global reservation stuff tries to add together DATA and METADATA used in order to figure out how much to reserve for everything, but this doesn''t work right for mixed block groups. Instead if we have mixed block groups just set data used to 0. Also with mixed block groups we will use bytes_may_use for keeping track of delalloc bytes, so we need to take that into account in our reservation calculations. Signed-off-by: Josef Bacik <josef@redhat.com> --- fs/btrfs/extent-tree.c | 8 ++++++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72c3d5f..d532f00 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3444,7 +3444,8 @@ static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, spin_lock(&space_info->lock); unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly; + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; if (unused < space_info->total_bytes) unused = space_info->total_bytes - unused; @@ -3738,6 +3739,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; meta_used = sinfo->bytes_used; spin_unlock(&sinfo->lock); @@ -3765,7 +3768,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = num_bytes; num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly; + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Currently we try and flush delalloc, but we only do that in a sort of weak way, which works fine in most cases but if we''re under heavy pressure we need to be able to wait for flushing to happen. Also instead of checking the bytes reserved in the block_rsv, check the space info since it is more accurate. The sync option will be used in a future patch. Signed-off-by: Josef Bacik <josef@redhat.com> --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 26 ++++++++++++++------------ fs/btrfs/inode.c | 8 ++++++-- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4833a01..72f5e1a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2439,7 +2439,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d532f00..14a52dd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3342,9 +3342,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim, int sync) { struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3353,9 +3354,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int ret; block_rsv = &root->fs_info->delalloc_block_rsv; - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + space_info = block_rsv->space_info; + spin_lock(&space_info->lock); + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0) return 0; @@ -3363,7 +3365,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); while (1) { - ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync); if (!ret) { if (no_reclaim > 2) break; @@ -3378,11 +3380,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, pause = 1; } - spin_lock(&block_rsv->lock); - if (reserved > block_rsv->reserved) - reclaimed = reserved - block_rsv->reserved; - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed = reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; @@ -3411,7 +3413,7 @@ static int should_retry_reserve(struct btrfs_trans_handle *trans, if (trans && trans->transaction->in_commit) return -ENOSPC; - ret = shrink_delalloc(trans, root, num_bytes); + ret = shrink_delalloc(trans, root, num_bytes, 0); if (ret) return ret; @@ -3960,7 +3962,7 @@ again: block_rsv_add_bytes(block_rsv, to_reserve, 1); if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve); + shrink_delalloc(NULL, root, to_reserve, 0); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d17518..8b678dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6683,7 +6683,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync) { struct btrfs_inode *binode; struct inode *inode = NULL; @@ -6705,7 +6706,10 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - write_inode_now(inode, 0); + if (sync) + filemap_write_and_wait(inode->i_mapping); + else + filemap_flush(inode->i_mapping); if (delay_iput) btrfs_add_delayed_iput(inode); else -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2010-Oct-15 21:28 UTC
[PATCH 3/4] Btrfs: don''t allocate chunks as aggressively
Because the ENOSPC code over reserves super aggressively we end up allocating chunks way more often than we should. For example with my fs_mark tests on a 2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is being used. So instead check to see if the amount of space actually used is less than 30% of the total space, and if so don''t allocate a chunk. Signed-off-by: Josef Bacik <josef@redhat.com> --- fs/btrfs/extent-tree.c | 11 ++++++++--- 1 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 14a52dd..265d8e0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3224,7 +3224,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static int should_alloc_chunk(struct btrfs_space_info *sinfo, +static int should_alloc_chunk(struct btrfs_fs_info *info, + struct btrfs_space_info *sinfo, u64 alloc_bytes) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3237,6 +3238,9 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, alloc_bytes < div_factor(num_bytes, 8)) return 0; + if (sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; } @@ -3268,7 +3272,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { + if (!force && !should_alloc_chunk(fs_info, space_info, alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -3317,7 +3321,8 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, return 0; spin_lock(&sinfo->lock); - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); + ret = should_alloc_chunk(root->fs_info, sinfo, + num_bytes + 2 * 1024 * 1024); spin_unlock(&sinfo->lock); if (!ret) return 0; -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
With multi-threaded writes we were getting ENOSPC early because somebody would come in, start flushing delalloc because they couldn''t make their reservation, and in the meantime other threads would come in and use the space that was getting freed up, so when the original thread went to check to see if they had space they didn''t and they''d return ENOSPC. So instead if we have some free space but not enough for our reservation, take the reservation and then start doing the flushing. The only time we don''t take reservations is when we''ve already overcommitted our space, that way we don''t have people who come late to the party way overcommitting ourselves. This also moves all of the retrying and flushing code into reserve_metdata_bytes so it''s all uniform. This keeps my fs_mark test from returning -ENOSPC as soon as it starts and actually lets me fill up the disk. Thanks, Signed-off-by: Josef Bacik <josef@redhat.com> --- fs/btrfs/ctree.h | 4 +- fs/btrfs/extent-tree.c | 230 ++++++++++++++++++++++++++++++++---------------- fs/btrfs/relocation.c | 14 +--- fs/btrfs/transaction.c | 7 +- 4 files changed, 160 insertions(+), 95 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 72f5e1a..9e923c1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2144,7 +2144,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries); + int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2165,7 +2165,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries); + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 265d8e0..13f723a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3400,79 +3400,162 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -static int should_retry_reserve(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) +/* + * Retries tells us how many times we''ve called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can''t make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don''t try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don''t intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. + */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; - int ret; - - if ((*retries) > 2) - return -ENOSPC; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; - ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); - if (ret) - return 1; +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; - if (trans && trans->transaction->in_commit) - return -ENOSPC; + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; - ret = shrink_delalloc(trans, root, num_bytes, 0); - if (ret) - return ret; + /* + * The idea here is that we''ve not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we''ve already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused -= space_info->total_bytes; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + reserved = true; + ret = 0; + } else if (!reserved) { + /* + * Don''t have enough space, reserve what we need and try + * to flush. + */ + space_info->bytes_reserved += orig_bytes; + reserved = true; + } + } else { + /* + * Ok we''re over committed, set num_bytes to some huge number so + * that the flushing stuff will give us plenty of slack space. + */ + num_bytes = orig_bytes * 2; + } - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < num_bytes) - ret = 1; spin_unlock(&space_info->lock); - if (ret) - return -ENOSPC; - (*retries)++; + if (!ret) + return 0; - if (trans) - return -EAGAIN; + /* We haven''t retried yet, try and allocate a chunk. */ + if (retries == 0) { + retries++; + ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); + if (ret) { + if (reserved) + return 0; + goto again; + } + } - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + /* + * So for the first round of retries (retries == 1) we don''t want to be + * synchronously writing out inodes. For retries == 2 we want to do + * sync, and if that still doesn''t work out move on. + */ + if (retries < 3) { + /* + * We do this because sometimes we get trans from + * start_trans_handle, which will deal with shrink_delalloc + * returning -EAGAIN. HOWEVER, there are cases that don''t deal + * with -EAGAIN, so we''ll get things erroring out that really + * shouldn''t have. So instead just force us to use synchronous + * delalloc flushing if we have a trans to give us the best + * possible chance of actually reclaiming space. + */ + if (trans) + retries = 2; - return 1; -} + ret = shrink_delalloc(trans, root, num_bytes, retries == 2); + if (ret > 0) { + /* + * Ok we reclaimed enough space and already reserved, + * we''re good to go. + */ + if (reserved) + return 0; + /* + * We got enough space but didn''t actually reserve + * already, which means we''re quite low on space, so try + * and reserve again and if it fails lets just skip to + * doing synchronous writeout. + */ + retries = 2; + goto again; + } else if (ret < 0) { + goto out; + } -static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; - int ret = -ENOSPC; + /* + * Ok shrink_delalloc didn''t reclaim enough, but we may not have + * done a synchronous writeout either, so bump retries and try + * again. + */ + retries++; + goto again; + } spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + /* + * Not enough space to be reclaimed, don''t bother committing the + * transaction. + */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; - if (unused < space_info->total_bytes) - unused = space_info->total_bytes - unused; - else - unused = 0; + ret = -EAGAIN; + if (trans) + goto out; - if (unused >= num_bytes) { - if (block_rsv->priority >= 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } else { - if ((unused + block_rsv->reserved) * - block_rsv->priority >- (num_bytes + block_rsv->reserved) * 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } - } + + ret = -ENOSPC; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) + goto again; + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); } - spin_unlock(&space_info->lock); return ret; } @@ -3616,23 +3699,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) + u64 num_bytes) { int ret; if (num_bytes == 0) return 0; -again: - ret = reserve_metadata_bytes(block_rsv, num_bytes); + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; } - ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); - if (ret > 0) - goto again; - return ret; } @@ -3667,7 +3746,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(block_rsv, num_bytes); + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3847,7 +3927,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries) + int num_items) { u64 num_bytes; int ret; @@ -3857,7 +3937,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes, retries); + num_bytes); if (!ret) { trans->bytes_reserved += num_bytes; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3931,14 +4011,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; - int retries = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); -again: + spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; if (nr_extents > BTRFS_I(inode)->reserved_extents) { @@ -3948,18 +4027,14 @@ again: nr_extents = 0; to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(block_rsv, to_reserve); - if (ret) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, - &retries); - if (ret > 0) - goto again; + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve); + if (ret) return ret; - } + spin_lock(&BTRFS_I(inode)->accounting_lock); BTRFS_I(inode)->reserved_extents += nr_extents; atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_unlock(&BTRFS_I(inode)->accounting_lock); @@ -5586,7 +5661,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(block_rsv, blocksize); + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize); if (ret) return ERR_PTR(ret); return block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af339ee..fd07144 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -179,8 +179,6 @@ struct reloc_control { u64 search_start; u64 extents_found; - int block_rsv_retries; - unsigned int stage:8; unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; @@ -2134,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err) LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - int retries = 0; mutex_lock(&root->fs_info->trans_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; @@ -2144,7 +2141,7 @@ again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes, &retries); + num_bytes); if (ret) err = ret; } @@ -2156,7 +2153,6 @@ again: btrfs_end_transaction(trans, rc->extent_root); btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); - retries = 0; goto again; } } @@ -2406,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, - &rc->block_rsv_retries); + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; return ret; } - rc->block_rsv_retries = 0; return 0; } @@ -3615,8 +3609,7 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - &rc->block_rsv_retries); + rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3628,7 +3621,6 @@ int prepare_to_relocate(struct reloc_control *rc) rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; - rc->block_rsv_retries = 0; rc->create_reloc_tree = 1; set_reloc_control(rc); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index db0da35..0af647c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -180,7 +180,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; int ret; again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -215,8 +214,7 @@ again: } if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items, - &retries); + ret = btrfs_trans_reserve_metadata(h, root, num_items); if (ret == -EAGAIN) { btrfs_commit_transaction(h, root); goto again; @@ -855,7 +853,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; int ret; - int retries = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -877,7 +874,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve, &retries); + to_reserve); if (ret) { pending->error = ret; goto fail; -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Oct 15, 2010 at 05:28:31PM -0400, Josef Bacik wrote:> This patchset fixes some problems with the ENOSPC code for block groups, but > more importantly it fixes a huge ENOSPC regression that''s occured. With my > fs_mark test > > fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0 > > on a 2gb fs without these patches fs_mark would exit out with ENOSPC after > writing around 50mb.CAre to add the test to xfstests? We already have a _scratch_mkfs_sized helper to create filesystems of a specific size for ENOSPC tests. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Oct 15, 2010 at 05:35:58PM -0400, Christoph Hellwig wrote:> On Fri, Oct 15, 2010 at 05:28:31PM -0400, Josef Bacik wrote: > > This patchset fixes some problems with the ENOSPC code for block groups, but > > more importantly it fixes a huge ENOSPC regression that''s occured. With my > > fs_mark test > > > > fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0 > > > > on a 2gb fs without these patches fs_mark would exit out with ENOSPC after > > writing around 50mb. > > CAre to add the test to xfstests? We already have a _scratch_mkfs_sized > helper to create filesystems of a specific size for ENOSPC tests.Yup I will do that first thing on Monday. Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2010-Oct-18 21:13 UTC
Re: [PATCH 3/4] Btrfs: don''t allocate chunks as aggressively
On Fri, Oct 15, 2010 at 05:28:34PM -0400, Josef Bacik wrote:> Because the ENOSPC code over reserves super aggressively we end up allocating > chunks way more often than we should. For example with my fs_mark tests on a > 2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is > being used. So instead check to see if the amount of space actually used is > less than 30% of the total space, and if so don''t allocate a chunk. > > Signed-off-by: Josef Bacik <josef@redhat.com> > --- > fs/btrfs/extent-tree.c | 11 ++++++++--- > 1 files changed, 8 insertions(+), 3 deletions(-) > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c > index 14a52dd..265d8e0 100644 > --- a/fs/btrfs/extent-tree.c > +++ b/fs/btrfs/extent-tree.c > @@ -3224,7 +3224,8 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) > rcu_read_unlock(); > } > > -static int should_alloc_chunk(struct btrfs_space_info *sinfo, > +static int should_alloc_chunk(struct btrfs_fs_info *info, > + struct btrfs_space_info *sinfo, > u64 alloc_bytes) > { > u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; > @@ -3237,6 +3238,9 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, > alloc_bytes < div_factor(num_bytes, 8)) > return 0; > > + if (sinfo->bytes_used < div_factor(num_bytes, 3)) > + return 0; > + > return 1; > } > > @@ -3268,7 +3272,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, > goto out; > } > > - if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { > + if (!force && !should_alloc_chunk(fs_info, space_info, alloc_bytes)) { > spin_unlock(&space_info->lock); > goto out; > } > @@ -3317,7 +3321,8 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, > return 0; > > spin_lock(&sinfo->lock); > - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); > + ret = should_alloc_chunk(root->fs_info, sinfo, > + num_bytes + 2 * 1024 * 1024); > spin_unlock(&sinfo->lock); > if (!ret) > return 0; > -- > 1.6.6.1 >Self-NAK on this one, it seems to cause a few problems with -m single and smaller fs''s, so just drop it. It only creates too much overhead on really small fs''s anyway, and if you no likey that overhead, use mixed block groups :). Thanks, Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Josef Bacik
2010-Oct-19 01:02 UTC
[PATCH] Btrfs: don''t allocate chunks as aggressively V2
Because the ENOSPC code over reserves super aggressively we end up allocating chunks way more often than we should. For example with my fs_mark tests on a 2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is being used. So instead check to see if the amount of space actually used is less than 30% of the total space, and if so don''t allocate a chunk, but only if we have at least 256mb of free space to make sure we don''t put too much pressure on free space. Signed-off-by: Josef Bacik <josef@redhat.com> --- V1-V2: Cleanup should_alloc_chunk so it doesnt take fs_info, I was using it before for something different, forgot to clean it up. Also added 256mb free floor at Chris''s suggestion to help with the -m single case. fs/btrfs/extent-tree.c | 7 +++++-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 14a52dd..eac11b1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3224,8 +3224,7 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static int should_alloc_chunk(struct btrfs_space_info *sinfo, - u64 alloc_bytes) +static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; @@ -3237,6 +3236,10 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, alloc_bytes < div_factor(num_bytes, 8)) return 0; + if (num_bytes > 256 * 1024 * 1024 && + sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; } -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Currently we try and flush delalloc, but we only do that in a sort of weak way, which works fine in most cases but if we''re under heavy pressure we need to be able to wait for flushing to happen. Also instead of checking the bytes reserved in the block_rsv, check the space info since it is more accurate. The sync option will be used in a future patch. Signed-off-by: Josef Bacik <josef@redhat.com> --- V1->V2: fix how we counted reclaimed, and do btrfs_wait_ordered_range if we''re syncing the file since compression does weird things with writeback. fs/btrfs/ctree.h | 3 ++- fs/btrfs/extent-tree.c | 26 ++++++++++++++------------ fs/btrfs/inode.c | 24 ++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4833a01..72f5e1a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2439,7 +2439,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 080be22..e25525f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3342,9 +3342,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim, int sync) { struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; @@ -3353,9 +3354,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, int ret; block_rsv = &root->fs_info->delalloc_block_rsv; - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + space_info = block_rsv->space_info; + spin_lock(&space_info->lock); + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0) return 0; @@ -3363,7 +3365,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); while (1) { - ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync); if (!ret) { if (no_reclaim > 2) break; @@ -3378,11 +3380,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, pause = 1; } - spin_lock(&block_rsv->lock); - if (reserved > block_rsv->reserved) - reclaimed = reserved - block_rsv->reserved; - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; @@ -3411,7 +3413,7 @@ static int should_retry_reserve(struct btrfs_trans_handle *trans, if (trans && trans->transaction->in_commit) return -ENOSPC; - ret = shrink_delalloc(trans, root, num_bytes); + ret = shrink_delalloc(trans, root, num_bytes, 0); if (ret) return ret; @@ -3960,7 +3962,7 @@ again: block_rsv_add_bytes(block_rsv, to_reserve, 1); if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve); + shrink_delalloc(NULL, root, to_reserve, 0); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d17518..ce78cf9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6683,7 +6683,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync) { struct btrfs_inode *binode; struct inode *inode = NULL; @@ -6705,7 +6706,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - write_inode_now(inode, 0); + if (sync) { + filemap_write_and_wait(inode->i_mapping); + /* + * We have to do this because compression doesn''t + * actually set PG_writeback until it submits the pages + * for IO, which happens in an async thread, so we could + * race and not actually wait for any writeback pages + * because they''ve not been submitted yet. Technically + * this could still be the case for the ordered stuff + * since the async thread may not have started to do its + * work yet. If this becomes the case then we need to + * figure out a way to make sure that in writepage we + * wait for any async pages to be submitted before + * returning so that fdatawait does what its supposed to + * do. + */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + } if (delay_iput) btrfs_add_delayed_iput(inode); else -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
With multi-threaded writes we were getting ENOSPC early because somebody would come in, start flushing delalloc because they couldn''t make their reservation, and in the meantime other threads would come in and use the space that was getting freed up, so when the original thread went to check to see if they had space they didn''t and they''d return ENOSPC. So instead if we have some free space but not enough for our reservation, take the reservation and then start doing the flushing. The only time we don''t take reservations is when we''ve already overcommitted our space, that way we don''t have people who come late to the party way overcommitting ourselves. This also moves all of the retrying and flushing code into reserve_metdata_bytes so it''s all uniform. This keeps my fs_mark test from returning -ENOSPC as soon as it starts and actually lets me fill up the disk. Thanks, Signed-off-by: Josef Bacik <josef@redhat.com> --- V1->V2: Don''t allocate chunks, it takes more metadata and can cause problems -only flush if we say so, this keeps us from deadlocking with the tree lock and such -hold our reservation regardless if we are overcommitted or not, just adjust how much we need to reclaim to succeed. -always sync write out delalloc, we cant reclaim unless the io completes anyway fs/btrfs/ctree.h | 4 +- fs/btrfs/extent-tree.c | 238 ++++++++++++++++++++++++++---------------------- fs/btrfs/relocation.c | 14 +-- fs/btrfs/transaction.c | 7 +- 4 files changed, 136 insertions(+), 127 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 72f5e1a..9e923c1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2144,7 +2144,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries); + int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2165,7 +2165,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries); + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1bb4bca..6942239 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3309,38 +3309,6 @@ out: return ret; } -static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 num_bytes) -{ - int ret; - int end_trans = 0; - - if (sinfo->full) - return 0; - - spin_lock(&sinfo->lock); - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); - spin_unlock(&sinfo->lock); - if (!ret) - return 0; - - if (!trans) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - end_trans = 1; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, - get_alloc_profile(root, sinfo->flags), 0); - - if (end_trans) - btrfs_end_transaction(trans, root); - - return ret == 1 ? 1 : 0; -} - /* * shrink metadata reservation for delalloc */ @@ -3398,79 +3366,138 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return reclaimed >= to_reclaim; } -static int should_retry_reserve(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) +/* + * Retries tells us how many times we''ve called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can''t make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don''t try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don''t intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. + */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - int ret; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; - if ((*retries) > 2) - return -ENOSPC; +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; - ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); - if (ret) - return 1; + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; - if (trans && trans->transaction->in_commit) - return -ENOSPC; + /* + * The idea here is that we''ve not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we''ve already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused -= space_info->total_bytes; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren''t + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we''re over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } - ret = shrink_delalloc(trans, root, num_bytes, 0); - if (ret) - return ret; + /* + * Couldn''t make our reservation, save our place so while we''re trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < num_bytes) - ret = 1; spin_unlock(&space_info->lock); - if (ret) - return -ENOSPC; - (*retries)++; - - if (trans) - return -EAGAIN; + if (!ret) + return 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (!flush) + goto out; - return 1; -} + /* + * We do synchronous shrinking since we don''t actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; -static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; - int ret = -ENOSPC; + /* + * So if we were overcommitted it''s possible that somebody else flushed + * out enough space and we simply didn''t have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + /* + * Not enough space to be reclaimed, don''t bother committing the + * transaction. + */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; - if (unused < space_info->total_bytes) - unused = space_info->total_bytes - unused; - else - unused = 0; + ret = -EAGAIN; + if (trans) + goto out; - if (unused >= num_bytes) { - if (block_rsv->priority >= 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } else { - if ((unused + block_rsv->reserved) * - block_rsv->priority >- (num_bytes + block_rsv->reserved) * 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } - } + + ret = -ENOSPC; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) + goto again; + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); } - spin_unlock(&space_info->lock); return ret; } @@ -3614,23 +3641,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) + u64 num_bytes) { int ret; if (num_bytes == 0) return 0; -again: - ret = reserve_metadata_bytes(block_rsv, num_bytes); + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; } - ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); - if (ret > 0) - goto again; - return ret; } @@ -3665,7 +3688,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(block_rsv, num_bytes); + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3845,7 +3869,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries) + int num_items) { u64 num_bytes; int ret; @@ -3855,7 +3879,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes, retries); + num_bytes); if (!ret) { trans->bytes_reserved += num_bytes; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3929,14 +3953,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; - int retries = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); -again: + spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; if (nr_extents > BTRFS_I(inode)->reserved_extents) { @@ -3946,18 +3969,14 @@ again: nr_extents = 0; to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(block_rsv, to_reserve); - if (ret) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, - &retries); - if (ret > 0) - goto again; + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) return ret; - } + spin_lock(&BTRFS_I(inode)->accounting_lock); BTRFS_I(inode)->reserved_extents += nr_extents; atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_unlock(&BTRFS_I(inode)->accounting_lock); @@ -5584,7 +5603,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(block_rsv, blocksize); + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); if (ret) return ERR_PTR(ret); return block_rsv; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af339ee..fd07144 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -179,8 +179,6 @@ struct reloc_control { u64 search_start; u64 extents_found; - int block_rsv_retries; - unsigned int stage:8; unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; @@ -2134,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err) LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - int retries = 0; mutex_lock(&root->fs_info->trans_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; @@ -2144,7 +2141,7 @@ again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes, &retries); + num_bytes); if (ret) err = ret; } @@ -2156,7 +2153,6 @@ again: btrfs_end_transaction(trans, rc->extent_root); btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); - retries = 0; goto again; } } @@ -2406,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, - &rc->block_rsv_retries); + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; return ret; } - rc->block_rsv_retries = 0; return 0; } @@ -3615,8 +3609,7 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - &rc->block_rsv_retries); + rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3628,7 +3621,6 @@ int prepare_to_relocate(struct reloc_control *rc) rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; - rc->block_rsv_retries = 0; rc->create_reloc_tree = 1; set_reloc_control(rc); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index db0da35..0af647c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -180,7 +180,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; int ret; again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); @@ -215,8 +214,7 @@ again: } if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items, - &retries); + ret = btrfs_trans_reserve_metadata(h, root, num_items); if (ret == -EAGAIN) { btrfs_commit_transaction(h, root); goto again; @@ -855,7 +853,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct extent_buffer *tmp; struct extent_buffer *old; int ret; - int retries = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -877,7 +874,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve, &retries); + to_reserve); if (ret) { pending->error = ret; goto fail; -- 1.6.6.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html