thr3ads.net - Btrfs devel - [PATCH 0/4] Btrfs: fix ENOSPC regressions [Oct 2010]

If this information is useful, please help other people find it:
Share via:

Josef Bacik

2010-Oct-15 21:28 UTC

[PATCH 0/4] Btrfs: fix ENOSPC regressions

This patchset fixes some problems with the ENOSPC code for block groups, but
more importantly it fixes a huge ENOSPC regression that''s occured. 
With my
fs_mark test

fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0

on a 2gb fs without these patches fs_mark would exit out with ENOSPC after
writing around 50mb.  With these patches I can now fill up the disk.  Also the
new ENOSPC code is super aggressive about allocating metadata chunks, to the
point that even with the multi-writer regression fixed I was still only able to
fill about 900mb with data on a 2gb fs.  With all of these patches I can fill up
the 2gb fs with about 1.9gb of data.  This is much more reasonable.  There
doesn''t appear to be any performance regression, but I would appreciate
testing
to make sure this is actually the case.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-15 21:28 UTC

head link

[PATCH 1/4] Btrfs: fix reservation code for mixed block groups

The global reservation stuff tries to add together DATA and METADATA used in
order to figure out how much to reserve for everything, but this
doesn''t work
right for mixed block groups.  Instead if we have mixed block groups just set
data used to 0.  Also with mixed block groups we will use bytes_may_use for
keeping track of delalloc bytes, so we need to take that into account in our
reservation calculations.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72c3d5f..d532f00 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3444,7 +3444,8 @@ static int reserve_metadata_bytes(struct btrfs_block_rsv
*block_rsv,
 
 	spin_lock(&space_info->lock);
 	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly;
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
 	if (unused < space_info->total_bytes)
 		unused = space_info->total_bytes - unused;
@@ -3738,6 +3739,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info
*fs_info)
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	spin_lock(&sinfo->lock);
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
+		data_used = 0;
 	meta_used = sinfo->bytes_used;
 	spin_unlock(&sinfo->lock);
 
@@ -3765,7 +3768,8 @@ static void update_global_block_rsv(struct btrfs_fs_info
*fs_info)
 	block_rsv->size = num_bytes;
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
-		    sinfo->bytes_reserved + sinfo->bytes_readonly;
+		    sinfo->bytes_reserved + sinfo->bytes_readonly +
+		    sinfo->bytes_may_use;
 
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-15 21:28 UTC

head link

[PATCH 2/4] Btrfs: re-work delalloc flushing

Currently we try and flush delalloc, but we only do that in a sort of weak way,
which works fine in most cases but if we''re under heavy pressure we
need to be
able to wait for flushing to happen.  Also instead of checking the bytes
reserved in the block_rsv, check the space info since it is more accurate.  The
sync option will be used in a future patch.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |    3 ++-
 fs/btrfs/extent-tree.c |   26 ++++++++++++++------------
 fs/btrfs/inode.c       |    8 ++++++--
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4833a01..72f5e1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2439,7 +2439,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle
*trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d532f00..14a52dd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3342,9 +3342,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle
*trans,
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim)
+			   struct btrfs_root *root, u64 to_reclaim, int sync)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_space_info *space_info;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
@@ -3353,9 +3354,10 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	int ret;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
-	spin_lock(&block_rsv->lock);
-	reserved = block_rsv->reserved;
-	spin_unlock(&block_rsv->lock);
+	space_info = block_rsv->space_info;
+	spin_lock(&space_info->lock);
+	reserved = space_info->bytes_reserved;
+	spin_unlock(&space_info->lock);
 
 	if (reserved == 0)
 		return 0;
@@ -3363,7 +3365,7 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (1) {
-		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync);
 		if (!ret) {
 			if (no_reclaim > 2)
 				break;
@@ -3378,11 +3380,11 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 			pause = 1;
 		}
 
-		spin_lock(&block_rsv->lock);
-		if (reserved > block_rsv->reserved)
-			reclaimed = reserved - block_rsv->reserved;
-		reserved = block_rsv->reserved;
-		spin_unlock(&block_rsv->lock);
+		spin_lock(&space_info->lock);
+		if (reserved > space_info->bytes_reserved)
+			reclaimed = reserved - space_info->bytes_reserved;
+		reserved = space_info->bytes_reserved;
+		spin_unlock(&space_info->lock);
 
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
@@ -3411,7 +3413,7 @@ static int should_retry_reserve(struct btrfs_trans_handle
*trans,
 	if (trans && trans->transaction->in_commit)
 		return -ENOSPC;
 
-	ret = shrink_delalloc(trans, root, num_bytes);
+	ret = shrink_delalloc(trans, root, num_bytes, 0);
 	if (ret)
 		return ret;
 
@@ -3960,7 +3962,7 @@ again:
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	if (block_rsv->size > 512 * 1024 * 1024)
-		shrink_delalloc(NULL, root, to_reserve);
+		shrink_delalloc(NULL, root, to_reserve, 0);
 
 	return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d17518..8b678dc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6683,7 +6683,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root,
int delay_iput)
 	return 0;
 }
 
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode = NULL;
@@ -6705,7 +6706,10 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root
*root, int delay_iput)
 	spin_unlock(&root->fs_info->delalloc_lock);
 
 	if (inode) {
-		write_inode_now(inode, 0);
+		if (sync)
+			filemap_write_and_wait(inode->i_mapping);
+		else
+			filemap_flush(inode->i_mapping);
 		if (delay_iput)
 			btrfs_add_delayed_iput(inode);
 		else
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-15 21:28 UTC

head link

[PATCH 3/4] Btrfs: don''t allocate chunks as aggressively

Because the ENOSPC code over reserves super aggressively we end up allocating
chunks way more often than we should.  For example with my fs_mark tests on a
2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is
being used.  So instead check to see if the amount of space actually used is
less than 30% of the total space, and if so don''t allocate a chunk.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c |   11 ++++++++---
 1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 14a52dd..265d8e0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3224,7 +3224,8 @@ static void force_metadata_allocation(struct btrfs_fs_info
*info)
 	rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+static int should_alloc_chunk(struct btrfs_fs_info *info,
+			      struct btrfs_space_info *sinfo,
 			      u64 alloc_bytes)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3237,6 +3238,9 @@ static int should_alloc_chunk(struct btrfs_space_info
*sinfo,
 	    alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
+	if (sinfo->bytes_used < div_factor(num_bytes, 3))
+		return 0;
+
 	return 1;
 }
 
@@ -3268,7 +3272,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle
*trans,
 		goto out;
 	}
 
-	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
+	if (!force && !should_alloc_chunk(fs_info, space_info, alloc_bytes)) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
@@ -3317,7 +3321,8 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle
*trans,
 		return 0;
 
 	spin_lock(&sinfo->lock);
-	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+	ret = should_alloc_chunk(root->fs_info, sinfo,
+				 num_bytes + 2 * 1024 * 1024);
 	spin_unlock(&sinfo->lock);
 	if (!ret)
 		return 0;
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-15 21:28 UTC

head link

[PATCH 4/4] Btrfs: rework how we reserve metadata bytes

With multi-threaded writes we were getting ENOSPC early because somebody would
come in, start flushing delalloc because they couldn''t make their
reservation,
and in the meantime other threads would come in and use the space that was
getting freed up, so when the original thread went to check to see if they had
space they didn''t and they''d return ENOSPC.  So instead if we
have some free
space but not enough for our reservation, take the reservation and then start
doing the flushing.  The only time we don''t take reservations is when
we''ve
already overcommitted our space, that way we don''t have people who come
late to
the party way overcommitting ourselves.  This also moves all of the retrying and
flushing code into reserve_metdata_bytes so it''s all uniform.  This
keeps my
fs_mark test from returning -ENOSPC as soon as it starts and actually lets me
fill up the disk.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |    4 +-
 fs/btrfs/extent-tree.c |  230 ++++++++++++++++++++++++++++++++----------------
 fs/btrfs/relocation.c  |   14 +---
 fs/btrfs/transaction.c |    7 +-
 4 files changed, 160 insertions(+), 95 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72f5e1a..9e923c1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2144,7 +2144,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64
bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				int num_items, int *retries);
+				int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2165,7 +2165,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info
*fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries);
+			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 265d8e0..13f723a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3400,79 +3400,162 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	return reclaimed >= to_reclaim;
 }
 
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes, int *retries)
+/*
+ * Retries tells us how many times we''ve called
reserve_metadata_bytes.  The
+ * idea is if this is the first call (retries == 0) then we will add to our
+ * reserved count if we can''t make the allocation in order to hold our
place
+ * while we go and try and free up space.  That way for retries > 1 we
don''t try
+ * and add space, we just check to see if the amount of unused space is >=
the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don''t intend to retry this reservation, pass -1 as
retries so
+ * that it short circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	int ret;
-
-	if ((*retries) > 2)
-		return -ENOSPC;
+	u64 unused;
+	u64 num_bytes = orig_bytes;
+	int retries = 0;
+	int ret = 0;
+	bool reserved = false;
 
-	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-	if (ret)
-		return 1;
+again:
+	ret = -ENOSPC;
+	if (reserved)
+		num_bytes = 0;
 
-	if (trans && trans->transaction->in_commit)
-		return -ENOSPC;
+	spin_lock(&space_info->lock);
+	unused = space_info->bytes_used + space_info->bytes_reserved +
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
-	ret = shrink_delalloc(trans, root, num_bytes, 0);
-	if (ret)
-		return ret;
+	/*
+	 * The idea here is that we''ve not already over-reserved the block
group
+	 * then we can go ahead and save our reservation first and then start
+	 * flushing if we need to.  Otherwise if we''ve already overcommitted
+	 * lets start flushing stuff first and then come back and try to make
+	 * our reservation.
+	 */
+	if (unused <= space_info->total_bytes) {
+		unused -= space_info->total_bytes;
+		if (unused >= num_bytes) {
+			if (!reserved)
+				space_info->bytes_reserved += orig_bytes;
+			reserved = true;
+			ret = 0;
+		} else if (!reserved) {
+			/*
+			 * Don''t have enough space, reserve what we need and try
+			 * to flush.
+			 */
+			space_info->bytes_reserved += orig_bytes;
+			reserved = true;
+		}
+	} else {
+		/*
+		 * Ok we''re over committed, set num_bytes to some huge number so
+		 * that the flushing stuff will give us plenty of slack space.
+		 */
+		num_bytes = orig_bytes * 2;
+	}
 
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < num_bytes)
-		ret = 1;
 	spin_unlock(&space_info->lock);
-	if (ret)
-		return -ENOSPC;
 
-	(*retries)++;
+	if (!ret)
+		return 0;
 
-	if (trans)
-		return -EAGAIN;
+	/* We haven''t retried yet, try and allocate a chunk. */
+	if (retries == 0) {
+		retries++;
+		ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+		if (ret) {
+			if (reserved)
+				return 0;
+			goto again;
+		}
+	}
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(IS_ERR(trans));
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	/*
+	 * So for the first round of retries (retries == 1) we don''t want to
be
+	 * synchronously writing out inodes.  For retries == 2 we want to do
+	 * sync, and if that still doesn''t work out move on.
+	 */
+	if (retries < 3) {
+		/*
+		 * We do this because sometimes we get trans from
+		 * start_trans_handle, which will deal with shrink_delalloc
+		 * returning -EAGAIN.  HOWEVER, there are cases that don''t deal
+		 * with -EAGAIN, so we''ll get things erroring out that really
+		 * shouldn''t have.  So instead just force us to use synchronous
+		 * delalloc flushing if we have a trans to give us the best
+		 * possible chance of actually reclaiming space.
+		 */
+		if (trans)
+			retries = 2;
 
-	return 1;
-}
+		ret = shrink_delalloc(trans, root, num_bytes, retries == 2);
+		if (ret > 0) {
+			/*
+			 * Ok we reclaimed enough space and already reserved,
+			 * we''re good to go.
+			 */
+			if (reserved)
+				return 0;
+			/*
+			 * We got enough space but didn''t actually reserve
+			 * already, which means we''re quite low on space, so try
+			 * and reserve again and if it fails lets just skip to
+			 * doing synchronous writeout.
+			 */
+			retries = 2;
+			goto again;
+		} else if (ret < 0) {
+			goto out;
+		}
 
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-				  u64 num_bytes)
-{
-	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
-	int ret = -ENOSPC;
+		/*
+		 * Ok shrink_delalloc didn''t reclaim enough, but we may not have
+		 * done a synchronous writeout either, so bump retries and try
+		 * again.
+		 */
+		retries++;
+		goto again;
+	}
 
 	spin_lock(&space_info->lock);
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	/*
+	 * Not enough space to be reclaimed, don''t bother committing the
+	 * transaction.
+	 */
+	if (space_info->bytes_pinned < orig_bytes)
+		ret = -ENOSPC;
+	spin_unlock(&space_info->lock);
+	if (ret)
+		goto out;
 
-	if (unused < space_info->total_bytes)
-		unused = space_info->total_bytes - unused;
-	else
-		unused = 0;
+	ret = -EAGAIN;
+	if (trans)
+		goto out;
 
-	if (unused >= num_bytes) {
-		if (block_rsv->priority >= 10) {
-			space_info->bytes_reserved += num_bytes;
-			ret = 0;
-		} else {
-			if ((unused + block_rsv->reserved) *
-			    block_rsv->priority >-			    (num_bytes + block_rsv->reserved)
* 10) {
-				space_info->bytes_reserved += num_bytes;
-				ret = 0;
-			}
-		}
+
+	ret = -ENOSPC;
+	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		goto out;
+	ret = btrfs_commit_transaction(trans, root);
+	if (!ret)
+		goto again;
+
+out:
+	if (reserved) {
+		spin_lock(&space_info->lock);
+		space_info->bytes_reserved -= orig_bytes;
+		spin_unlock(&space_info->lock);
 	}
-	spin_unlock(&space_info->lock);
 
 	return ret;
 }
@@ -3616,23 +3699,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info
*fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries)
+			u64 num_bytes)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
-again:
-	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
 	}
 
-	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-	if (ret > 0)
-		goto again;
-
 	return ret;
 }
 
@@ -3667,7 +3746,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle
*trans,
 		return 0;
 
 	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     num_bytes);
 		if (!ret) {
 			block_rsv_add_bytes(block_rsv, num_bytes, 0);
 			return 0;
@@ -3847,7 +3927,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root
*root, int num_items)
 
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 int num_items, int *retries)
+				 int num_items)
 {
 	u64 num_bytes;
 	int ret;
@@ -3857,7 +3937,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle
*trans,
 
 	num_bytes = calc_trans_metadata_size(root, num_items);
 	ret = btrfs_block_rsv_add(trans, root,
&root->fs_info->trans_block_rsv,
-				  num_bytes, retries);
+				  num_bytes);
 	if (!ret) {
 		trans->bytes_reserved += num_bytes;
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3931,14 +4011,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode,
u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv =
&root->fs_info->delalloc_block_rsv;
 	u64 to_reserve;
 	int nr_extents;
-	int retries = 0;
 	int ret;
 
 	if (btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
 	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3948,18 +4027,14 @@ again:
 		nr_extents = 0;
 		to_reserve = 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(block_rsv, to_reserve);
-	if (ret) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-					   &retries);
-		if (ret > 0)
-			goto again;
+	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve);
+	if (ret)
 		return ret;
-	}
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -5586,7 +5661,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(block_rsv, blocksize);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     blocksize);
 		if (ret)
 			return ERR_PTR(ret);
 		return block_rsv;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index af339ee..fd07144 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -179,8 +179,6 @@ struct reloc_control {
 	u64 search_start;
 	u64 extents_found;
 
-	int block_rsv_retries;
-
 	unsigned int stage:8;
 	unsigned int create_reloc_tree:1;
 	unsigned int merge_reloc_tree:1;
@@ -2134,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 	LIST_HEAD(reloc_roots);
 	u64 num_bytes = 0;
 	int ret;
-	int retries = 0;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2144,7 +2141,7 @@ again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
 		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-					  num_bytes, &retries);
+					  num_bytes);
 		if (ret)
 			err = ret;
 	}
@@ -2156,7 +2153,6 @@ again:
 			btrfs_end_transaction(trans, rc->extent_root);
 			btrfs_block_rsv_release(rc->extent_root,
 						rc->block_rsv, num_bytes);
-			retries = 0;
 			goto again;
 		}
 	}
@@ -2406,15 +2402,13 @@ static int reserve_metadata_space(struct
btrfs_trans_handle *trans,
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
-				  &rc->block_rsv_retries);
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
 		return ret;
 	}
 
-	rc->block_rsv_retries = 0;
 	return 0;
 }
 
@@ -3615,8 +3609,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	 * is no reservation in transaction handle.
 	 */
 	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
-				  rc->extent_root->nodesize * 256,
-				  &rc->block_rsv_retries);
+				  rc->extent_root->nodesize * 256);
 	if (ret)
 		return ret;
 
@@ -3628,7 +3621,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->extents_found = 0;
 	rc->nodes_relocated = 0;
 	rc->merging_rsv_size = 0;
-	rc->block_rsv_retries = 0;
 
 	rc->create_reloc_tree = 1;
 	set_reloc_control(rc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index db0da35..0af647c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -180,7 +180,6 @@ static struct btrfs_trans_handle *start_transaction(struct
btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
 	int ret;
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -215,8 +214,7 @@ again:
 	}
 
 	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items,
-						   &retries);
+		ret = btrfs_trans_reserve_metadata(h, root, num_items);
 		if (ret == -EAGAIN) {
 			btrfs_commit_transaction(h, root);
 			goto again;
@@ -855,7 +853,6 @@ static noinline int create_pending_snapshot(struct
btrfs_trans_handle *trans,
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
-	int retries = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
@@ -877,7 +874,7 @@ static noinline int create_pending_snapshot(struct
btrfs_trans_handle *trans,
 
 	if (to_reserve > 0) {
 		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
-					  to_reserve, &retries);
+					  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Christoph Hellwig

2010-Oct-15 21:35 UTC

head link

Re: [PATCH 0/4] Btrfs: fix ENOSPC regressions

On Fri, Oct 15, 2010 at 05:28:31PM -0400, Josef Bacik
wrote:> This patchset fixes some problems with the ENOSPC code for block groups,
but
> more importantly it fixes a huge ENOSPC regression that''s occured.
With my
> fs_mark test
> 
> fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0
> 
> on a 2gb fs without these patches fs_mark would exit out with ENOSPC after
> writing around 50mb.
CAre to add the test to xfstests?  We already have a _scratch_mkfs_sized
helper to create filesystems of a specific size for ENOSPC tests.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-15 21:37 UTC

head link

Re: [PATCH 0/4] Btrfs: fix ENOSPC regressions

On Fri, Oct 15, 2010 at 05:35:58PM -0400, Christoph Hellwig
wrote:> On Fri, Oct 15, 2010 at 05:28:31PM -0400, Josef Bacik wrote:
> > This patchset fixes some problems with the ENOSPC code for block
groups, but
> > more importantly it fixes a huge ENOSPC regression that''s
occured.  With my
> > fs_mark test
> > 
> > fs_mark -d /mnt/btrfs-test -D 512 -t 16 -n 4096 -F -S0
> > 
> > on a 2gb fs without these patches fs_mark would exit out with ENOSPC
after
> > writing around 50mb.
> 
> CAre to add the test to xfstests?  We already have a _scratch_mkfs_sized
> helper to create filesystems of a specific size for ENOSPC tests.
Yup I will do that first thing on Monday.  Thanks,

Josef 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-18 21:13 UTC

head link

Re: [PATCH 3/4] Btrfs: don''t allocate chunks as aggressively

On Fri, Oct 15, 2010 at 05:28:34PM -0400, Josef Bacik
wrote:> Because the ENOSPC code over reserves super aggressively we end up
allocating
> chunks way more often than we should.  For example with my fs_mark tests on
a
> 2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that
is
> being used.  So instead check to see if the amount of space actually used
is
> less than 30% of the total space, and if so don''t allocate a
chunk.
> 
> Signed-off-by: Josef Bacik <josef@redhat.com>
> ---
>  fs/btrfs/extent-tree.c |   11 ++++++++---
>  1 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 14a52dd..265d8e0 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3224,7 +3224,8 @@ static void force_metadata_allocation(struct
btrfs_fs_info *info)
>  	rcu_read_unlock();
>  }
>  
> -static int should_alloc_chunk(struct btrfs_space_info *sinfo,
> +static int should_alloc_chunk(struct btrfs_fs_info *info,
> +			      struct btrfs_space_info *sinfo,
>  			      u64 alloc_bytes)
>  {
>  	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
> @@ -3237,6 +3238,9 @@ static int should_alloc_chunk(struct btrfs_space_info
*sinfo,
>  	    alloc_bytes < div_factor(num_bytes, 8))
>  		return 0;
>  
> +	if (sinfo->bytes_used < div_factor(num_bytes, 3))
> +		return 0;
> +
>  	return 1;
>  }
>  
> @@ -3268,7 +3272,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle
*trans,
>  		goto out;
>  	}
>  
> -	if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
> +	if (!force && !should_alloc_chunk(fs_info, space_info,
alloc_bytes)) {
>  		spin_unlock(&space_info->lock);
>  		goto out;
>  	}
> @@ -3317,7 +3321,8 @@ static int maybe_allocate_chunk(struct
btrfs_trans_handle *trans,
>  		return 0;
>  
>  	spin_lock(&sinfo->lock);
> -	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
> +	ret = should_alloc_chunk(root->fs_info, sinfo,
> +				 num_bytes + 2 * 1024 * 1024);
>  	spin_unlock(&sinfo->lock);
>  	if (!ret)
>  		return 0;
> -- 
> 1.6.6.1
> 
Self-NAK on this one, it seems to cause a few problems with -m single and
smaller fs''s, so just drop it.  It only creates too much overhead on
really
small fs''s anyway, and if you no likey that overhead, use mixed block
groups :).
Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-19 01:02 UTC

head link

[PATCH] Btrfs: don''t allocate chunks as aggressively V2

Because the ENOSPC code over reserves super aggressively we end up allocating
chunks way more often than we should.  For example with my fs_mark tests on a
2gb fs I can end up reserved 1gb just for metadata, when only 34mb of that is
being used.  So instead check to see if the amount of space actually used is
less than 30% of the total space, and if so don''t allocate a chunk, but
only if
we have at least 256mb of free space to make sure we don''t put too much
pressure
on free space.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
V1-V2: Cleanup should_alloc_chunk so it doesnt take fs_info, I was using it
before for something different, forgot to clean it up.  Also added 256mb free
floor at Chris''s suggestion to help with the -m single case.

 fs/btrfs/extent-tree.c |    7 +++++--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 14a52dd..eac11b1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3224,8 +3224,7 @@ static void force_metadata_allocation(struct btrfs_fs_info
*info)
 	rcu_read_unlock();
 }
 
-static int should_alloc_chunk(struct btrfs_space_info *sinfo,
-			      u64 alloc_bytes)
+static int should_alloc_chunk(struct btrfs_space_info *sinfo, u64 alloc_bytes)
 {
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 
@@ -3237,6 +3236,10 @@ static int should_alloc_chunk(struct btrfs_space_info
*sinfo,
 	    alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
+	if (num_bytes > 256 * 1024 * 1024 &&
+	    sinfo->bytes_used < div_factor(num_bytes, 3))
+		return 0;
+
 	return 1;
 }
 
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-22 18:45 UTC

head link

[PATCH] Btrfs: re-work delalloc flushing V2

Currently we try and flush delalloc, but we only do that in a sort of weak way,
which works fine in most cases but if we''re under heavy pressure we
need to be
able to wait for flushing to happen.  Also instead of checking the bytes
reserved in the block_rsv, check the space info since it is more accurate.  The
sync option will be used in a future patch.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
V1->V2: fix how we counted reclaimed, and do btrfs_wait_ordered_range if
we''re
syncing the file since compression does weird things with writeback.

 fs/btrfs/ctree.h       |    3 ++-
 fs/btrfs/extent-tree.c |   26 ++++++++++++++------------
 fs/btrfs/inode.c       |   24 ++++++++++++++++++++++--
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4833a01..72f5e1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2439,7 +2439,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle
*trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 080be22..e25525f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3342,9 +3342,10 @@ static int maybe_allocate_chunk(struct btrfs_trans_handle
*trans,
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim)
+			   struct btrfs_root *root, u64 to_reclaim, int sync)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_space_info *space_info;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
@@ -3353,9 +3354,10 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	int ret;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
-	spin_lock(&block_rsv->lock);
-	reserved = block_rsv->reserved;
-	spin_unlock(&block_rsv->lock);
+	space_info = block_rsv->space_info;
+	spin_lock(&space_info->lock);
+	reserved = space_info->bytes_reserved;
+	spin_unlock(&space_info->lock);
 
 	if (reserved == 0)
 		return 0;
@@ -3363,7 +3365,7 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	max_reclaim = min(reserved, to_reclaim);
 
 	while (1) {
-		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+		ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0, sync);
 		if (!ret) {
 			if (no_reclaim > 2)
 				break;
@@ -3378,11 +3380,11 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 			pause = 1;
 		}
 
-		spin_lock(&block_rsv->lock);
-		if (reserved > block_rsv->reserved)
-			reclaimed = reserved - block_rsv->reserved;
-		reserved = block_rsv->reserved;
-		spin_unlock(&block_rsv->lock);
+		spin_lock(&space_info->lock);
+		if (reserved > space_info->bytes_reserved)
+			reclaimed += reserved - space_info->bytes_reserved;
+		reserved = space_info->bytes_reserved;
+		spin_unlock(&space_info->lock);
 
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
@@ -3411,7 +3413,7 @@ static int should_retry_reserve(struct btrfs_trans_handle
*trans,
 	if (trans && trans->transaction->in_commit)
 		return -ENOSPC;
 
-	ret = shrink_delalloc(trans, root, num_bytes);
+	ret = shrink_delalloc(trans, root, num_bytes, 0);
 	if (ret)
 		return ret;
 
@@ -3960,7 +3962,7 @@ again:
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	if (block_rsv->size > 512 * 1024 * 1024)
-		shrink_delalloc(NULL, root, to_reserve);
+		shrink_delalloc(NULL, root, to_reserve, 0);
 
 	return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d17518..ce78cf9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6683,7 +6683,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root,
int delay_iput)
 	return 0;
 }
 
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
+				   int sync)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode = NULL;
@@ -6705,7 +6706,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root
*root, int delay_iput)
 	spin_unlock(&root->fs_info->delalloc_lock);
 
 	if (inode) {
-		write_inode_now(inode, 0);
+		if (sync) {
+			filemap_write_and_wait(inode->i_mapping);
+			/*
+			 * We have to do this because compression doesn''t
+			 * actually set PG_writeback until it submits the pages
+			 * for IO, which happens in an async thread, so we could
+			 * race and not actually wait for any writeback pages
+			 * because they''ve not been submitted yet.  Technically
+			 * this could still be the case for the ordered stuff
+			 * since the async thread may not have started to do its
+			 * work yet.  If this becomes the case then we need to
+			 * figure out a way to make sure that in writepage we
+			 * wait for any async pages to be submitted before
+			 * returning so that fdatawait does what its supposed to
+			 * do.
+			 */
+			btrfs_wait_ordered_range(inode, 0, (u64)-1);
+		} else {
+			filemap_flush(inode->i_mapping);
+		}
 		if (delay_iput)
 			btrfs_add_delayed_iput(inode);
 		else
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Josef Bacik

2010-Oct-22 18:50 UTC

head link

[PATCH] Btrfs: rework how we reserve metadata bytes V2

With multi-threaded writes we were getting ENOSPC early because somebody would
come in, start flushing delalloc because they couldn''t make their
reservation,
and in the meantime other threads would come in and use the space that was
getting freed up, so when the original thread went to check to see if they had
space they didn''t and they''d return ENOSPC.  So instead if we
have some free
space but not enough for our reservation, take the reservation and then start
doing the flushing.  The only time we don''t take reservations is when
we''ve
already overcommitted our space, that way we don''t have people who come
late to
the party way overcommitting ourselves.  This also moves all of the retrying and
flushing code into reserve_metdata_bytes so it''s all uniform.  This
keeps my
fs_mark test from returning -ENOSPC as soon as it starts and actually lets me
fill up the disk.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
V1->V2: Don''t allocate chunks, it takes more metadata and can cause
problems
-only flush if we say so, this keeps us from deadlocking with the tree lock and
such
-hold our reservation regardless if we are overcommitted or not, just adjust how
much we need to reclaim to succeed.
-always sync write out delalloc, we cant reclaim unless the io completes anyway
 fs/btrfs/ctree.h       |    4 +-
 fs/btrfs/extent-tree.c |  238 ++++++++++++++++++++++++++----------------------
 fs/btrfs/relocation.c  |   14 +--
 fs/btrfs/transaction.c |    7 +-
 4 files changed, 136 insertions(+), 127 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72f5e1a..9e923c1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2144,7 +2144,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64
bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				int num_items, int *retries);
+				int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2165,7 +2165,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info
*fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries);
+			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1bb4bca..6942239 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3309,38 +3309,6 @@ out:
 	return ret;
 }
 
-static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_space_info *sinfo, u64 num_bytes)
-{
-	int ret;
-	int end_trans = 0;
-
-	if (sinfo->full)
-		return 0;
-
-	spin_lock(&sinfo->lock);
-	ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
-	spin_unlock(&sinfo->lock);
-	if (!ret)
-		return 0;
-
-	if (!trans) {
-		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(IS_ERR(trans));
-		end_trans = 1;
-	}
-
-	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-			     num_bytes + 2 * 1024 * 1024,
-			     get_alloc_profile(root, sinfo->flags), 0);
-
-	if (end_trans)
-		btrfs_end_transaction(trans, root);
-
-	return ret == 1 ? 1 : 0;
-}
-
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3398,79 +3366,138 @@ static int shrink_delalloc(struct btrfs_trans_handle
*trans,
 	return reclaimed >= to_reclaim;
 }
 
-static int should_retry_reserve(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes, int *retries)
+/*
+ * Retries tells us how many times we''ve called
reserve_metadata_bytes.  The
+ * idea is if this is the first call (retries == 0) then we will add to our
+ * reserved count if we can''t make the allocation in order to hold our
place
+ * while we go and try and free up space.  That way for retries > 1 we
don''t try
+ * and add space, we just check to see if the amount of unused space is >=
the
+ * total space, meaning that our reservation is valid.
+ *
+ * However if we don''t intend to retry this reservation, pass -1 as
retries so
+ * that it short circuits this logic.
+ */
+static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	int ret;
+	u64 unused;
+	u64 num_bytes = orig_bytes;
+	int retries = 0;
+	int ret = 0;
+	bool reserved = false;
 
-	if ((*retries) > 2)
-		return -ENOSPC;
+again:
+	ret = -ENOSPC;
+	if (reserved)
+		num_bytes = 0;
 
-	ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
-	if (ret)
-		return 1;
+	spin_lock(&space_info->lock);
+	unused = space_info->bytes_used + space_info->bytes_reserved +
+		 space_info->bytes_pinned + space_info->bytes_readonly +
+		 space_info->bytes_may_use;
 
-	if (trans && trans->transaction->in_commit)
-		return -ENOSPC;
+	/*
+	 * The idea here is that we''ve not already over-reserved the block
group
+	 * then we can go ahead and save our reservation first and then start
+	 * flushing if we need to.  Otherwise if we''ve already overcommitted
+	 * lets start flushing stuff first and then come back and try to make
+	 * our reservation.
+	 */
+	if (unused <= space_info->total_bytes) {
+		unused -= space_info->total_bytes;
+		if (unused >= num_bytes) {
+			if (!reserved)
+				space_info->bytes_reserved += orig_bytes;
+			ret = 0;
+		} else {
+			/*
+			 * Ok set num_bytes to orig_bytes since we aren''t
+			 * overocmmitted, this way we only try and reclaim what
+			 * we need.
+			 */
+			num_bytes = orig_bytes;
+		}
+	} else {
+		/*
+		 * Ok we''re over committed, set num_bytes to the overcommitted
+		 * amount plus the amount of bytes that we need for this
+		 * reservation.
+		 */
+		num_bytes = unused - space_info->total_bytes +
+			(orig_bytes * (retries + 1));
+	}
 
-	ret = shrink_delalloc(trans, root, num_bytes, 0);
-	if (ret)
-		return ret;
+	/*
+	 * Couldn''t make our reservation, save our place so while
we''re trying
+	 * to reclaim space we can actually use it instead of somebody else
+	 * stealing it from us.
+	 */
+	if (ret && !reserved) {
+		space_info->bytes_reserved += orig_bytes;
+		reserved = true;
+	}
 
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < num_bytes)
-		ret = 1;
 	spin_unlock(&space_info->lock);
-	if (ret)
-		return -ENOSPC;
 
-	(*retries)++;
-
-	if (trans)
-		return -EAGAIN;
+	if (!ret)
+		return 0;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(IS_ERR(trans));
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	if (!flush)
+		goto out;
 
-	return 1;
-}
+	/*
+	 * We do synchronous shrinking since we don''t actually unreserve
+	 * metadata until after the IO is completed.
+	 */
+	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	if (ret > 0)
+		return 0;
+	else if (ret < 0)
+		goto out;
 
-static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
-				  u64 num_bytes)
-{
-	struct btrfs_space_info *space_info = block_rsv->space_info;
-	u64 unused;
-	int ret = -ENOSPC;
+	/*
+	 * So if we were overcommitted it''s possible that somebody else
flushed
+	 * out enough space and we simply didn''t have enough space to
reclaim,
+	 * so go back around and try again.
+	 */
+	if (retries < 2) {
+		retries++;
+		goto again;
+	}
 
 	spin_lock(&space_info->lock);
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	/*
+	 * Not enough space to be reclaimed, don''t bother committing the
+	 * transaction.
+	 */
+	if (space_info->bytes_pinned < orig_bytes)
+		ret = -ENOSPC;
+	spin_unlock(&space_info->lock);
+	if (ret)
+		goto out;
 
-	if (unused < space_info->total_bytes)
-		unused = space_info->total_bytes - unused;
-	else
-		unused = 0;
+	ret = -EAGAIN;
+	if (trans)
+		goto out;
 
-	if (unused >= num_bytes) {
-		if (block_rsv->priority >= 10) {
-			space_info->bytes_reserved += num_bytes;
-			ret = 0;
-		} else {
-			if ((unused + block_rsv->reserved) *
-			    block_rsv->priority >-			    (num_bytes + block_rsv->reserved)
* 10) {
-				space_info->bytes_reserved += num_bytes;
-				ret = 0;
-			}
-		}
+
+	ret = -ENOSPC;
+	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		goto out;
+	ret = btrfs_commit_transaction(trans, root);
+	if (!ret)
+		goto again;
+
+out:
+	if (reserved) {
+		spin_lock(&space_info->lock);
+		space_info->bytes_reserved -= orig_bytes;
+		spin_unlock(&space_info->lock);
 	}
-	spin_unlock(&space_info->lock);
 
 	return ret;
 }
@@ -3614,23 +3641,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info
*fs_info,
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes, int *retries)
+			u64 num_bytes)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
-again:
-	ret = reserve_metadata_bytes(block_rsv, num_bytes);
+
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
 	}
 
-	ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
-	if (ret > 0)
-		goto again;
-
 	return ret;
 }
 
@@ -3665,7 +3688,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle
*trans,
 		return 0;
 
 	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(block_rsv, num_bytes);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     num_bytes, 0);
 		if (!ret) {
 			block_rsv_add_bytes(block_rsv, num_bytes, 0);
 			return 0;
@@ -3845,7 +3869,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root
*root, int num_items)
 
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 int num_items, int *retries)
+				 int num_items)
 {
 	u64 num_bytes;
 	int ret;
@@ -3855,7 +3879,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle
*trans,
 
 	num_bytes = calc_trans_metadata_size(root, num_items);
 	ret = btrfs_block_rsv_add(trans, root,
&root->fs_info->trans_block_rsv,
-				  num_bytes, retries);
+				  num_bytes);
 	if (!ret) {
 		trans->bytes_reserved += num_bytes;
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3929,14 +3953,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode,
u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv =
&root->fs_info->delalloc_block_rsv;
 	u64 to_reserve;
 	int nr_extents;
-	int retries = 0;
 	int ret;
 
 	if (btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
-again:
+
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
 	if (nr_extents > BTRFS_I(inode)->reserved_extents) {
@@ -3946,18 +3969,14 @@ again:
 		nr_extents = 0;
 		to_reserve = 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
-	ret = reserve_metadata_bytes(block_rsv, to_reserve);
-	if (ret) {
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
-					   &retries);
-		if (ret > 0)
-			goto again;
+	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	if (ret)
 		return ret;
-	}
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
@@ -5584,7 +5603,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(block_rsv, blocksize);
+		ret = reserve_metadata_bytes(trans, root, block_rsv,
+					     blocksize, 0);
 		if (ret)
 			return ERR_PTR(ret);
 		return block_rsv;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index af339ee..fd07144 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -179,8 +179,6 @@ struct reloc_control {
 	u64 search_start;
 	u64 extents_found;
 
-	int block_rsv_retries;
-
 	unsigned int stage:8;
 	unsigned int create_reloc_tree:1;
 	unsigned int merge_reloc_tree:1;
@@ -2134,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 	LIST_HEAD(reloc_roots);
 	u64 num_bytes = 0;
 	int ret;
-	int retries = 0;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
@@ -2144,7 +2141,7 @@ again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
 		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-					  num_bytes, &retries);
+					  num_bytes);
 		if (ret)
 			err = ret;
 	}
@@ -2156,7 +2153,6 @@ again:
 			btrfs_end_transaction(trans, rc->extent_root);
 			btrfs_block_rsv_release(rc->extent_root,
 						rc->block_rsv, num_bytes);
-			retries = 0;
 			goto again;
 		}
 	}
@@ -2406,15 +2402,13 @@ static int reserve_metadata_space(struct
btrfs_trans_handle *trans,
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
-				  &rc->block_rsv_retries);
+	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
 		return ret;
 	}
 
-	rc->block_rsv_retries = 0;
 	return 0;
 }
 
@@ -3615,8 +3609,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	 * is no reservation in transaction handle.
 	 */
 	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
-				  rc->extent_root->nodesize * 256,
-				  &rc->block_rsv_retries);
+				  rc->extent_root->nodesize * 256);
 	if (ret)
 		return ret;
 
@@ -3628,7 +3621,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	rc->extents_found = 0;
 	rc->nodes_relocated = 0;
 	rc->merging_rsv_size = 0;
-	rc->block_rsv_retries = 0;
 
 	rc->create_reloc_tree = 1;
 	set_reloc_control(rc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index db0da35..0af647c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -180,7 +180,6 @@ static struct btrfs_trans_handle *start_transaction(struct
btrfs_root *root,
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
-	int retries = 0;
 	int ret;
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -215,8 +214,7 @@ again:
 	}
 
 	if (num_items > 0) {
-		ret = btrfs_trans_reserve_metadata(h, root, num_items,
-						   &retries);
+		ret = btrfs_trans_reserve_metadata(h, root, num_items);
 		if (ret == -EAGAIN) {
 			btrfs_commit_transaction(h, root);
 			goto again;
@@ -855,7 +853,6 @@ static noinline int create_pending_snapshot(struct
btrfs_trans_handle *trans,
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
-	int retries = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
@@ -877,7 +874,7 @@ static noinline int create_pending_snapshot(struct
btrfs_trans_handle *trans,
 
 	if (to_reserve > 0) {
 		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
-					  to_reserve, &retries);
+					  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
-- 
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Btrfs devel - Oct 2010 - [PATCH 0/4] Btrfs: fix ENOSPC regressions

[PATCH 0/4] Btrfs: fix ENOSPC regressions

[PATCH 1/4] Btrfs: fix reservation code for mixed block groups

[PATCH 2/4] Btrfs: re-work delalloc flushing

[PATCH 3/4] Btrfs: don''t allocate chunks as aggressively

[PATCH 4/4] Btrfs: rework how we reserve metadata bytes

Re: [PATCH 0/4] Btrfs: fix ENOSPC regressions

Re: [PATCH 0/4] Btrfs: fix ENOSPC regressions

Re: [PATCH 3/4] Btrfs: don''t allocate chunks as aggressively

[PATCH] Btrfs: don''t allocate chunks as aggressively V2

[PATCH] Btrfs: re-work delalloc flushing V2

[PATCH] Btrfs: rework how we reserve metadata bytes V2