This patch removes the large 85% brick wall that was keeping btrfs from panicking
because it ran out of space, and instead puts a nice 95% drywall in its place.
Instead of tracking delalloc bytes on a systemwide basis, just do it as per
space info. For now that means basically just one space_info has anything set
for bytes_delalloc, but in the future when there are different allocation
schemes for different files this will be less overkill. We reserve space by
using btrfs_delalloc_reserve_space, and then free space by using
btrfs_delalloc_free_space. You need to make sure that these are always even,
IOW for every reserve you must have a free, so either free if you aren't going
to use the reserved space, or free when the space is actually allocated.
For checkers we have btrfs_check_data_free_space which checks to make sure we
have sufficient metadata space and space for bytes allocation. For metadata
operations we have btrfs_check_metadata_free_space, which is where the 95% wall
comes in. We make sure that we always have at least 5% metadata space free in
order to make sure we have enough space to rm things if we run out of data
space. So btrfs_check_data_free_space will call
btrfs_check_metadata_free_space, but every metadata operation (like
mknod/create/link etc) just needs to call btrfs_check_metadata_free_space. If
the operation is going to result in freed space, such as unlink and rmdir, you
don't need to do the check.
Tested this with fs_mark, dd, fsx without a problem. Tested against head to
make sure there wasn't a performance regression and everything came out even.
Thanks,
Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
fs/btrfs/ctree.h | 33 +++++++++----
fs/btrfs/extent-tree.c | 125 ++++++++++++++++++++++++++++++++++++++++++------
fs/btrfs/file.c | 11 +++-
fs/btrfs/inode.c | 60 +++++------------------
fs/btrfs/ioctl.c | 16 +++++--
5 files changed, 168 insertions(+), 77 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index de103a8..6819f3e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -600,13 +600,23 @@ struct btrfs_block_group_item {
struct btrfs_space_info {
u64 flags;
- u64 total_bytes;
- u64 bytes_used;
- u64 bytes_pinned;
- u64 bytes_reserved;
- u64 bytes_readonly;
- int full;
- int force_alloc;
+
+ u64 total_bytes; /* total bytes in the space */
+ u64 bytes_used; /* total bytes used on disk */
+ u64 bytes_pinned; /* total bytes pinned, will be freed when the
+ transaction finishes */
+ u64 bytes_reserved; /* total bytes the allocator has reserved for
+ current allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+ u64 bytes_delalloc; /* number of bytes reserved for allocation,
+ this space is not necessarily reserved yet
+ by the allocator */
+
+ int full; /* indicates that we cannot allocate any more
+ chunks for this space */
+ int force_alloc; /* set if we need to force a chunk alloc for
+ this space */
+
struct list_head list;
/* for block groups in our same type */
@@ -1786,6 +1796,13 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+int btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes);
/* ctree.c */
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
@@ -2029,8 +2046,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long
offset,
unsigned long btrfs_force_ra(struct address_space *mapping,
struct file_ra_state *ra, struct file *file,
pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
- int for_del);
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b26f09..6601123 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1814,6 +1814,7 @@ static int update_space_info(struct btrfs_fs_info *info,
u64 flags,
found->bytes_pinned = 0;
found->bytes_reserved = 0;
found->bytes_readonly = 0;
+ found->bytes_delalloc = 0;
found->full = 0;
found->force_alloc = 0;
*space_info = found;
@@ -1877,6 +1878,114 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root,
u64 flags)
return flags;
}
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ u64 alloc_profile;
+ /* build block group flags for @root: data if @data is set, else system or metadata */
+ if (data) {
+ alloc_profile = info->avail_data_alloc_bits &
+ info->data_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; /* @data now holds the flags */
+ } else if (root == root->fs_info->chunk_root) { /* chunk root uses the system profile */
+ alloc_profile = info->avail_system_alloc_bits &
+ info->system_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+ } else { /* every other root uses the metadata profile */
+ alloc_profile = info->avail_metadata_alloc_bits &
+ info->metadata_alloc_profile;
+ data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+ }
+ /* the returned flags are whatever btrfs_reduce_alloc_profile() makes of them */
+ return btrfs_reduce_alloc_profile(root, data);
+}
+
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 alloc_target, thresh;
+ /* returns 0 if a metadata operation may go ahead, -ENOSPC if the check below fails */
+ /* look up the space info matching the metadata/system profile of @root */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+ /* NOTE(review): meta_sinfo is used without a NULL check - confirm the lookup cannot fail */
+ /*
+ * if the metadata area isn't maxed out then there is no sense in
+ * checking how much is used, since we can always allocate a new chunk
+ */
+ if (!meta_sinfo->full)
+ return 0;
+
+ spin_lock(&meta_sinfo->lock);
+ thresh = meta_sinfo->total_bytes * 95;
+ /* total_bytes * 95 then do_div by 100: presumably the 95% limit - TODO confirm */
+ do_div(thresh, 100);
+
+ if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+ spin_unlock(&meta_sinfo->lock);
+ return -ENOSPC;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ return 0;
+}
+
+int btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *data_sinfo;
+ u64 alloc_target;
+ /* adds @bytes to the data space's bytes_delalloc; @inode is not used in this body */
+ /* get the space info for where this inode will be storing its data */
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ data_sinfo = __find_space_info(info, alloc_target);
+
+ /* fail with -ENOSPC if total minus every accounted counter is less than @bytes */
+ spin_lock(&data_sinfo->lock);
+ if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+ data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+ data_sinfo->bytes_pinned - data_sinfo->bytes_readonly < bytes) {
+ spin_unlock(&data_sinfo->lock); /* NOTE(review): counters printed below are read after unlock */
+ printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+ ", %llu bytes_used, %llu bytes_reserved, "
+ "%llu bytes_pinned, %llu bytes_readonly, %llu total\n",
+ bytes, data_sinfo->bytes_delalloc,
+ data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+ data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+ data_sinfo->total_bytes);
+ return -ENOSPC;
+ }
+ data_sinfo->bytes_delalloc += bytes;
+ spin_unlock(&data_sinfo->lock);
+ /* NOTE(review): if the check below fails @bytes stays in bytes_delalloc - confirm callers free it */
+ return btrfs_check_metadata_free_space(root);
+}
+
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+ u64 bytes)
+{
+ struct btrfs_space_info *info;
+ u64 alloc_target;
+ /* subtracts @bytes from the data space's bytes_delalloc; @inode is not used in this body */
+ alloc_target = btrfs_get_alloc_profile(root, 1);
+ info = __find_space_info(root->fs_info, alloc_target);
+ /* NOTE(review): info is used without a NULL check - confirm the lookup cannot fail */
+ spin_lock(&info->lock);
+ if (bytes == (u64)-1) { /* (u64)-1 is a sentinel: drop the whole counter */
+ info->bytes_delalloc = 0;
+ } else if (bytes > info->bytes_delalloc) { /* over-free: complain and clamp to zero */
+ printk(KERN_ERR "trying to free more bytes than we have\n");
+ dump_stack();
+ info->bytes_delalloc = 0;
+ } else {
+ info->bytes_delalloc -= bytes;
+ }
+ spin_unlock(&info->lock);
+}
+
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force)
@@ -3039,24 +3148,10 @@ static int __btrfs_reserve_extent(struct
btrfs_trans_handle *trans,
{
int ret;
u64 search_start = 0;
- u64 alloc_profile;
struct btrfs_fs_info *info = root->fs_info;
- if (data) {
- alloc_profile = info->avail_data_alloc_bits &
- info->data_alloc_profile;
- data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
- } else if (root == root->fs_info->chunk_root) {
- alloc_profile = info->avail_system_alloc_bits &
- info->system_alloc_profile;
- data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
- } else {
- alloc_profile = info->avail_metadata_alloc_bits &
- info->metadata_alloc_profile;
- data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
- }
+ data = btrfs_get_alloc_profile(root, data);
again:
- data = btrfs_reduce_alloc_profile(root, data);
/*
* the only place that sets empty_size is btrfs_realloc_node, which
* is not called recursively on allocations
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e8023e..c369371 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1091,28 +1091,33 @@ static ssize_t btrfs_file_write(struct file *file, const
char __user *buf,
WARN_ON(num_pages > nrptrs);
memset(pages, 0, sizeof(struct page *) * nrptrs);
- ret = btrfs_check_free_space(root, write_bytes, 0);
+ ret = btrfs_delalloc_reserve_space(root, inode, write_bytes);
if (ret)
goto out;
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, last_index,
write_bytes);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_free_space(root, inode, write_bytes);
goto out;
+ }
ret = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, buf);
if (ret) {
btrfs_drop_pages(pages, num_pages);
+ btrfs_delalloc_free_space(root, inode, write_bytes);
goto out;
}
ret = dirty_and_release_pages(NULL, root, file, pages,
num_pages, pos, write_bytes);
btrfs_drop_pages(pages, num_pages);
- if (ret)
+ if (ret) {
+ btrfs_delalloc_free_space(root, inode, write_bytes);
goto out;
+ }
if (will_write) {
btrfs_fdatawrite_range(inode->i_mapping, pos,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 288c2cd..910907b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -91,34 +91,6 @@ static noinline int cow_file_range(struct inode *inode,
unsigned long *nr_written, int unlock);
/*
- * a very lame attempt at stopping writes when the FS is 85% full. There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
- int for_del)
-{
- u64 total;
- u64 used;
- u64 thresh;
- int ret = 0;
-
- spin_lock(&root->fs_info->delalloc_lock);
- total = btrfs_super_total_bytes(&root->fs_info->super_copy);
- used = btrfs_super_bytes_used(&root->fs_info->super_copy);
- if (for_del)
- thresh = total * 90;
- else
- thresh = total * 85;
-
- do_div(thresh, 100);
-
- if (used + root->fs_info->delalloc_bytes + num_required > thresh)
- ret = -ENOSPC;
- spin_unlock(&root->fs_info->delalloc_lock);
- return ret;
-}
-
-/*
* this does all the hard work for inserting an inline extent into
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
@@ -1198,9 +1170,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64
start, u64 end,
(unsigned long long)end - start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
+ btrfs_delalloc_free_space(root, inode, (u64)-1);
root->fs_info->delalloc_bytes = 0;
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
+ btrfs_delalloc_free_space(root, inode,
+ end - start + 1);
root->fs_info->delalloc_bytes -= end - start + 1;
BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
}
@@ -2217,10 +2192,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry
*dentry)
root = BTRFS_I(dir)->root;
- ret = btrfs_check_free_space(root, 1, 1);
- if (ret)
- goto fail;
-
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
@@ -2233,7 +2204,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry
*dentry)
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -2256,10 +2226,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry
*dentry)
return -ENOTEMPTY;
}
- ret = btrfs_check_free_space(root, 1, 1);
- if (ret)
- goto fail;
-
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
@@ -2276,7 +2242,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry
*dentry)
fail_trans:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
-fail:
btrfs_btree_balance_dirty(root, nr);
if (ret && !err)
@@ -2786,7 +2751,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
if (size <= hole_start)
return 0;
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
return err;
@@ -3563,7 +3528,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry
*dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
goto fail;
@@ -3626,7 +3591,7 @@ static int btrfs_create(struct inode *dir, struct dentry
*dentry,
u64 objectid;
u64 index = 0;
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
goto fail;
trans = btrfs_start_transaction(root, 1);
@@ -3694,7 +3659,7 @@ static int btrfs_link(struct dentry *old_dentry, struct
inode *dir,
return -ENOENT;
btrfs_inc_nlink(inode);
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
goto fail;
err = btrfs_set_inode_index(dir, &index);
@@ -3740,7 +3705,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry
*dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
goto out_unlock;
@@ -4297,7 +4262,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct
page *page)
u64 page_start;
u64 page_end;
- ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+ ret = btrfs_delalloc_reserve_space(root, inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
@@ -4310,6 +4275,7 @@ again:
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
+ btrfs_delalloc_free_space(root, inode, PAGE_CACHE_SIZE);
/* page got truncated out from underneath us */
goto out_unlock;
}
@@ -4332,6 +4298,8 @@ again:
}
btrfs_set_extent_delalloc(inode, page_start, page_end);
+ btrfs_delalloc_free_space(root, inode, PAGE_CACHE_SIZE -
+ (page_end - page_start + 1));
ret = 0;
/* page is wholly or partially inside EOF */
@@ -4592,7 +4560,7 @@ static int btrfs_rename(struct inode *old_dir, struct
dentry *old_dentry,
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -EXDEV;
- ret = btrfs_check_free_space(root, 1, 0);
+ ret = btrfs_check_metadata_free_space(root);
if (ret)
goto out_unlock;
@@ -4710,7 +4678,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry
*dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_check_free_space(root, 1, 0);
+ err = btrfs_check_metadata_free_space(root);
if (err)
goto out_fail;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 988fdc8..d2ac10f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
unsigned long nr = 1;
- ret = btrfs_check_free_space(root, 1, 0);
+ ret = btrfs_check_metadata_free_space(root);
if (ret)
goto fail_commit;
@@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct
dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- ret = btrfs_check_free_space(root, 1, 0);
+ ret = btrfs_check_metadata_free_space(root);
if (ret)
goto fail_unlock;
@@ -371,10 +371,11 @@ static int btrfs_defrag_file(struct file *file)
unsigned long total_read = 0;
u64 page_start;
u64 page_end;
+ u64 size = inode->i_size;
unsigned long i;
int ret;
- ret = btrfs_check_free_space(root, inode->i_size, 0);
+ ret = btrfs_delalloc_reserve_space(root, inode, size);
if (ret)
return -ENOSPC;
@@ -388,14 +389,18 @@ static int btrfs_defrag_file(struct file *file)
total_read++;
again:
page = grab_cache_page(inode->i_mapping, i);
- if (!page)
+ if (!page) {
+ btrfs_delalloc_free_space(root, inode, size);
goto out_unlock;
+ }
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ btrfs_delalloc_free_space(root, inode, size);
goto out_unlock;
}
}
@@ -424,6 +429,9 @@ again:
clear_page_dirty_for_io(page);
btrfs_set_extent_delalloc(inode, page_start, page_end);
+ btrfs_delalloc_free_space(root, inode, size -
+ (page_end - page_start + 1));
+ size -= page_end - page_start + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
set_page_dirty(page);
--
1.5.4.3
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html