Fix leak of vfsmount write reference and open_ioctl_trans reference on ENOMEM. Clean up the error paths while we''re at it. Signed-off-by: Sage Weil <sage@newdream.net> --- fs/btrfs/ioctl.c | 41 ++++++++++++++++++++++------------------- 1 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4de7ef6..9a780c8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1232,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; + int ret; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto out; - if (file->private_data) { - ret = -EINPROGRESS; + ret = -EINPROGRESS; + if (file->private_data) goto out; - } ret = mnt_want_write(file->f_path.mnt); if (ret) @@ -1250,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file) root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); + ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); out: return ret; } @@ -1271,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); + if (!trans) + return -EINVAL; file->private_data = NULL; + btrfs_end_transaction(trans, root); + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); mnt_drop_write(file->f_path.mnt); - -out: - return ret; + return 0; } long btrfs_ioctl(struct file *file, unsigned int -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Sage Weil
2009-Sep-29 18:50 UTC
[PATCH 2/4] Btrfs: fix deadlock with free space handling and user transactions
If an ioctl-initiated transaction is open, we can''t force a commit during the free space checks in order to free up pinned extents or else we deadlock. Just ENOSPC instead. A more satisfying solution that reserves space for the entire user transaction up front is forthcoming... Signed-off-by: Sage Weil <sage@newdream.net> --- fs/btrfs/extent-tree.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a4b2b03..d119c03 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3168,7 +3168,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Pull out the actual free space check, so setting an inode''s initial space_info is done by the caller instead of a wonky goto. Signed-off-by: Sage Weil <sage@newdream.net> --- fs/btrfs/extent-tree.c | 80 ++++++++++++++++++++++++++++------------------- 1 files changed, 48 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d119c03..c190bdb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3111,25 +3111,33 @@ again: return 0; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +static int __alloc_chunk(struct btrfs_root *root, u64 bytes) { - struct btrfs_space_info *data_sinfo; - int ret = 0, committed = 0; + int ret; + u64 alloc_target; + struct btrfs_trans_handle *trans; - /* make sure bytes are sectorsize aligned */ - bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; - data_sinfo = BTRFS_I(inode)->space_info; - if (!data_sinfo) - goto alloc; + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + return ret; +} + +int btrfs_check_data_free_space_info(struct btrfs_root *root, + struct btrfs_space_info *data_sinfo, + u64 bytes, + struct inode *inode) +{ + int ret = 0, committed = 0; -again: /* make sure we have enough space to handle the data first */ +again: spin_lock(&data_sinfo->lock); if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - @@ -3142,27 +3150,11 @@ again: * to alloc a new chunk. */ if (!data_sinfo->full) { - u64 alloc_target; - data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); -alloc: - alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - bytes + 2 * 1024 * 1024, - alloc_target, 0); - btrfs_end_transaction(trans, root); + ret = __alloc_chunk(root, bytes); if (ret) return ret; - - if (!data_sinfo) { - btrfs_set_inode_space_info(root, inode); - data_sinfo = BTRFS_I(inode)->space_info; - } goto again; } spin_unlock(&data_sinfo->lock); @@ -3195,11 +3187,35 @@ alloc: data_sinfo->bytes_may_use += bytes; BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return 0; } /* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) { + ret = __alloc_chunk(root, bytes); + if (ret) + return ret; + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } + + return btrfs_check_data_free_space_info(root, data_sinfo, bytes, inode); +} + +/* * if there was an error for whatever reason after calling * btrfs_check_data_free_space, call this so we can cleanup the counters. */ -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Sage Weil
2009-Sep-29 18:50 UTC
[PATCH 4/4] Btrfs: add TRANS_RESV_START ioctl to check/reserve free space on transaction start
The current TRANS_START ioctl is problematic when the volume fills up because a process can get ENOSPC on any operation that''s supoosed to be "contained" within its transaction without any prior warning. This defines a new ioctl, TRANS_RESV_START, that checks and sets aside space for the entire transaction, so that the process will get ENOSPC right away. The bytes_ioctl_trans_reserved is only checked and adjusted by the TRANS_RESV_START ioctl; any write()s that follow will be allowed to use up that reserved space. This is clearly imperfect (a mixture of an ioctl transaction workload and a regular workload will violate the reservations), but unavoidable with the current user transaction approach because we don''t know whether any given operation is contained by a user transaction or not. Signed-off-by: Sage Weil <sage@newdream.net> --- fs/btrfs/ctree.h | 7 ++++ fs/btrfs/extent-tree.c | 59 +++++++++++++++++++++++++++++-- fs/btrfs/ioctl.c | 92 +++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/ioctl.h | 6 +++ 4 files changed, 153 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b3959a1..14b7980 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,6 +682,9 @@ struct btrfs_space_info { u64 bytes_delalloc; /* number of bytes currently reserved for delayed allocation */ + u64 bytes_ioctl_trans_reserved; /* number of bytes reserved by ioctl + transactions */ + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for @@ -2036,6 +2039,10 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +int btrfs_reserve_ioctl_trans_free_space(struct btrfs_root *root, + u64 bytes, u64 ops); +void btrfs_unreserve_ioctl_trans_space(struct btrfs_root *root, + u64 bytes, u64 ops); void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c190bdb..d6316f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2670,6 +2670,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_delalloc = 0; + found->bytes_ioctl_trans_reserved = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -3129,6 +3130,14 @@ static int __alloc_chunk(struct btrfs_root *root, u64 bytes) return ret; } +/* + * if @inode is defined, reserve bytes on that inode. otherwise, we + * are reserving the space for an ioctl transaction, and need to + * subtract off other ioctl reserved space too. we _only_ check ioctl + * reserved space here: the write() that follows needs to be able to + * use it, and we don''t know which writes "belong" to which ioctl + * transactions. + */ int btrfs_check_data_free_space_info(struct btrfs_root *root, struct btrfs_space_info *data_sinfo, u64 bytes, @@ -3142,6 +3151,7 @@ again: if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + (inode ? 0 : data_sinfo->bytes_ioctl_trans_reserved) - data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { struct btrfs_trans_handle *trans; @@ -3160,7 +3170,8 @@ again: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed && !root->fs_info->open_ioctl_trans) { + if (!committed && + (!inode || !root->fs_info->open_ioctl_trans)) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) @@ -3184,8 +3195,12 @@ again: (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; + if (inode) { + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + } else { + data_sinfo->bytes_ioctl_trans_reserved += bytes; + } spin_unlock(&data_sinfo->lock); return 0; } @@ -3215,6 +3230,44 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, return btrfs_check_data_free_space_info(root, data_sinfo, bytes, inode); } +int btrfs_reserve_ioctl_trans_free_space(struct btrfs_root *root, u64 bytes, + u64 ops) +{ + struct btrfs_space_info *data_sinfo; + u64 alloc_target; + int ret; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes += calculate_bytes_needed(root, ops * 5); + + alloc_target = btrfs_get_alloc_profile(root, 1); + data_sinfo = __find_space_info(root->fs_info, alloc_target); + ret = btrfs_check_data_free_space_info(root, data_sinfo, bytes, NULL); + if (ret) + return ret; + return 0; +} + +void btrfs_unreserve_ioctl_trans_space(struct btrfs_root *root, + u64 bytes, u64 ops) +{ + struct btrfs_space_info *data_sinfo; + u64 alloc_target; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + bytes += calculate_bytes_needed(root, ops * 5); + + if (bytes) { + alloc_target = btrfs_get_alloc_profile(root, 1); + data_sinfo = __find_space_info(root->fs_info, alloc_target); + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_ioctl_trans_reserved -= bytes; + spin_unlock(&data_sinfo->lock); + } +} + /* * if there was an error for whatever reason after calling * btrfs_check_data_free_space, call this so we can cleanup the counters. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a780c8..dd4fde1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1227,11 +1227,76 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) * basically own the machine, and have a very in depth understanding * of all the possible deadlocks and enospc problems. */ +struct btrfs_ioctl_trans { + struct btrfs_trans_handle *trans; + u64 reserved_bytes, reserved_ops; +}; + +static long btrfs_ioctl_trans_resv_start(struct file *file, void __user *argp) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_ioctl_trans *ioctl_trans; + struct btrfs_ioctl_trans_resv_start resv; + int ret; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EINPROGRESS; + if (file->private_data) + goto out; + + ret = -EFAULT; + if (copy_from_user(&resv, argp, sizeof(resv))) + goto out; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + ret = btrfs_reserve_ioctl_trans_free_space(root, resv.bytes, resv.ops); + if (ret) + goto out_drop; + + ret = -ENOMEM; + ioctl_trans = kzalloc(sizeof(*ioctl_trans), GFP_KERNEL); + if (!ioctl_trans) + goto out_free_bytes; + + ioctl_trans->trans = btrfs_start_ioctl_transaction(root, 0); + if (!ioctl_trans->trans) + goto out_free_ioctl_trans; + + ioctl_trans->reserved_bytes = resv.bytes; + ioctl_trans->reserved_ops = resv.ops; + file->private_data = ioctl_trans; + return 0; + +out_free_ioctl_trans: + kfree(ioctl_trans); +out_free_bytes: + btrfs_unreserve_ioctl_trans_space(root, resv.bytes, resv.ops); +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); +out: + return ret; +} + + static long btrfs_ioctl_trans_start(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_ioctl_trans *ioctl_trans; int ret; ret = -EPERM; @@ -1251,13 +1316,19 @@ static long btrfs_ioctl_trans_start(struct file *file) mutex_unlock(&root->fs_info->trans_mutex); ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + ioctl_trans = kzalloc(sizeof(*ioctl_trans), GFP_KERNEL); + if (!ioctl_trans) goto out_drop; - file->private_data = trans; + ioctl_trans->trans = btrfs_start_ioctl_transaction(root, 0); + if (ioctl_trans->trans) + goto out_free_ioctl_trans; + + file->private_data = ioctl_trans; return 0; +out_free_ioctl_trans: + kfree(ioctl_trans); out_drop: mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; @@ -1277,14 +1348,17 @@ long btrfs_ioctl_trans_end(struct file *file) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_ioctl_trans *ioctl_trans; - trans = file->private_data; - if (!trans) + ioctl_trans = file->private_data; + if (!ioctl_trans) return -EINVAL; file->private_data = NULL; - btrfs_end_transaction(trans, root); + btrfs_unreserve_ioctl_trans_space(root, ioctl_trans->reserved_bytes, + ioctl_trans->reserved_ops); + btrfs_end_transaction(ioctl_trans->trans, root); + kfree(ioctl_trans); mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; @@ -1327,6 +1401,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_clone(file, arg, 0, 0, 0); case BTRFS_IOC_CLONE_RANGE: return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_RESV_START: + return btrfs_ioctl_trans_resv_start(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index bc49914..9810980 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -48,6 +48,12 @@ struct btrfs_ioctl_clone_range_args { * use by applications that know how to avoid the * resulting deadlocks */ +/* check/reserve disk space for the transaction up-front */ +struct btrfs_ioctl_trans_resv_start { + __u64 bytes, ops; +}; +#define BTRFS_IOC_TRANS_RESV_START _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_trans_resv_start) #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html