Hi Chris, These ioctls let a user application hold a transaction open while it performs a series of operations. A final ioctl does a sync on the fs (closing the current transaction). This is the main requirement for Ceph''s OSD to be able to keep the data it''s storing in a btrfs volume consistent, and AFAICS it works just fine. The application would do something like fd = ::open("some/file", O_RDONLY); ::ioctl(fd, BTRFS_IOC_TRANS_START); /* do a bunch of stuff */ ::ioctl(fd, BTRFS_IOC_TRANS_END); or just ::close(fd); And to ensure it commits to disk, ::ioctl(fd, BTRFS_IOC_SYNC); When a transaction is held open, the trans_handle is attached to the struct file (via private_data) so that it will get cleaned up if the process dies unexpectedly. A held transaction is also ended on fsync() to avoid a deadlock. There may be other places I missed? A misbehaving application could also deliberately hold a transaction open, effectively locking up the FS, so it may make sense to restrict something like this to root or something. Anyway, does this look reasonable? Thanks- sage diff -r e4cd88595ed7 ctree.h --- a/ctree.h Thu Feb 21 14:54:12 2008 -0500 +++ b/ctree.h Tue Apr 22 12:08:53 2008 -0700 @@ -1179,6 +1179,7 @@ void btrfs_destroy_inode(struct inode *i void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); @@ -1199,6 +1200,8 @@ int btrfs_drop_extents(struct btrfs_tran int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_file_release(struct inode *inode, struct file *file); + /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only); @@ -1217,4 +1220,5 @@ int btrfs_delete_xattrs(struct btrfs_tra struct btrfs_root *root, struct inode *inode); /* super.c */ u64 btrfs_parse_size(char *str); +int btrfs_sync_fs(struct super_block *sb, int wait); #endif diff -r e4cd88595ed7 file.c --- a/file.c Thu Feb 21 14:54:12 2008 -0500 +++ b/file.c Tue Apr 22 12:08:53 2008 -0700 @@ -941,6 +941,8 @@ static int btrfs_sync_file(struct file * /* * ok we haven''t committed the transaction yet, lets do a commit */ + if (file->private_data) + btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); if (!trans) { ret = -ENOMEM; @@ -950,6 +952,13 @@ out: out: mutex_unlock(&root->fs_info->fs_mutex); return ret > 0 ? EIO : ret; +} + +int btrfs_file_release(struct inode *inode, struct file *file) +{ + if (file->private_data) + btrfs_ioctl_trans_end(file); + return 0; } static struct vm_operations_struct btrfs_file_vm_ops = { @@ -980,6 +989,7 @@ struct file_operations btrfs_file_operat .write = btrfs_file_write, .mmap = btrfs_file_mmap, .open = generic_file_open, + .release = btrfs_file_release, .fsync = btrfs_sync_file, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT diff -r e4cd88595ed7 inode.c --- a/inode.c Thu Feb 21 14:54:12 2008 -0500 +++ b/inode.c Tue Apr 22 12:08:53 2008 -0700 @@ -2690,6 +2690,42 @@ static int btrfs_ioctl_defrag(struct fil return 0; } +long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (file->private_data) + return -EINPROGRESS; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = file->private_data; + + if (!trans) + return -EINVAL; + /*printk(KERN_INFO "btrfs_ioctl_trans_end on %p\n", file);*/ + mutex_lock(&root->fs_info->fs_mutex); + btrfs_end_transaction(trans, root); + file->private_data = 0; + mutex_unlock(&root->fs_info->fs_mutex); + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2702,6 +2738,13 @@ long btrfs_ioctl(struct file *file, unsi return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(root, (void __user *)arg); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; } return -ENOTTY; @@ -3019,6 +3062,7 @@ static struct file_operations btrfs_dir_ #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif + .release = btrfs_file_release, }; static struct extent_io_ops btrfs_extent_io_ops = { diff -r e4cd88595ed7 ioctl.h --- a/ioctl.h Thu Feb 21 14:54:12 2008 -0500 +++ b/ioctl.h Tue Apr 22 12:08:53 2008 -0700 @@ -32,4 +32,9 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) #endif + diff -r e4cd88595ed7 super.c --- a/super.c Thu Feb 21 14:54:12 2008 -0500 +++ b/super.c Tue Apr 22 12:08:53 2008 -0700 @@ -284,7 +284,7 @@ fail_close: return err; } -static int btrfs_sync_fs(struct super_block *sb, int wait) +int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; struct btrfs_root *root; -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chris, These ioctls let a user application hold a transaction open while it performs a series of operations. A final ioctl does a sync on the fs (closing the current transaction). This is the main requirement for Ceph's OSD to be able to keep the data it's storing in a btrfs volume consistent, and AFAICS it works just fine. The application would do something like fd = ::open("some/file", O_RDONLY); ::ioctl(fd, BTRFS_IOC_TRANS_START); /* do a bunch of stuff */ ::ioctl(fd, BTRFS_IOC_TRANS_END); or just ::close(fd); And to ensure it commits to disk, ::ioctl(fd, BTRFS_IOC_SYNC); When a transaction is held open, the trans_handle is attached to the struct file (via private_data) so that it will get cleaned up if the process dies unexpectedly. A held transaction is also ended on fsync() to avoid a deadlock. There may be other places I missed? A misbehaving application could also deliberately hold a transaction open, effectively locking up the FS, so it may make sense to restrict something like this to root or something. Anyway, does this look reasonable? Thanks- sage diff -r e4cd88595ed7 ctree.h --- a/ctree.h Thu Feb 21 14:54:12 2008 -0500 +++ b/ctree.h Tue Apr 22 12:08:53 2008 -0700 @@ -1179,6 +1179,7 @@ void btrfs_destroy_inode(struct inode *i void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); @@ -1199,6 +1200,8 @@ int btrfs_drop_extents(struct btrfs_tran int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_file_release(struct inode *inode, struct file *file); + /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root, int cache_only); @@ -1217,4 +1220,5 @@ int btrfs_delete_xattrs(struct btrfs_tra struct btrfs_root *root, struct inode *inode); /* super.c */ u64 btrfs_parse_size(char *str); +int btrfs_sync_fs(struct super_block *sb, int wait); #endif diff -r e4cd88595ed7 file.c --- a/file.c Thu Feb 21 14:54:12 2008 -0500 +++ b/file.c Tue Apr 22 12:08:53 2008 -0700 @@ -941,6 +941,8 @@ static int btrfs_sync_file(struct file * /* * ok we haven't committed the transaction yet, lets do a commit */ + if (file->private_data) + btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); if (!trans) { ret = -ENOMEM; @@ -950,6 +952,13 @@ out: out: mutex_unlock(&root->fs_info->fs_mutex); return ret > 0 ? EIO : ret; +} + +int btrfs_file_release(struct inode *inode, struct file *file) +{ + if (file->private_data) + btrfs_ioctl_trans_end(file); + return 0; } static struct vm_operations_struct btrfs_file_vm_ops = { @@ -980,6 +989,7 @@ struct file_operations btrfs_file_operat .write = btrfs_file_write, .mmap = btrfs_file_mmap, .open = generic_file_open, + .release = btrfs_file_release, .fsync = btrfs_sync_file, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT diff -r e4cd88595ed7 inode.c --- a/inode.c Thu Feb 21 14:54:12 2008 -0500 +++ b/inode.c Tue Apr 22 12:08:53 2008 -0700 @@ -2690,6 +2690,42 @@ static int btrfs_ioctl_defrag(struct fil return 0; } +long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (file->private_data) + return -EINPROGRESS; + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + mutex_unlock(&root->fs_info->fs_mutex); + return ret; +} + +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans = file->private_data; + + if (!trans) + return -EINVAL; + /*printk(KERN_INFO "btrfs_ioctl_trans_end on %p\n", file);*/ + mutex_lock(&root->fs_info->fs_mutex); + btrfs_end_transaction(trans, root); + file->private_data = 0; + mutex_unlock(&root->fs_info->fs_mutex); + return 0; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2702,6 +2738,13 @@ long btrfs_ioctl(struct file *file, unsi return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(root, (void __user *)arg); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; } return -ENOTTY; @@ -3019,6 +3062,7 @@ static struct file_operations btrfs_dir_ #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif + .release = btrfs_file_release, }; static struct extent_io_ops btrfs_extent_io_ops = { diff -r e4cd88595ed7 ioctl.h --- a/ioctl.h Thu Feb 21 14:54:12 2008 -0500 +++ b/ioctl.h Tue Apr 22 12:08:53 2008 -0700 @@ -32,4 +32,9 @@ struct btrfs_ioctl_vol_args { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ struct btrfs_ioctl_vol_args) + +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) #endif + diff -r e4cd88595ed7 super.c --- a/super.c Thu Feb 21 14:54:12 2008 -0500 +++ b/super.c Tue Apr 22 12:08:53 2008 -0700 @@ -284,7 +284,7 @@ fail_close: return err; } -static int btrfs_sync_fs(struct super_block *sb, int wait) +int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; struct btrfs_root *root;
> A misbehaving application could also deliberately hold a transaction open, > effectively locking up the FS, so it may make sense to restrict something > like this to root or something.I suspect it doesn't have to be deliberate. Have you tried this under memory pressure? I wonder if the application can get stuck waiting for memory which will only be freed once the transaction closes. I'm reasonably sure that we've discussed this persistent theoretical problem with these kinds of interfaces ;). - z
On Tuesday 22 April 2008, Sage Weil wrote:> Hi Chris, > > These ioctls let a user application hold a transaction open while it > performs a series of operations. A final ioctl does a sync on the fs > (closing the current transaction). This is the main requirement for > Ceph's OSD to be able to keep the data it's storing in a btrfs volume > consistent, and AFAICS it works just fine. The application would do > something like >I'm definitely willing to include it for you to experiment with. Holding a transaction from userland can indeed lead to deadlock, but in this case your userland basically owns the server anyway. I'm worried about some nasty corner cases still, but btrfs is blissfully ignoring those right now anyway. One problem will be operations that are basically boundless (truncating a file, large writes). Eventually the ENOSPC support will hook into the transaction system to make sure a given operation reserves enough free space. With your ioctls, the "do a bunch of stuff" will need to honor the same accounting rules as the kernel code (which don't exist yet). I thought your original plan was to do all of this from userland (without a kernel filesystem at all)? The btrfs progs share most of the same code with the kernel, so with a little love to the transaction and IO subsystems, you'd be able to use it as a library style DB. -chris