Hi- I'm working on a clone ioctl that will quickly and efficiently duplicate the contents of a file, e.g. int main(int argc, const char **argv) { int in = open(argv[1], O_RDONLY); int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644); ioctl(out, BTRFS_IOC_CLONE, in); close(in); close(out); return 0; } I've probably got the locking order a bit wrong, lots of error handling is missing, and I suspect there's a cleaner way to do the target inode size update, but it behaves well enough in my (limited :) testing. Oh, and I wasn't certain the 'offset' in file_extent_item could be safely ignored when duplicating the extent reference. My assumption was that it is orthogonal to extent allocation and isn't related to the backref. However, btrfs_insert_file_extent() always set offset=0. I'm guessing I need to add an argument there and fix up the other callers? Anyway, any comments or suggestions (on the interface or implemantation) are welcome.. :) sage diff -r 1791a620d509 inode.c --- a/inode.c Thu Apr 24 13:43:27 2008 -0700 +++ b/inode.c Thu Apr 24 15:10:17 2008 -0700 @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/buffer_head.h> +#include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> @@ -2726,6 +2727,158 @@ long btrfs_ioctl_trans_end(struct file * return 0; } +void dup_item_to_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf, + int slot, + struct btrfs_key *key, + u64 destino) +{ + struct btrfs_path *cpath = btrfs_alloc_path(); + int len = btrfs_item_size_nr(leaf, slot); + int dstoff; + struct btrfs_key ckey = *key; + int ret; + + ckey.objectid = destino; + ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len); + dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]); + copy_extent_buffer(cpath->nodes[0], leaf, dstoff, + btrfs_item_ptr_offset(leaf, slot), + len); + btrfs_release_path(root, cpath); +} + +long btrfs_ioctl_clone(struct file *file, unsigned long src_fd) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + int ret; + u64 pos; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u32 nritems; + int nextret; + int slot; + + src_file = fget(src_fd); + if (!src_file) + return -EBADF; + src = src_file->f_dentry->d_inode; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + ret = -ENOTEMPTY; + if (inode->i_size) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or another */ + filemap_write_and_wait(src->i_mapping); + + mutex_lock(&root->fs_info->fs_mutex); + trans = btrfs_start_transaction(root, 0); + path = btrfs_alloc_path(); + pos = 0; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino, + pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } + next_slot: + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + printk("key(%llu %x %llu)\n", + key.objectid, key.type, key.offset); + if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY || + key.objectid != src->i_ino) + goto out; + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int found_type; + u64 len; + pos = key.offset; + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + len = btrfs_file_extent_num_bytes(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + u64 ds = btrfs_file_extent_disk_bytenr(leaf, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + printk(" %llu~%llu disk %llu~%llu off %llu\n", + pos, len, ds, dl, + btrfs_file_extent_offset(leaf, extent)); + btrfs_insert_file_extent(trans, root, + inode->i_ino, pos, + ds, dl, len); + btrfs_inc_extent_ref(trans, root, ds, dl, + root->root_key.objectid, + trans->transid, + inode->i_ino, pos); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) + dup_item_to_inode(trans, root, path, leaf, slot, + &key, inode->i_ino); + pos = key.offset + len; + } else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) + dup_item_to_inode(trans, root, path, leaf, slot, &key, + inode->i_ino); + + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + } else { + path->slots[0]++; + } + goto next_slot; + } + +out: + ret = 0; + mutex_unlock(&root->fs_info->fs_mutex); + + i_size_write(inode, src->i_size); + inode->i_blocks = src->i_blocks; + mark_inode_dirty(inode); + + mutex_lock(&root->fs_info->fs_mutex); + btrfs_end_transaction(trans, root); + mutex_unlock(&root->fs_info->fs_mutex); + +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); +out_fput: + fput(src_file); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2744,6 +2897,9 @@ long btrfs_ioctl(struct file *file, unsi return btrfs_ioctl_trans_end(file); case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + case BTRFS_IOC_CLONE: + btrfs_ioctl_clone(file, arg); return 0; } diff -r 1791a620d509 ioctl.h --- a/ioctl.h Thu Apr 24 13:43:27 2008 -0700 +++ b/ioctl.h Thu Apr 24 15:10:17 2008 -0700 @@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args { #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) #endif
On Thursday 24 April 2008, Sage Weil wrote:> Hi- > > I'm working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g.Very cool. I'd actually loved to see this wrapped into a program that will cow a directory tree. Basically the same as cp -al, but with cow instead of linking.> > int main(int argc, const char **argv) > { > int in = open(argv[1], O_RDONLY); > int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644); > ioctl(out, BTRFS_IOC_CLONE, in); > close(in); > close(out); > return 0; > } > > I've probably got the locking order a bit wrong, lots of error handling is > missing, and I suspect there's a cleaner way to do the target inode size > update, but it behaves well enough in my (limited :) testing. > > Oh, and I wasn't certain the 'offset' in file_extent_item could be safely > ignored when duplicating the extent reference. My assumption was that it > is orthogonal to extent allocation and isn't related to the backref. > However, btrfs_insert_file_extent() always set offset=0. I'm guessing I > need to add an argument there and fix up the other callers? >Yes, you need to preserve the offset. There's only one place right now that sets a non-zero offset and it inserts the extent by hand for other reasons (if you're brave, file.c:btrfs_drop_extents) The reason file extents have an offset field is to allow COW without read/modify/write. Picture something like this: # create a single 100MB extent in file foo dd if=/dev/zero of=foo bs=1M count=100 sync # write into the middle dd if=/dev/zero of=foo bs=4k count=1 seek=100 conv=notrunc sync We've written into the middle of that 100MB extent, and we need to do COW. One option is to read the whole thing, change 4k and write it all back. Instead, btrfs does something like this (+/- off by need more coffee errors): file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ] file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ] file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k] An extra reference is taken on the old extent to reflect that we're pointing to it twice.> Anyway, any comments or suggestions (on the interface or implemantation) > are welcome.. :) >By taking the inode mutex, you protect against file_write and truncates changing the file. But, we also need to prevent mmaps from changing the file pages as well. What you want to do lock all the file bytes in the extent tree: lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS); But unfortunately, the code to fill delayed allocation takes that same lock. So you need to loop a bit: while(1) { filemap_write_and_wait(src_inode); lock_extent() if (BTRFS_I(src_inode)->delalloc_bytes == 0) break; unlock_extent() } That should keep you from racing with btrfs_page_mkwrite() -chris
On Thursday 24 April 2008, Sage Weil wrote:> Hi- > > I'm working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g. >Just FYI, I didn't sneak this into v0.14 because I didn't quite have the cycles to test it. It'll go in this week. -chris
On Thursday 24 April 2008, Sage Weil wrote:> Hi- > > I''m working on a clone ioctl that will quickly and efficiently duplicate > the contents of a file, e.g.Sage''s work has been pushed into the stable and unstable trees, along with a small command called bcp to trigger the clone ioctls. bcp is used like this: bcp src dst If src is a directory, it is copied recursively. If the clone ioctl fails, a fallback to buffer copies is done instead. Sage, I had to make a few small changes to your ioctl code. One was to skip reference count updates if the extent is a hole, and the other was to change around mark_inode_dirty a bit to avoid transaction deadlock. We aren''t actually making any pages dirty so it is safe to just update the inode. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2 May 2008, Chris Mason wrote:> Sage''s work has been pushed into the stable and unstable trees, along with aThanks! For the transaction ioctls... would it make more sense to specify a string of ops to apply atomically in a vector instead of exposing the raw transaction start/end to userspace? Would that avoid the deadlock and memory issues? sage -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday 21 May 2008, Mingming wrote:> Hi Chris, > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > function is removed from super_operations. > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > from the tree for ordered mode transaction. Any idea where to relocate > this function? Do we need to call btrfs_del_ordered_inode() for every > iput()?The ordered inode list has incremented the inode->i_count, so we can''t do the iput code inside clear_inode. But, we should be able to safely move the checks to file_release. This is basically an optimization so that we don''t leave clean inodes hanging around on the data=ordered list. It is safe to remove the put_inode call completely, it''ll just be a little slower. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
try this patch out, http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch it should apply to 2.6.26-rc*... probably even 2.6.25 im btrfs with 2.6.26-rc3 with no issues On Wed, May 21, 2008 at 1:19 PM, Mingming <cmm@us.ibm.com> wrote:> Hi Chris, > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > function is removed from super_operations. > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > from the tree for ordered mode transaction. Any idea where to relocate > this function? Do we need to call btrfs_del_ordered_inode() for every > iput()? > > Thanks, > Mingming > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday 21 May 2008, Ryan Hope wrote:> try this patch out, > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patchIt is a little hard to tell from the big long patch what the incremental difference is ;) -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Mingming wrote: > > Hi Chris, > > > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > > function is removed from super_operations. > > > > I noticed btrfs_put_inode() seems being actively used, to drop the inode > > from the tree for ordered mode transaction. Any idea where to relocate > > this function? Do we need to call btrfs_del_ordered_inode() for every > > iput()? > > The ordered inode list has incremented the inode->i_count, so we can''t do the > iput code inside clear_inode. But, we should be able to safely move the > checks to file_release. >Ah okay. file release is called on the last file_close() to this inode. But I thought NFS could grab the inode without thr file_open/file_close, so we might also need to delete the inode from the ordered tree at delete_inode time, like ext3?> This is basically an optimization so that we don''t leave clean inodes hanging > around on the data=ordered list. It is safe to remove the put_inode call > completely, it''ll just be a little slower. > > -chris-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday 21 May 2008, Mingming wrote:> On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote: > > On Wednesday 21 May 2008, Mingming wrote: > > > Hi Chris, > > > > > > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback > > > function is removed from super_operations. > > > > > > I noticed btrfs_put_inode() seems being actively used, to drop the > > > inode from the tree for ordered mode transaction. Any idea where to > > > relocate this function? Do we need to call btrfs_del_ordered_inode() > > > for every iput()? > > > > The ordered inode list has incremented the inode->i_count, so we can''t do > > the iput code inside clear_inode. But, we should be able to safely move > > the checks to file_release. > > Ah okay. file release is called on the last file_close() to this inode. > But I thought NFS could grab the inode without thr file_open/file_close, > so we might also need to delete the inode from the ordered tree at > delete_inode time, like ext3?I do it right now in unlink, but delete_inode is fine too. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2008-05-21 at 14:32 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Ryan Hope wrote: > > try this patch out, > > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch > > It is a little hard to tell from the big long patch what the incremental > difference is ;) >Looks like just break the hook up in super_operations:-)> -chris-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Chris, I thought I spotted a few bugs, While looking at how to properly remove inode from ordered tree, let me know if I got it right. * inode->i_count accounting issue - remove extra i_count_decrease after calling filemap_file_write_and_wait() in btrfs_write_ordered_inodes() - Remove extra i_count decrease after each call btrfs_del_ordered_inode(), it decrease the i_count again. I couldn''t find the where the i_count being increased. The tree_lock should protecting it races with btrfs_write_ordered_inodes(), no? * There is possible race with inode delete and btrfs_find_first_ordered_inode(). The inode could possibly in the process of freeing while we are trying to get hold of it during commit transaction. The fix is using igrab() instead, and search for next inode in the tree if the found one is in the middle of being released. * get rid of btrfs_put_inode(), and move the functionality under the btrfs_del_ordered_inode() directly. * Remove the inode from ordered tree at last iput(). Did not do it at file release() time, as it may remove the inode from the ordered tree before ensure the ordering of write to the same inode from other process. Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but it would not be hurt to do it again at delete_inode() time. Here is the patch trying to address above issues... compiled, but not tested. Is ordered mode on by default? Regards, Mingming diff -r c3290d51e5f9 inode.c --- a/inode.c Fri May 16 13:30:15 2008 -0400 +++ b/inode.c Wed May 21 14:51:22 2008 -0700 @@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di nr = trans->blocks_used; if (inode->i_nlink == 0) { - int found; /* if the inode isn''t linked anywhere, * we don''t need to worry about * data=ordered */ - found = btrfs_del_ordered_inode(inode); - if (found == 1) { - atomic_dec(&inode->i_count); - } + btrfs_del_ordered_inode(inode); } btrfs_end_transaction(trans, root); @@ -1271,24 +1267,6 @@ fail: return err; } -void btrfs_put_inode(struct inode *inode) -{ - int ret; - - if (!BTRFS_I(inode)->ordered_trans) { - return; - } - - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) - return; - - ret = btrfs_del_ordered_inode(inode); - if (ret == 1) { - atomic_dec(&inode->i_count); - } -} - void btrfs_delete_inode(struct inode *inode) { struct btrfs_trans_handle *trans; @@ -1311,6 +1289,7 @@ void btrfs_delete_inode(struct inode *in goto no_delete_lock; nr = trans->blocks_used; + btrfs_del_ordered_inode(inode); clear_inode(inode); btrfs_end_transaction(trans, root); diff -r c3290d51e5f9 ordered-data.c --- a/ordered-data.c Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.c Wed May 21 14:55:25 2008 -0700 @@ -166,7 +166,7 @@ int btrfs_find_first_ordered_inode(struc struct inode **inode) { struct tree_entry *entry; - struct rb_node *node; + struct rb_node *node, *next; write_lock(&tree->lock); node = tree_search(&tree->tree, *root_objectid, *objectid); @@ -174,6 +174,7 @@ int btrfs_find_first_ordered_inode(struc write_unlock(&tree->lock); return 0; } +again: entry = rb_entry(node, struct tree_entry, rb_node); while(comp_entry(entry, *root_objectid, *objectid) >= 0) { @@ -187,9 +188,20 @@ int btrfs_find_first_ordered_inode(struc return 0; } + *inode = igrab(entry->inode); + if (!*inode) { + next = rb_next(node); + rb_erase(node, &tree->tree); + kfree(entry); + if (!next) { + write_unlock(&tree->lock); + return 0; + } + node = next; + goto again; + } + *root_objectid = entry->root_objectid; - *inode = entry->inode; - atomic_inc(&entry->inode->i_count); *objectid = entry->objectid; write_unlock(&tree->lock); return 1; @@ -200,7 +212,7 @@ int btrfs_find_del_first_ordered_inode(s struct inode **inode) { struct tree_entry *entry; - struct rb_node *node; + struct rb_node *node, *next; write_lock(&tree->lock); node = tree_search(&tree->tree, *root_objectid, *objectid); @@ -208,7 +220,7 @@ int btrfs_find_del_first_ordered_inode(s write_unlock(&tree->lock); return 0; } - +retry: entry = rb_entry(node, struct tree_entry, rb_node); while(comp_entry(entry, *root_objectid, *objectid) >= 0) { node = rb_next(node); @@ -220,11 +232,21 @@ int btrfs_find_del_first_ordered_inode(s write_unlock(&tree->lock); return 0; } + *inode = igrab(entry->inode); + if (!*inode) { + next = rb_next(node); + rb_erase(node, &tree->tree); + kfree(entry); + if (!next) { + write_unlock(&tree->lock); + return 0; + } + node = next; + goto retry; + } *root_objectid = entry->root_objectid; *objectid = entry->objectid; - *inode = entry->inode; - atomic_inc(&entry->inode->i_count); rb_erase(node, &tree->tree); write_unlock(&tree->lock); kfree(entry); @@ -253,11 +275,19 @@ static int __btrfs_del_ordered_inode(str return 1; } -int btrfs_del_ordered_inode(struct inode *inode) +void btrfs_del_ordered_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 root_objectid = root->root_key.objectid; int ret = 0; + + if (!BTRFS_I(inode)->ordered_trans) { + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) + return; spin_lock(&root->fs_info->new_trans_lock); if (root->fs_info->running_transaction) { @@ -267,7 +297,7 @@ int btrfs_del_ordered_inode(struct inode inode->i_ino); } spin_unlock(&root->fs_info->new_trans_lock); - return ret; + return; } int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode) diff -r c3290d51e5f9 ordered-data.h --- a/ordered-data.h Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.h Wed May 21 14:56:49 2008 -0700 @@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, u64 *root_objectid, u64 *objectid, struct inode **inode); -int btrfs_del_ordered_inode(struct inode *inode); +void btrfs_del_ordered_inode(struct inode *inode); int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode); #endif diff -r c3290d51e5f9 super.c --- a/super.c Fri May 16 13:30:15 2008 -0400 +++ b/super.c Wed May 21 12:54:48 2008 -0700 @@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_ static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, - .put_inode = btrfs_put_inode, .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, diff -r c3290d51e5f9 transaction.c --- a/transaction.c Fri May 16 13:30:15 2008 -0400 +++ b/transaction.c Wed May 21 14:18:34 2008 -0700 @@ -538,7 +538,6 @@ int btrfs_write_ordered_inodes(struct bt filemap_write_and_wait(inode->i_mapping); atomic_dec(&BTRFS_I(inode)->ordered_writeback); } - atomic_dec(&inode->i_count); iput(inode); mutex_lock(&root->fs_info->fs_mutex); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday 21 May 2008, Mingming wrote:> Hi Chris, I thought I spotted a few bugs, While looking at how to > properly remove inode from ordered tree, let me know if I got it right.Hi Mingming, thanks for going through this code. The i_count rules in the current code should work like this: * btrfs_add_ordered_inode calls igrab when the inode is inserted into the list. The whole time the inode is on the list, there''s an extra reference on i_count. There will be no final iput while the inode is on the list. * btrfs_find_first_ordered_inode and btrfs_find_del_ordered_inode both increment i_count before returning the inode. This gives the caller an extra reference on i_count in addition to the one held by the list. * ordered-data.c never drops i_count. This allows (and forces) the caller to decide when it is safe to call iput because callers have different locks held and because the last iput might can in theory trigger delete_inode. All of the current callers of btrfs_del_ordered_inode already have a i_count increased before they call it. So, they can safely just do an atomic_dec on i_count and let their own iputs later do the delete_inode.> > * inode->i_count accounting issue > - remove extra i_count_decrease after calling > filemap_file_write_and_wait() in btrfs_write_ordered_inodes()There''s an atomic_dec of i_count to drop the reference from btrfs_find_del_first_ordered_inode, and an iput to drop the reference added by igrab when the inode was put on the list.> > - Remove extra i_count decrease after each call > btrfs_del_ordered_inode(), it decrease the i_count again. I couldn''t > find the where the i_count being increased. The tree_lock should > protecting it races with btrfs_write_ordered_inodes(), no?The increase comes from igrab on list insertion> > * There is possible race with inode delete and > btrfs_find_first_ordered_inode(). The inode could possibly in the > process of freeing while we are trying to get hold of it during commit > transaction. The fix is using igrab() instead, and search for next inode > in the tree if the found one is in the middle of being released.These kinds of races where the main reason why I had the list take a reference on the inode. delete_inode won''t be called while i_count is increased. Over the long term I''d prefer to move the ordered-data list to a model where the list doesn''t have a reference and it is magically removed after all the dirty pages are gone (by the end_io_hook handlers in inode.c). The end_io hooks in inode.c may be sufficient for this.> > * get rid of btrfs_put_inode(), and move the functionality under the > btrfs_del_ordered_inode() directly.I like this change, thanks.> > * Remove the inode from ordered tree at last iput(). Did not do it at > file release() time, as it may remove the inode from the ordered tree > before ensure the ordering of write to the same inode from other > process. > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > it would not be hurt to do it again at delete_inode() time.I''m afraid we''ll have to do it at file_release time, at least until the ordered list is changed not to keep a reference.> > > Here is the patch trying to address above issues... compiled, but not > tested. Is ordered mode on by default? >Yes, there''s actually no way to turn it off ;) -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Mingming wrote: > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > properly remove inode from ordered tree, let me know if I got it right. > > Hi Mingming, thanks for going through this code. The i_count rules in the > current code should work like this: >Hi Chris, thanks for your detailed clarification.> * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > list. The whole time the inode is on the list, there''s an extra reference on > i_count. There will be no final iput while the inode is on the list. >Ah I missed that. That explains all my confusion of the i_count accounting.> > > > * There is possible race with inode delete and > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > process of freeing while we are trying to get hold of it during commit > > transaction. The fix is using igrab() instead, and search for next inode > > in the tree if the found one is in the middle of being released. > > These kinds of races where the main reason why I had the list take a reference > on the inode. delete_inode won''t be called while i_count is increased. > > Over the long term I''d prefer to move the ordered-data list to a model where > the list doesn''t have a reference and it is magically removed after all the > dirty pages are gone (by the end_io_hook handlers in inode.c). The end_io > hooks in inode.c may be sufficient for this. >Make sense.> > > > * get rid of btrfs_put_inode(), and move the functionality under the > > btrfs_del_ordered_inode() directly. > > I like this change, thanks. > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > file release() time, as it may remove the inode from the ordered tree > > before ensure the ordering of write to the same inode from other > > process. > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > it would not be hurt to do it again at delete_inode() time. > > I''m afraid we''ll have to do it at file_release time, at least until the > ordered list is changed not to keep a reference. >Yes with the i_count logic delete_inode() is not the right place to call btrfs_del_ordered_inode. But I am still not quite sure whether it is safe to remove the inode from the ordered tree at the file_release() time. i.e. whether the dirty data already being flushed to disk at last file_close()/file_release() time and when two process open and write to the same inode ... Regards, Mingming -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thursday 22 May 2008, Mingming wrote:> On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote: > > On Wednesday 21 May 2008, Mingming wrote: > > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > > properly remove inode from ordered tree, let me know if I got it right. > > > > Hi Mingming, thanks for going through this code. The i_count rules in > > the current code should work like this: > > Hi Chris, thanks for your detailed clarification. > > > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > > list. The whole time the inode is on the list, there''s an extra > > reference on i_count. There will be no final iput while the inode is on > > the list. > > Ah I missed that. That explains all my confusion of the i_count accounting. > > > > * There is possible race with inode delete and > > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > > process of freeing while we are trying to get hold of it during commit > > > transaction. The fix is using igrab() instead, and search for next > > > inode in the tree if the found one is in the middle of being released. > > > > These kinds of races where the main reason why I had the list take a > > reference on the inode. delete_inode won''t be called while i_count is > > increased. > > > > Over the long term I''d prefer to move the ordered-data list to a model > > where the list doesn''t have a reference and it is magically removed after > > all the dirty pages are gone (by the end_io_hook handlers in inode.c). > > The end_io hooks in inode.c may be sufficient for this. > > Make sense. > > > > * get rid of btrfs_put_inode(), and move the functionality under the > > > btrfs_del_ordered_inode() directly. > > > > I like this change, thanks. > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > > file release() time, as it may remove the inode from the ordered tree > > > before ensure the ordering of write to the same inode from other > > > process. > > > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > > it would not be hurt to do it again at delete_inode() time. > > > > I''m afraid we''ll have to do it at file_release time, at least until the > > ordered list is changed not to keep a reference. > > Yes with the i_count logic delete_inode() is not the right place to call > btrfs_del_ordered_inode. > > But I am still not quite sure whether it is safe to remove the inode > from the ordered tree at the file_release() time. i.e. whether the dirty > data already being flushed to disk at last file_close()/file_release() > time and when two process open and write to the same inode ...I get around this by testing for dirty/writeback pages before removing the inode from the ordered list. If another writer allocates blocks to the file, it will be added back to the list. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2008-05-22 at 13:47 -0400, Chris Mason wrote:> On Thursday 22 May 2008, Mingming wrote: > > On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote: > > > On Wednesday 21 May 2008, Mingming wrote: > > > > Hi Chris, I thought I spotted a few bugs, While looking at how to > > > > properly remove inode from ordered tree, let me know if I got it right. > > > > > > Hi Mingming, thanks for going through this code. The i_count rules in > > > the current code should work like this: > > > > Hi Chris, thanks for your detailed clarification. > > > > > * btrfs_add_ordered_inode calls igrab when the inode is inserted into the > > > list. The whole time the inode is on the list, there''s an extra > > > reference on i_count. There will be no final iput while the inode is on > > > the list. > > > > Ah I missed that. That explains all my confusion of the i_count accounting. > > > > > > * There is possible race with inode delete and > > > > btrfs_find_first_ordered_inode(). The inode could possibly in the > > > > process of freeing while we are trying to get hold of it during commit > > > > transaction. The fix is using igrab() instead, and search for next > > > > inode in the tree if the found one is in the middle of being released. > > > > > > These kinds of races where the main reason why I had the list take a > > > reference on the inode. delete_inode won''t be called while i_count is > > > increased. > > > > > > Over the long term I''d prefer to move the ordered-data list to a model > > > where the list doesn''t have a reference and it is magically removed after > > > all the dirty pages are gone (by the end_io_hook handlers in inode.c). > > > The end_io hooks in inode.c may be sufficient for this. > > > > Make sense. > > > > > > * get rid of btrfs_put_inode(), and move the functionality under the > > > > btrfs_del_ordered_inode() directly. > > > > > > I like this change, thanks. > > > > > > > * Remove the inode from ordered tree at last iput(). Did not do it at > > > > file release() time, as it may remove the inode from the ordered tree > > > > before ensure the ordering of write to the same inode from other > > > > process. > > > > > > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but > > > > it would not be hurt to do it again at delete_inode() time. > > > > > > I''m afraid we''ll have to do it at file_release time, at least until the > > > ordered list is changed not to keep a reference. > > > > Yes with the i_count logic delete_inode() is not the right place to call > > btrfs_del_ordered_inode. > > > > But I am still not quite sure whether it is safe to remove the inode > > from the ordered tree at the file_release() time. i.e. whether the dirty > > data already being flushed to disk at last file_close()/file_release() > > time and when two process open and write to the same inode ... > > I get around this by testing for dirty/writeback pages before removing the > inode from the ordered list. If another writer allocates blocks to the file, > it will be added back to the list. >I see.:) How about patch below? Mingming diff -r c3290d51e5f9 file.c --- a/file.c Fri May 16 13:30:15 2008 -0400 +++ b/file.c Thu May 22 13:29:42 2008 -0700 @@ -978,6 +978,12 @@ out_nolock: return num_written ? num_written : err; } +static int btrfs_release_file (struct inode * inode, struct file * filp) +{ + btrfs_del_ordered_inode(inode); + return 0; +} + static int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) { @@ -1044,6 +1050,7 @@ struct file_operations btrfs_file_operat .write = btrfs_file_write, .mmap = btrfs_file_mmap, .open = generic_file_open, + .release = btrfs_release_file, .fsync = btrfs_sync_file, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT diff -r c3290d51e5f9 inode.c --- a/inode.c Fri May 16 13:30:15 2008 -0400 +++ b/inode.c Thu May 22 13:31:14 2008 -0700 @@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di nr = trans->blocks_used; if (inode->i_nlink == 0) { - int found; /* if the inode isn''t linked anywhere, * we don''t need to worry about * data=ordered */ - found = btrfs_del_ordered_inode(inode); - if (found == 1) { - atomic_dec(&inode->i_count); - } + btrfs_del_ordered_inode(inode); } btrfs_end_transaction(trans, root); @@ -1271,24 +1267,6 @@ fail: return err; } -void btrfs_put_inode(struct inode *inode) -{ - int ret; - - if (!BTRFS_I(inode)->ordered_trans) { - return; - } - - if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || - mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) - return; - - ret = btrfs_del_ordered_inode(inode); - if (ret == 1) { - atomic_dec(&inode->i_count); - } -} - void btrfs_delete_inode(struct inode *inode) { struct btrfs_trans_handle *trans; diff -r c3290d51e5f9 ordered-data.c --- a/ordered-data.c Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.c Thu May 22 13:24:56 2008 -0700 @@ -231,7 +231,7 @@ int btrfs_find_del_first_ordered_inode(s return 1; } -static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree, +static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree, struct inode *inode, u64 root_objectid, u64 objectid) { @@ -243,31 +243,38 @@ static int __btrfs_del_ordered_inode(str node = __tree_search(&tree->tree, root_objectid, objectid, &prev); if (!node) { write_unlock(&tree->lock); - return 0; + return; } rb_erase(node, &tree->tree); BTRFS_I(inode)->ordered_trans = 0; write_unlock(&tree->lock); + atomic_dec(&inode->i_count); entry = rb_entry(node, struct tree_entry, rb_node); kfree(entry); - return 1; -} - -int btrfs_del_ordered_inode(struct inode *inode) + return; +} + +void btrfs_del_ordered_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 root_objectid = root->root_key.objectid; - int ret = 0; + + if (!BTRFS_I(inode)->ordered_trans) { + return; + } + + if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)) + return; spin_lock(&root->fs_info->new_trans_lock); if (root->fs_info->running_transaction) { struct btrfs_ordered_inode_tree *tree; tree = &root->fs_info->running_transaction->ordered_inode_tree; - ret = __btrfs_del_ordered_inode(tree, inode, root_objectid, + __btrfs_del_ordered_inode(tree, inode, root_objectid, inode->i_ino); } spin_unlock(&root->fs_info->new_trans_lock); - return ret; } int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode) diff -r c3290d51e5f9 ordered-data.h --- a/ordered-data.h Fri May 16 13:30:15 2008 -0400 +++ b/ordered-data.h Thu May 22 13:25:09 2008 -0700 @@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree, u64 *root_objectid, u64 *objectid, struct inode **inode); -int btrfs_del_ordered_inode(struct inode *inode); +void btrfs_del_ordered_inode(struct inode *inode); int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode); #endif diff -r c3290d51e5f9 super.c --- a/super.c Fri May 16 13:30:15 2008 -0400 +++ b/super.c Thu May 22 13:30:44 2008 -0700 @@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_ static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, - .put_inode = btrfs_put_inode, .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thursday 22 May 2008, Mingming wrote: [ ... ]> > I get around this by testing for dirty/writeback pages before removing > > the inode from the ordered list. If another writer allocates blocks to > > the file, it will be added back to the list. > > I see.:) How about patch below?Looks good, I''ll give it a shot here. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html