thr3ads.net - Btrfs devel - [Btrfs-devel] cloning file data [Apr 2008]

If this information is useful, please help other people find it:
Share via:

Sage Weil

2008-Apr-24 22:47 UTC

[Btrfs-devel] cloning file data

Hi-

I'm working on a clone ioctl that will quickly and efficiently duplicate 
the contents of a file, e.g.

int main(int argc, const char **argv)
{
  int in = open(argv[1], O_RDONLY);
  int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
  ioctl(out, BTRFS_IOC_CLONE, in);
  close(in);
  close(out);
  return 0;
}

I've probably got the locking order a bit wrong, lots of error handling is 
missing, and I suspect there's a cleaner way to do the target inode size 
update, but it behaves well enough in my (limited :) testing.  

Oh, and I wasn't certain the 'offset' in file_extent_item could be
safely
ignored when duplicating the extent reference.  My assumption was that it 
is orthogonal to extent allocation and isn't related to the backref.  
However, btrfs_insert_file_extent() always set offset=0.  I'm guessing I 
need to add an argument there and fix up the other callers?

Anyway, any comments or suggestions (on the interface or implemantation) 
are welcome.. :)

sage


diff -r 1791a620d509 inode.c
--- a/inode.c	Thu Apr 24 13:43:27 2008 -0700
+++ b/inode.c	Thu Apr 24 15:10:17 2008 -0700
@@ -18,6 +18,7 @@
 
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -2726,6 +2727,158 @@ long btrfs_ioctl_trans_end(struct file *
 	return 0;
 }
 
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	struct btrfs_path *cpath = btrfs_alloc_path();
+	int len = btrfs_item_size_nr(leaf, slot);
+	int dstoff;
+	struct btrfs_key ckey = *key;
+	int ret;
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+			   btrfs_item_ptr_offset(leaf, slot),
+			   len);
+	btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int nextret;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);	
+	}
+	
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or another */
+	filemap_write_and_wait(src->i_mapping);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	pos = 0;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+					       pos, 0);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;
+		}
+	next_slot:
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		printk("key(%llu %x %llu)\n",
+		       key.objectid, key.type, key.offset);
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			u64 len;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			len = btrfs_file_extent_num_bytes(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf, 
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf, 
+								 extent);
+				printk(" %llu~%llu disk %llu~%llu off %llu\n",
+				       pos, len, ds, dl,
+				       btrfs_file_extent_offset(leaf, extent));
+				btrfs_insert_file_extent(trans, root, 
+							 inode->i_ino, pos,
+							 ds, dl, len);
+				btrfs_inc_extent_ref(trans, root, ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE)
+				dup_item_to_inode(trans, root, path, leaf, slot,
+						  &key, inode->i_ino);
+			pos = key.offset + len;
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+			dup_item_to_inode(trans, root, path, leaf, slot, &key, 
+					  inode->i_ino);
+		
+		nritems = btrfs_header_nritems(leaf);
+		if (slot >= nritems - 1) {
+			nextret = btrfs_next_leaf(root, path);
+			if (nextret)
+				goto out;
+		} else {
+			path->slots[0]++;
+		}
+		goto next_slot;
+	}
+
+out:
+	ret = 0;
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+	i_size_write(inode, src->i_size);
+	inode->i_blocks = src->i_blocks;
+	mark_inode_dirty(inode);
+
+	mutex_lock(&root->fs_info->fs_mutex);	
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);	
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2744,6 +2897,9 @@ long btrfs_ioctl(struct file *file, unsi
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
+	case BTRFS_IOC_CLONE:
+		btrfs_ioctl_clone(file, arg);
 		return 0;
 	}
 
diff -r 1791a620d509 ioctl.h
--- a/ioctl.h	Thu Apr 24 13:43:27 2008 -0700
+++ b/ioctl.h	Thu Apr 24 15:10:17 2008 -0700
@@ -36,5 +36,6 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
 #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
 #define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #endif

Chris Mason

2008-Apr-25 13:41 UTC

head link

[Btrfs-devel] cloning file data

On Thursday 24 April 2008, Sage Weil wrote:> Hi-
>
> I'm working on a clone ioctl that will quickly and efficiently
duplicate
> the contents of a file, e.g.
Very cool.  I'd actually loved to see this wrapped into a program that will 
cow a directory tree.  Basically the same as cp -al, but with cow instead of 
linking.
>
> int main(int argc, const char **argv)
> {
>   int in = open(argv[1], O_RDONLY);
>   int out = open(argv[2], O_CREAT|O_TRUNC|O_WRONLY, 0644);
>   ioctl(out, BTRFS_IOC_CLONE, in);
>   close(in);
>   close(out);
>   return 0;
> }
>
> I've probably got the locking order a bit wrong, lots of error handling
is
> missing, and I suspect there's a cleaner way to do the target inode
size
> update, but it behaves well enough in my (limited :) testing.
>
> Oh, and I wasn't certain the 'offset' in file_extent_item could
be safely
> ignored when duplicating the extent reference.  My assumption was that it
> is orthogonal to extent allocation and isn't related to the backref.
> However, btrfs_insert_file_extent() always set offset=0.  I'm guessing
I
> need to add an argument there and fix up the other callers?
>Yes, you need to preserve the offset.  There's only one place right now that
sets a non-zero offset and it inserts the extent by hand for other reasons 
(if you're brave, file.c:btrfs_drop_extents)

The reason file extents have an offset field is to allow COW without 
read/modify/write.  Picture something like this:

# create a single 100MB extent in file foo
dd if=/dev/zero of=foo bs=1M count=100
sync

# write into the middle
dd if=/dev/zero of=foo bs=4k count=1 seek=100 conv=notrunc
sync

We've written into the middle of that 100MB extent, and we need to do COW.  
One option is to read the whole thing, change 4k and write it all back.  
Instead, btrfs does something like this (+/- off by need more coffee errors):

file pos = 0 -> [ old extent, offset = 0, num_bytes = 400k ]
file pos = 409600 -> [ new 4k extent, offset = 0, num_bytes = 4k ]
file pos = 413696 -> [ old extent, offset = 413696, num_bytes = 100MB - 404k]

An extra reference is taken on the old extent to reflect that we're pointing
to it twice.
> Anyway, any comments or suggestions (on the interface or implemantation)
> are welcome.. :)
>By taking the inode mutex, you protect against file_write and truncates 
changing the file.  But, we also need to prevent mmaps from changing the file 
pages as well.  What you want to do lock all the file bytes in the extent 
tree:

lock_extent(&BTRFS_I(src_inode)->io_tree, 0, (u64)-1, GFP_NOFS);

But unfortunately, the code to fill delayed allocation takes that same lock.  
So you need to loop a bit:

while(1) {
    filemap_write_and_wait(src_inode);
    lock_extent()
    if (BTRFS_I(src_inode)->delalloc_bytes == 0)
        break;
    unlock_extent()
}

That should keep you from racing with btrfs_page_mkwrite()

-chris

Chris Mason

2008-Apr-29 20:52 UTC

head link

[Btrfs-devel] cloning file data

On Thursday 24 April 2008, Sage Weil wrote:> Hi-
>
> I'm working on a clone ioctl that will quickly and efficiently
duplicate
> the contents of a file, e.g.
>
Just FYI, I didn't sneak this into v0.14 because I didn't quite have the
cycles to test it.  It'll go in this week.

-chris

Chris Mason

2008-May-02 20:50 UTC

head link

Re: [Btrfs-devel] cloning file data

On Thursday 24 April 2008, Sage Weil wrote:> Hi-
>
> I''m working on a clone ioctl that will quickly and efficiently
duplicate
> the contents of a file, e.g.
Sage''s work has been pushed into the stable and unstable trees, along
with a
small command called bcp to trigger the clone ioctls.  bcp is used like this:

bcp src dst

If src is a directory, it is copied recursively.  If the clone ioctl fails, a 
fallback to buffer copies is done instead.

Sage, I had to make a few small changes to your ioctl code.  One was to skip 
reference count updates if the extent is a hole, and the other was to change 
around mark_inode_dirty a bit to avoid transaction deadlock.  We aren''t
actually making any pages dirty so it is safe to just update the inode.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Sage Weil

2008-May-02 21:38 UTC

head link

Re: [Btrfs-devel] cloning file data

On Fri, 2 May 2008, Chris Mason wrote:> Sage''s work has been pushed into the stable and unstable trees,
along with a
Thanks!

For the transaction ioctls...  would it make more sense to specify a 
string of ops to apply atomically in a vector instead of exposing the raw 
transaction start/end to userspace?  Would that avoid the deadlock and 
memory issues?

sage
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-21 18:02 UTC

head link

Re: btrfs_put_inode

On Wednesday 21 May 2008, Mingming wrote:> Hi Chris,
>
> btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> function is removed from super_operations.
>
> I noticed btrfs_put_inode() seems being actively used, to drop the inode
> from the tree for ordered mode transaction. Any idea where to relocate
> this function? Do we need to call btrfs_del_ordered_inode() for every
> iput()?
The ordered inode list has incremented the inode->i_count, so we
can''t do the
iput code inside clear_inode.  But, we should be able to safely move the 
checks to file_release.

This is basically an optimization so that we don''t leave clean inodes
hanging
around on the data=ordered list.  It is safe to remove the put_inode call 
completely, it''ll just be a little slower.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Ryan Hope

2008-May-21 18:23 UTC

head link

Re: btrfs_put_inode

try this patch out,
http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch

it should apply to 2.6.26-rc*... probably even 2.6.25

im btrfs with 2.6.26-rc3 with no issues

On Wed, May 21, 2008 at 1:19 PM, Mingming <cmm@us.ibm.com>
wrote:> Hi Chris,
>
> btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> function is removed from super_operations.
>
> I noticed btrfs_put_inode() seems being actively used, to drop the inode
> from the tree for ordered mode transaction. Any idea where to relocate
> this function? Do we need to call btrfs_del_ordered_inode() for every
> iput()?
>
> Thanks,
> Mingming
>
> --
> To unsubscribe from this list: send the line "unsubscribe
linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-21 18:32 UTC

head link

Re: btrfs_put_inode

On Wednesday 21 May 2008, Ryan Hope wrote:> try this patch out,
> http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch
It is a little hard to tell from the big long patch what the incremental 
difference is ;)

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Mingming

2008-May-21 18:45 UTC

head link

Re: btrfs_put_inode

On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Mingming wrote:
> > Hi Chris,
> >
> > btrfs failed to compile on 2.6.26-rc2 since the put_inode callback
> > function is removed from super_operations.
> >
> > I noticed btrfs_put_inode() seems being actively used, to drop the
inode
> > from the tree for ordered mode transaction. Any idea where to relocate
> > this function? Do we need to call btrfs_del_ordered_inode() for every
> > iput()?
> 
> The ordered inode list has incremented the inode->i_count, so we
can''t do the
> iput code inside clear_inode.  But, we should be able to safely move the 
> checks to file_release.
> Ah okay. file release is called on the last file_close() to this inode.
But I thought NFS could grab the inode without thr file_open/file_close,
so we might also need to delete the inode from the ordered tree at
delete_inode time, like ext3?
> This is basically an optimization so that we don''t leave clean
inodes hanging
> around on the data=ordered list.  It is safe to remove the put_inode call 
> completely, it''ll just be a little slower.
> 
> -chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-21 18:52 UTC

head link

Re: btrfs_put_inode

On Wednesday 21 May 2008, Mingming wrote:> On Wed, 2008-05-21 at 14:02 -0400, Chris Mason wrote:
> > On Wednesday 21 May 2008, Mingming wrote:
> > > Hi Chris,
> > >
> > > btrfs failed to compile on 2.6.26-rc2 since the put_inode
callback
> > > function is removed from super_operations.
> > >
> > > I noticed btrfs_put_inode() seems being actively used, to drop
the
> > > inode from the tree for ordered mode transaction. Any idea where
to
> > > relocate this function? Do we need to call
btrfs_del_ordered_inode()
> > > for every iput()?
> >
> > The ordered inode list has incremented the inode->i_count, so we
can''t do
> > the iput code inside clear_inode.  But, we should be able to safely
move
> > the checks to file_release.
>
> Ah okay. file release is called on the last file_close() to this inode.
> But I thought NFS could grab the inode without thr file_open/file_close,
> so we might also need to delete the inode from the ordered tree at
> delete_inode time, like ext3?
I do it right now in unlink, but delete_inode is fine too.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Mingming

2008-May-21 19:02 UTC

head link

Re: btrfs_put_inode

On Wed, 2008-05-21 at 14:32 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Ryan Hope wrote:
> > try this patch out,
> > http://zen-sources.org/files/btrfs-stable-for-2.6.26-rc3.patch
> 
> It is a little hard to tell from the big long patch what the incremental 
> difference is ;)
> 
Looks like just break the hook up in super_operations:-)> -chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Mingming

2008-May-21 22:29 UTC

head link

[RFC][PATCH]btrfs delete ordered inode handling fix

Hi Chris,  I thought I spotted a few bugs, While looking at how to
properly remove inode from ordered tree, let me know if I got it right.

* inode->i_count accounting issue
- remove extra i_count_decrease after calling
filemap_file_write_and_wait() in btrfs_write_ordered_inodes()

- Remove extra i_count decrease after each call
btrfs_del_ordered_inode(), it decrease the i_count again. I couldn''t
find the where the i_count being increased. The tree_lock should
protecting it races with btrfs_write_ordered_inodes(), no? 

* There is possible race with inode delete and
btrfs_find_first_ordered_inode(). The inode could possibly in the
process of freeing while we are trying to get hold of it during commit
transaction. The fix is using igrab() instead, and search for next inode
in the tree if the found one is in the middle of being released.

* get rid of btrfs_put_inode(), and move the functionality under the
btrfs_del_ordered_inode() directly.

* Remove the inode from ordered tree at last iput(). Did not do it at
file release() time, as it may remove the inode from the ordered tree
before ensure the ordering of write to the same inode from other
process.

Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
it would not be hurt to do it again at delete_inode() time.


Here is the patch trying to address above issues... compiled, but not
tested. Is ordered mode on by default?

Regards,
Mingming

diff -r c3290d51e5f9 inode.c
--- a/inode.c	Fri May 16 13:30:15 2008 -0400
+++ b/inode.c	Wed May 21 14:51:22 2008 -0700
@@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di
 	nr = trans->blocks_used;
 
 	if (inode->i_nlink == 0) {
-		int found;
 		/* if the inode isn''t linked anywhere,
 		 * we don''t need to worry about
 		 * data=ordered
 		 */
-		found = btrfs_del_ordered_inode(inode);
-		if (found == 1) {
-			atomic_dec(&inode->i_count);
-		}
+		btrfs_del_ordered_inode(inode);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1271,24 +1267,6 @@ fail:
 	return err;
 }
 
-void btrfs_put_inode(struct inode *inode)
-{
-	int ret;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
-		return;
-
-	ret = btrfs_del_ordered_inode(inode);
-	if (ret == 1) {
-		atomic_dec(&inode->i_count);
-	}
-}
-
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
@@ -1311,6 +1289,7 @@ void btrfs_delete_inode(struct inode *in
 		goto no_delete_lock;
 
 	nr = trans->blocks_used;
+	btrfs_del_ordered_inode(inode);
 	clear_inode(inode);
 
 	btrfs_end_transaction(trans, root);
diff -r c3290d51e5f9 ordered-data.c
--- a/ordered-data.c	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.c	Wed May 21 14:55:25 2008 -0700
@@ -166,7 +166,7 @@ int btrfs_find_first_ordered_inode(struc
 				   struct inode **inode)
 {
 	struct tree_entry *entry;
-	struct rb_node *node;
+	struct rb_node *node, *next;
 
 	write_lock(&tree->lock);
 	node = tree_search(&tree->tree, *root_objectid, *objectid);
@@ -174,6 +174,7 @@ int btrfs_find_first_ordered_inode(struc
 		write_unlock(&tree->lock);
 		return 0;
 	}
+again:
 	entry = rb_entry(node, struct tree_entry, rb_node);
 
 	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
@@ -187,9 +188,20 @@ int btrfs_find_first_ordered_inode(struc
 		return 0;
 	}
 
+	*inode = igrab(entry->inode);
+	if (!*inode) {
+		next = rb_next(node);
+		rb_erase(node, &tree->tree);
+		kfree(entry);
+		if (!next) {
+			write_unlock(&tree->lock);
+			return 0;
+		}
+		node = next;
+		goto again;
+	}
+
 	*root_objectid = entry->root_objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
 	*objectid = entry->objectid;
 	write_unlock(&tree->lock);
 	return 1;
@@ -200,7 +212,7 @@ int btrfs_find_del_first_ordered_inode(s
 				       struct inode **inode)
 {
 	struct tree_entry *entry;
-	struct rb_node *node;
+	struct rb_node *node, *next;
 
 	write_lock(&tree->lock);
 	node = tree_search(&tree->tree, *root_objectid, *objectid);
@@ -208,7 +220,7 @@ int btrfs_find_del_first_ordered_inode(s
 		write_unlock(&tree->lock);
 		return 0;
 	}
-
+retry:
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
 		node = rb_next(node);
@@ -220,11 +232,21 @@ int btrfs_find_del_first_ordered_inode(s
 		write_unlock(&tree->lock);
 		return 0;
 	}
+	*inode = igrab(entry->inode);
+	if (!*inode) {
+		next = rb_next(node);
+		rb_erase(node, &tree->tree);
+		kfree(entry);
+		if (!next) {
+			write_unlock(&tree->lock);
+			return 0;
+		}
+		node = next;
+		goto retry;
+	}
 
 	*root_objectid = entry->root_objectid;
 	*objectid = entry->objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
 	rb_erase(node, &tree->tree);
 	write_unlock(&tree->lock);
 	kfree(entry);
@@ -253,11 +275,19 @@ static int __btrfs_del_ordered_inode(str
 	return 1;
 }
 
-int btrfs_del_ordered_inode(struct inode *inode)
+void btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
 	int ret = 0;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+		mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
@@ -267,7 +297,7 @@ int btrfs_del_ordered_inode(struct inode
 						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return ret;
+	return;
 }
 
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
diff -r c3290d51e5f9 ordered-data.h
--- a/ordered-data.h	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.h	Wed May 21 14:56:49 2008 -0700
@@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff -r c3290d51e5f9 super.c
--- a/super.c	Fri May 16 13:30:15 2008 -0400
+++ b/super.c	Wed May 21 12:54:48 2008 -0700
@@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
diff -r c3290d51e5f9 transaction.c
--- a/transaction.c	Fri May 16 13:30:15 2008 -0400
+++ b/transaction.c	Wed May 21 14:18:34 2008 -0700
@@ -538,7 +538,6 @@ int btrfs_write_ordered_inodes(struct bt
 			filemap_write_and_wait(inode->i_mapping);
 			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
 		}
-		atomic_dec(&inode->i_count);
 		iput(inode);
 
 		mutex_lock(&root->fs_info->fs_mutex);


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-22 14:11 UTC

head link

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

On Wednesday 21 May 2008, Mingming wrote:> Hi Chris,  I thought I spotted a few bugs, While looking at how to
> properly remove inode from ordered tree, let me know if I got it right.
Hi Mingming, thanks for going through this code.  The i_count rules in the 
current code should work like this:

* btrfs_add_ordered_inode calls igrab when the inode is inserted into the 
list.  The whole time the inode is on the list, there''s an extra
reference on
i_count.  There will be no final iput while the inode is on the list.

* btrfs_find_first_ordered_inode and btrfs_find_del_ordered_inode both 
increment i_count before returning the inode.  This gives the caller an extra 
reference on i_count in addition to the one held by the list.

* ordered-data.c never drops i_count.  This allows (and forces) the caller to 
decide when it is safe to call iput because callers have different locks held 
and because the last iput might can in theory trigger delete_inode.

All of the current callers of btrfs_del_ordered_inode already have a i_count 
increased before they call it.  So, they can safely just do an atomic_dec on 
i_count and let their own iputs later do the delete_inode.
>
> * inode->i_count accounting issue
> - remove extra i_count_decrease after calling
> filemap_file_write_and_wait() in btrfs_write_ordered_inodes()
There''s an atomic_dec of i_count to drop the reference from 
btrfs_find_del_first_ordered_inode, and an iput to drop the reference added 
by igrab when the inode was put on the list.
>
> - Remove extra i_count decrease after each call
> btrfs_del_ordered_inode(), it decrease the i_count again. I
couldn''t
> find the where the i_count being increased. The tree_lock should
> protecting it races with btrfs_write_ordered_inodes(), no?
The increase comes from igrab on list insertion
>
> * There is possible race with inode delete and
> btrfs_find_first_ordered_inode(). The inode could possibly in the
> process of freeing while we are trying to get hold of it during commit
> transaction. The fix is using igrab() instead, and search for next inode
> in the tree if the found one is in the middle of being released.
These kinds of races where the main reason why I had the list take a reference 
on the inode.  delete_inode won''t be called while i_count is increased.

Over the long term I''d prefer to move the ordered-data list to a model
where
the list doesn''t have a reference and it is magically removed after all
the
dirty pages are gone (by the end_io_hook handlers in inode.c).  The end_io 
hooks in inode.c may be sufficient for this.
>
> * get rid of btrfs_put_inode(), and move the functionality under the
> btrfs_del_ordered_inode() directly.
I like this change, thanks.
>
> * Remove the inode from ordered tree at last iput(). Did not do it at
> file release() time, as it may remove the inode from the ordered tree
> before ensure the ordering of write to the same inode from other
> process.
>
> Perhaps calling btrfs_del_ordered_inode() under unlink() is enough, but
> it would not be hurt to do it again at delete_inode() time.
I''m afraid we''ll have to do it at file_release time, at least
until the
ordered list is changed not to keep a reference.
>
>
> Here is the patch trying to address above issues... compiled, but not
> tested. Is ordered mode on by default?
>
Yes, there''s actually no way to turn it off ;)

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Mingming

2008-May-22 17:43 UTC

head link

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:> On Wednesday 21 May 2008, Mingming wrote:
> > Hi Chris,  I thought I spotted a few bugs, While looking at how to
> > properly remove inode from ordered tree, let me know if I got it
right.
> 
> Hi Mingming, thanks for going through this code.  The i_count rules in the 
> current code should work like this:
> 
Hi Chris, thanks for your detailed clarification.
> * btrfs_add_ordered_inode calls igrab when the inode is inserted into the 
> list.  The whole time the inode is on the list, there''s an extra
reference on
> i_count.  There will be no final iput while the inode is on the list.
> 
Ah I missed that. That explains all my confusion of the i_count accounting.
> >
> > * There is possible race with inode delete and
> > btrfs_find_first_ordered_inode(). The inode could possibly in the
> > process of freeing while we are trying to get hold of it during commit
> > transaction. The fix is using igrab() instead, and search for next
inode
> > in the tree if the found one is in the middle of being released.
> 
> These kinds of races where the main reason why I had the list take a
reference
> on the inode.  delete_inode won''t be called while i_count is
increased.
> 
> Over the long term I''d prefer to move the ordered-data list to a
model where
> the list doesn''t have a reference and it is magically removed
after all the
> dirty pages are gone (by the end_io_hook handlers in inode.c).  The end_io 
> hooks in inode.c may be sufficient for this.
> 
Make sense.> >
> > * get rid of btrfs_put_inode(), and move the functionality under the
> > btrfs_del_ordered_inode() directly.
> 
> I like this change, thanks.
> 
> >
> > * Remove the inode from ordered tree at last iput(). Did not do it at
> > file release() time, as it may remove the inode from the ordered tree
> > before ensure the ordering of write to the same inode from other
> > process.
> >
> > Perhaps calling btrfs_del_ordered_inode() under unlink() is enough,
but
> > it would not be hurt to do it again at delete_inode() time.
> 
> I''m afraid we''ll have to do it at file_release time, at
least until the
> ordered list is changed not to keep a reference.
> Yes with the i_count logic delete_inode() is not the right place to call
btrfs_del_ordered_inode.

But I am still not quite sure whether it is safe to remove the inode
from the ordered tree at the file_release() time. i.e. whether the dirty
data already being flushed to disk at last file_close()/file_release()
time and when two process open and write to the same inode ...

Regards,
Mingming

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-22 17:47 UTC

head link

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

On Thursday 22 May 2008, Mingming wrote:> On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:
> > On Wednesday 21 May 2008, Mingming wrote:
> > > Hi Chris,  I thought I spotted a few bugs, While looking at how
to
> > > properly remove inode from ordered tree, let me know if I got it
right.
> >
> > Hi Mingming, thanks for going through this code.  The i_count rules in
> > the current code should work like this:
>
> Hi Chris, thanks for your detailed clarification.
>
> > * btrfs_add_ordered_inode calls igrab when the inode is inserted into
the
> > list.  The whole time the inode is on the list, there''s an
extra
> > reference on i_count.  There will be no final iput while the inode is
on
> > the list.
>
> Ah I missed that. That explains all my confusion of the i_count accounting.
>
> > > * There is possible race with inode delete and
> > > btrfs_find_first_ordered_inode(). The inode could possibly in the
> > > process of freeing while we are trying to get hold of it during
commit
> > > transaction. The fix is using igrab() instead, and search for
next
> > > inode in the tree if the found one is in the middle of being
released.
> >
> > These kinds of races where the main reason why I had the list take a
> > reference on the inode.  delete_inode won''t be called while
i_count is
> > increased.
> >
> > Over the long term I''d prefer to move the ordered-data list
to a model
> > where the list doesn''t have a reference and it is magically
removed after
> > all the dirty pages are gone (by the end_io_hook handlers in inode.c).
> > The end_io hooks in inode.c may be sufficient for this.
>
> Make sense.
>
> > > * get rid of btrfs_put_inode(), and move the functionality under
the
> > > btrfs_del_ordered_inode() directly.
> >
> > I like this change, thanks.
> >
> > > * Remove the inode from ordered tree at last iput(). Did not do
it at
> > > file release() time, as it may remove the inode from the ordered
tree
> > > before ensure the ordering of write to the same inode from other
> > > process.
> > >
> > > Perhaps calling btrfs_del_ordered_inode() under unlink() is
enough, but
> > > it would not be hurt to do it again at delete_inode() time.
> >
> > I''m afraid we''ll have to do it at file_release time,
at least until the
> > ordered list is changed not to keep a reference.
>
> Yes with the i_count logic delete_inode() is not the right place to call
> btrfs_del_ordered_inode.
>
> But I am still not quite sure whether it is safe to remove the inode
> from the ordered tree at the file_release() time. i.e. whether the dirty
> data already being flushed to disk at last file_close()/file_release()
> time and when two process open and write to the same inode ...
I get around this by testing for dirty/writeback pages before removing the 
inode from the ordered list.  If another writer allocates blocks to the file, 
it will be added back to the list.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Mingming

2008-May-22 20:39 UTC

head link

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

On Thu, 2008-05-22 at 13:47 -0400, Chris Mason wrote:> On Thursday 22 May 2008, Mingming wrote:
> > On Thu, 2008-05-22 at 10:11 -0400, Chris Mason wrote:
> > > On Wednesday 21 May 2008, Mingming wrote:
> > > > Hi Chris,  I thought I spotted a few bugs, While looking at
how to
> > > > properly remove inode from ordered tree, let me know if I
got it right.
> > >
> > > Hi Mingming, thanks for going through this code.  The i_count
rules in
> > > the current code should work like this:
> >
> > Hi Chris, thanks for your detailed clarification.
> >
> > > * btrfs_add_ordered_inode calls igrab when the inode is inserted
into the
> > > list.  The whole time the inode is on the list, there''s
an extra
> > > reference on i_count.  There will be no final iput while the
inode is on
> > > the list.
> >
> > Ah I missed that. That explains all my confusion of the i_count
accounting.
> >
> > > > * There is possible race with inode delete and
> > > > btrfs_find_first_ordered_inode(). The inode could possibly
in the
> > > > process of freeing while we are trying to get hold of it
during commit
> > > > transaction. The fix is using igrab() instead, and search
for next
> > > > inode in the tree if the found one is in the middle of being
released.
> > >
> > > These kinds of races where the main reason why I had the list
take a
> > > reference on the inode.  delete_inode won''t be called
while i_count is
> > > increased.
> > >
> > > Over the long term I''d prefer to move the ordered-data
list to a model
> > > where the list doesn''t have a reference and it is
magically removed after
> > > all the dirty pages are gone (by the end_io_hook handlers in
inode.c).
> > > The end_io hooks in inode.c may be sufficient for this.
> >
> > Make sense.
> >
> > > > * get rid of btrfs_put_inode(), and move the functionality
under the
> > > > btrfs_del_ordered_inode() directly.
> > >
> > > I like this change, thanks.
> > >
> > > > * Remove the inode from ordered tree at last iput(). Did not
do it at
> > > > file release() time, as it may remove the inode from the
ordered tree
> > > > before ensure the ordering of write to the same inode from
other
> > > > process.
> > > >
> > > > Perhaps calling btrfs_del_ordered_inode() under unlink() is
enough, but
> > > > it would not be hurt to do it again at delete_inode() time.
> > >
> > > I''m afraid we''ll have to do it at file_release
time, at least until the
> > > ordered list is changed not to keep a reference.
> >
> > Yes with the i_count logic delete_inode() is not the right place to
call
> > btrfs_del_ordered_inode.
> >
> > But I am still not quite sure whether it is safe to remove the inode
> > from the ordered tree at the file_release() time. i.e. whether the
dirty
> > data already being flushed to disk at last file_close()/file_release()
> > time and when two process open and write to the same inode ...
> 
> I get around this by testing for dirty/writeback pages before removing the 
> inode from the ordered list.  If another writer allocates blocks to the
file,
> it will be added back to the list.
> 
I see.:) How about patch below?

Mingming

diff -r c3290d51e5f9 file.c
--- a/file.c	Fri May 16 13:30:15 2008 -0400
+++ b/file.c	Thu May 22 13:29:42 2008 -0700
@@ -978,6 +978,12 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
+static int btrfs_release_file (struct inode * inode, struct file * filp)
+{
+	btrfs_del_ordered_inode(inode);
+	return 0;
+}
+
 static int btrfs_sync_file(struct file *file,
 			   struct dentry *dentry, int datasync)
 {
@@ -1044,6 +1050,7 @@ struct file_operations btrfs_file_operat
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
+	.release	= btrfs_release_file,
 	.fsync		= btrfs_sync_file,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
diff -r c3290d51e5f9 inode.c
--- a/inode.c	Fri May 16 13:30:15 2008 -0400
+++ b/inode.c	Thu May 22 13:31:14 2008 -0700
@@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *di
 	nr = trans->blocks_used;
 
 	if (inode->i_nlink == 0) {
-		int found;
 		/* if the inode isn''t linked anywhere,
 		 * we don''t need to worry about
 		 * data=ordered
 		 */
-		found = btrfs_del_ordered_inode(inode);
-		if (found == 1) {
-			atomic_dec(&inode->i_count);
-		}
+		btrfs_del_ordered_inode(inode);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1271,24 +1267,6 @@ fail:
 	return err;
 }
 
-void btrfs_put_inode(struct inode *inode)
-{
-	int ret;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
-		return;
-
-	ret = btrfs_del_ordered_inode(inode);
-	if (ret == 1) {
-		atomic_dec(&inode->i_count);
-	}
-}
-
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
diff -r c3290d51e5f9 ordered-data.c
--- a/ordered-data.c	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.c	Thu May 22 13:24:56 2008 -0700
@@ -231,7 +231,7 @@ int btrfs_find_del_first_ordered_inode(s
 	return 1;
 }
 
-static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				     struct inode *inode,
 				     u64 root_objectid, u64 objectid)
 {
@@ -243,31 +243,38 @@ static int __btrfs_del_ordered_inode(str
 	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
 	if (!node) {
 		write_unlock(&tree->lock);
-		return 0;
+		return;
 	}
 	rb_erase(node, &tree->tree);
 	BTRFS_I(inode)->ordered_trans = 0;
 	write_unlock(&tree->lock);
+	atomic_dec(&inode->i_count);
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	kfree(entry);
-	return 1;
-}
-
-int btrfs_del_ordered_inode(struct inode *inode)
+	return;
+}
+
+void btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
-	int ret = 0;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
 		struct btrfs_ordered_inode_tree *tree;
 		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		ret = __btrfs_del_ordered_inode(tree, inode, root_objectid,
+		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
 						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return ret;
 }
 
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
diff -r c3290d51e5f9 ordered-data.h
--- a/ordered-data.h	Fri May 16 13:30:15 2008 -0400
+++ b/ordered-data.h	Thu May 22 13:25:09 2008 -0700
@@ -38,6 +38,6 @@ int btrfs_find_first_ordered_inode(struc
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff -r c3290d51e5f9 super.c
--- a/super.c	Fri May 16 13:30:15 2008 -0400
+++ b/super.c	Thu May 22 13:30:44 2008 -0700
@@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Chris Mason

2008-May-22 22:23 UTC

head link

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

On Thursday 22 May 2008, Mingming wrote:

[ ... ]
> > I get around this by testing for dirty/writeback pages before removing
> > the inode from the ordered list.  If another writer allocates blocks
to
> > the file, it will be added back to the list.
>
> I see.:) How about patch below?
Looks good, I''ll give it a shot here.

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs"
in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Btrfs devel - Apr 2008 - cloning file data

[Btrfs-devel] cloning file data

[Btrfs-devel] cloning file data

[Btrfs-devel] cloning file data

Re: [Btrfs-devel] cloning file data

Re: [Btrfs-devel] cloning file data

Re: btrfs_put_inode

Re: btrfs_put_inode

Re: btrfs_put_inode

Re: btrfs_put_inode

Re: btrfs_put_inode

Re: btrfs_put_inode

[RFC][PATCH]btrfs delete ordered inode handling fix

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

Re: [RFC][PATCH]btrfs delete ordered inode handling fix

Re: [RFC][PATCH]btrfs delete ordered inode handling fix