Hi, Here''s an implementation of NFS support for btrfs. It does not work in one particular case as described in http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg00298.html. This uses the btrfs_iget helper introduced previously. Comments ? --- Signed-off-by: Balaji Rao <balajirrao@gmail.com> diff -r 3f0eee804974 Makefile --- a/Makefile Thu Jun 26 10:34:20 2008 -0400 +++ b/Makefile Mon Jul 21 01:14:45 2008 +0530 @@ -6,7 +6,8 @@ hash.o file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ - extent_io.o volumes.o async-thread.o ioctl.o locking.o + extent_io.o volumes.o async-thread.o ioctl.o locking.o \ + export.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff -r 3f0eee804974 export.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export.c Mon Jul 21 01:14:45 2008 +0530 @@ -0,0 +1,183 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" + +/* The size of encoded fh is the type number of the fh itself */ +#define BTRFS_FID_NON_CONNECTABLE 5 +#define BTRFS_FID_CONNECTABLE 8 + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + + if ((len < BTRFS_FID_NON_CONNECTABLE ) || + (connectable && len < BTRFS_FID_CONNECTABLE)) { + return 255; + } + + len = BTRFS_FID_NON_CONNECTABLE; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + + spin_unlock(&dentry->d_lock); + len = BTRFS_FID_CONNECTABLE; + } + *max_len = len; + + /* We return length itself for the type */ + return len; + +} + +static struct dentry * btrfs_get_dentry(struct super_block *sb, + u64 objectid, u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct dentry *result; + struct btrfs_key key; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + + root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if(generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + result = d_alloc_anon(inode); + if(!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return result; +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, + struct fid *fh, int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type != BTRFS_FID_CONNECTABLE) { + return NULL; + } + + + root_objectid = fid->root_objectid; + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, + struct fid *fh, int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type > BTRFS_FID_CONNECTABLE) { + return NULL; + } + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct inode *inode; + struct dentry *parent; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 nritems; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + BUG_ON(ret == 0); + ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems) { + goto out; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != dir->i_ino || + key.type != BTRFS_INODE_REF_KEY) { + goto out; + } + + btrfs_free_path(path); + objectid = key.offset; + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + + return parent; + +out: + btrfs_free_path(path); + return ERR_PTR(-EINVAL); + +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff -r 3f0eee804974 export.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export.h Mon Jul 21 01:14:45 2008 +0530 @@ -0,0 +1,17 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; +} __attribute__ ((packed)); + +#endif diff -r 3f0eee804974 super.c --- a/super.c Thu Jun 26 10:34:20 2008 -0400 +++ b/super.c Mon Jul 21 01:14:45 2008 +0530 @@ -45,6 +45,7 @@ #include "print-tree.h" #include "xattr.h" #include "volumes.h" +#include "export.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -298,6 +299,7 @@ sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-07-21 at 02:01 +0530, Balaji Rao wrote:> Here''s an implementation of NFS support for btrfs. It does not work in > one particular case as described in > http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg00298.html.I can''t get it to work properly. The simple case of exporting and using a file system works -- but that doesn''t exercise the get_parent() or dentry_to_fh() code paths. To test those, you need to clear the server''s dcache completely -- I prefer just rebooting the server. Note that the first problem you''ll hit is the lack of stable fsid -- because btrfs uses an anonymous superblock, it might cause stale file handles after a reboot, unless your test setup mounts exactly the same anonymous file systems each time. That bit me when I context switched from something else and had debugfs mounted before btrfs, then rebooted and didn''t mount debugfs first. I''ll deal with the fsid problem separately; just be aware of it and avoid it for now. The second problem is that btrfs_fh_to_dentry() fails -- it seems we need to set the key type BTRFS_INODE_ITEM_KEY in btrfs_get_dentry(), rather than BTRFS_INODE_REF_KEY. I still haven''t learned enough to know precisely what the implications of that are -- but unless I make that change, btrfs_lookup_inode() fails and btrfs_read_locked_inode() marks the resulting inode as bad. You can test that by mounting the exported file system, then rebooting the server, then running ''ls'' in the NFS-mounted file system. My next test was to make a deep directory path: mkdir -p a/b/c/d/e/f/g and then change directory into it. Again reboot the server and run ''ls''. This time, I see ''ls: cannot open directory .''. In this csse, it seems to be because btrfs_get_parent() is failing. In my case, it seems to be because the ''if (slot >= nritems)'' check is triggering -- both are set to 14. I''m now trying to work out precisely what that means... -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sunday 17 August 2008 05:23:22 pm David Woodhouse wrote:> On Mon, 2008-07-21 at 02:01 +0530, Balaji Rao wrote: > > Here''s an implementation of NFS support for btrfs. It does not work in > > one particular case as described in > > http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg00298.html. > > I can''t get it to work properly. The simple case of exporting and using > a file system works -- but that doesn''t exercise the get_parent() or > dentry_to_fh() code paths. To test those, you need to clear the server''s > dcache completely -- I prefer just rebooting the server. >OK. I was not very sure if NFS worked across server reboots.> Note that the first problem you''ll hit is the lack of stable fsid -- > because btrfs uses an anonymous superblock, it might cause stale file > handles after a reboot, unless your test setup mounts exactly the same > anonymous file systems each time. That bit me when I context switched > from something else and had debugfs mounted before btrfs, then rebooted > and didn''t mount debugfs first. I''ll deal with the fsid problem > separately; just be aware of it and avoid it for now. >Hmmm.. how do we deal with that ?> The second problem is that btrfs_fh_to_dentry() fails -- it seems we > need to set the key type BTRFS_INODE_ITEM_KEY in btrfs_get_dentry(), > rather than BTRFS_INODE_REF_KEY. I still haven''t learned enough to know > precisely what the implications of that are -- but unless I make that > change, btrfs_lookup_inode() fails and btrfs_read_locked_inode() marks > the resulting inode as bad. >Right. Again, my bad. That''s a silly mistake, which didn''t surface because of bad testing!> You can test that by mounting the exported file system, then rebooting > the server, then running ''ls'' in the NFS-mounted file system. > > My next test was to make a deep directory path: mkdir -p a/b/c/d/e/f/g > and then change directory into it. Again reboot the server and run ''ls''. > This time, I see ''ls: cannot open directory .''. In this csse, it seems > to be because btrfs_get_parent() is failing. In my case, it seems to be > because the ''if (slot >= nritems)'' check is triggering -- both are set > to 14. I''m now trying to work out precisely what that means...I think this means that the inode ref is not found, which is weird. I remember that it worked with it a really deep path. I''ll try to reproduce the problem and see what''s going on.. -- Thanks, Balaji -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 18:21 +0530, Balaji Rao wrote:> On Sunday 17 August 2008 05:23:22 pm David Woodhouse wrote: > > On Mon, 2008-07-21 at 02:01 +0530, Balaji Rao wrote: > > > Here''s an implementation of NFS support for btrfs. It does not work in > > > one particular case as described in > > > http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg00298.html. > > > > I can''t get it to work properly. The simple case of exporting and using > > a file system works -- but that doesn''t exercise the get_parent() or > > dentry_to_fh() code paths. To test those, you need to clear the server''s > > dcache completely -- I prefer just rebooting the server. > > > > OK. I was not very sure if NFS worked across server reboots.It definitely should.> > Note that the first problem you''ll hit is the lack of stable fsid -- > > because btrfs uses an anonymous superblock, it might cause stale file > > handles after a reboot, unless your test setup mounts exactly the same > > anonymous file systems each time. That bit me when I context switched > > from something else and had debugfs mounted before btrfs, then rebooted > > and didn''t mount debugfs first. I''ll deal with the fsid problem > > separately; just be aware of it and avoid it for now. > > > Hmmm.. how do we deal with that ?I''m not entirely sure yet. I had a patch to make the kernel automatically set a uuid on the export, which was simple enough. Unfortunately that isn''t sufficient, because although we then return an appropriate root fh to the mount request, we then aren''t capable of _interpreting_ that fh when the client immediately gives it back to us in a subsequent fsinfo request. Because we''re relying on mountd to do the initial fh->dentry conversion for us, and mountd is very limited in its uuid handling -- it assumes that - there is a block device - there is a 1:1 mapping between block device and uuid - libuuid can cope with new file systems.> > The second problem is that btrfs_fh_to_dentry() fails -- it seems we > > need to set the key type BTRFS_INODE_ITEM_KEY in btrfs_get_dentry(), > > rather than BTRFS_INODE_REF_KEY. I still haven''t learned enough to know > > precisely what the implications of that are -- but unless I make that > > change, btrfs_lookup_inode() fails and btrfs_read_locked_inode() marks > > the resulting inode as bad. > > > Right. Again, my bad. That''s a silly mistake, which didn''t surface because of > bad testing! > > > You can test that by mounting the exported file system, then rebooting > > the server, then running ''ls'' in the NFS-mounted file system. > > > > My next test was to make a deep directory path: mkdir -p a/b/c/d/e/f/g > > and then change directory into it. Again reboot the server and run ''ls''. > > This time, I see ''ls: cannot open directory .''. In this csse, it seems > > to be because btrfs_get_parent() is failing. In my case, it seems to be > > because the ''if (slot >= nritems)'' check is triggering -- both are set > > to 14. I''m now trying to work out precisely what that means... > > I think this means that the inode ref is not found, which is weird. I remember > that it worked with it a really deep path. I''ll try to reproduce the problem > and see what''s going on..See below... it seems to be working now. diff --git a/export.c b/export.c index 1b8875c..6209f35 100644 --- a/export.c +++ b/export.c @@ -72,7 +72,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, struct btrfs_key key; key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); @@ -164,14 +164,22 @@ static struct dentry *btrfs_get_parent(struct dentry *child) leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); - if (slot >= nritems) - goto out; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) { + btrfs_free_path(path); + goto out; + } + leaf = path->nodes[0]; + slot = path->slots[0]; + } + + btrfs_free_path(path); btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) goto out; - btrfs_free_path(path); objectid = key.offset; /* Build a new key for the inode item */ -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sunday 17 August 2008 06:26:03 pm David Woodhouse wrote:> On Sun, 2008-08-17 at 18:21 +0530, Balaji Rao wrote: > > On Sunday 17 August 2008 05:23:22 pm David Woodhouse wrote: > > > On Mon, 2008-07-21 at 02:01 +0530, Balaji Rao wrote: > > > > Here''s an implementation of NFS support for btrfs. It does not work > > > > in one particular case as described in > > > > http://www.mail-archive.com/linux-btrfs@vger.kernel.org/msg00298.html > > > >. > > > > > > I can''t get it to work properly. The simple case of exporting and using > > > a file system works -- but that doesn''t exercise the get_parent() or > > > dentry_to_fh() code paths. To test those, you need to clear the > > > server''s dcache completely -- I prefer just rebooting the server. > > > > OK. I was not very sure if NFS worked across server reboots. > > It definitely should. > > > > Note that the first problem you''ll hit is the lack of stable fsid -- > > > because btrfs uses an anonymous superblock, it might cause stale file > > > handles after a reboot, unless your test setup mounts exactly the same > > > anonymous file systems each time. That bit me when I context switched > > > from something else and had debugfs mounted before btrfs, then rebooted > > > and didn''t mount debugfs first. I''ll deal with the fsid problem > > > separately; just be aware of it and avoid it for now. > > > > Hmmm.. how do we deal with that ? > > I''m not entirely sure yet. I had a patch to make the kernel > automatically set a uuid on the export, which was simple enough. > > Unfortunately that isn''t sufficient, because although we then return an > appropriate root fh to the mount request, we then aren''t capable of > _interpreting_ that fh when the client immediately gives it back to us > in a subsequent fsinfo request. Because we''re relying on mountd to do > the initial fh->dentry conversion for us, and mountd is very limited in > its uuid handling -- it assumes that > - there is a block device > - there is a 1:1 mapping between block device and uuid > - libuuid can cope with new file systems. > > > > The second problem is that btrfs_fh_to_dentry() fails -- it seems we > > > need to set the key type BTRFS_INODE_ITEM_KEY in btrfs_get_dentry(), > > > rather than BTRFS_INODE_REF_KEY. I still haven''t learned enough to know > > > precisely what the implications of that are -- but unless I make that > > > change, btrfs_lookup_inode() fails and btrfs_read_locked_inode() marks > > > the resulting inode as bad. > > > > Right. Again, my bad. That''s a silly mistake, which didn''t surface > > because of bad testing! > > > > > You can test that by mounting the exported file system, then rebooting > > > the server, then running ''ls'' in the NFS-mounted file system. > > > > > > My next test was to make a deep directory path: mkdir -p a/b/c/d/e/f/g > > > and then change directory into it. Again reboot the server and run > > > ''ls''. This time, I see ''ls: cannot open directory .''. In this csse, it > > > seems to be because btrfs_get_parent() is failing. In my case, it seems > > > to be because the ''if (slot >= nritems)'' check is triggering -- both > > > are set to 14. I''m now trying to work out precisely what that means... > > > > I think this means that the inode ref is not found, which is weird. I > > remember that it worked with it a really deep path. I''ll try to reproduce > > the problem and see what''s going on.. > > See below... it seems to be working now. > > diff --git a/export.c b/export.c > index 1b8875c..6209f35 100644 > --- a/export.c > +++ b/export.c > @@ -72,7 +72,7 @@ static struct dentry *btrfs_get_dentry(struct super_block > *sb, u64 objectid, struct btrfs_key key; > > key.objectid = objectid; > - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); > + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); > key.offset = 0; > > root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); > @@ -164,14 +164,22 @@ static struct dentry *btrfs_get_parent(struct dentry > *child) leaf = path->nodes[0]; > slot = path->slots[0]; > nritems = btrfs_header_nritems(leaf); > - if (slot >= nritems) > - goto out; > + if (slot >= nritems) { > + ret = btrfs_next_leaf(root, path); > + if (ret) { > + btrfs_free_path(path); > + goto out; > + } > + leaf = path->nodes[0]; > + slot = path->slots[0]; > + } > + > + btrfs_free_path(path); > > btrfs_item_key_to_cpu(leaf, &key, slot); > if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) > goto out; > > - btrfs_free_path(path); > objectid = key.offset; > > /* Build a new key for the inode item */OK. I had copied over this code snippet from inode.c:btrfs_inode_by_name, which has the condition ''if (slot >= nritems)'' removed now by this. changeset: 631:87490dc3bb59 user: "Yan Zheng" <yanzheng@21cn.com> date: Thu Jul 24 12:19:32 2008 -0400 summary: Fix .. lookup corner case I think we should refactor btrfs_inode_by_name into something we can use ? -- Thanks, Balaji -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 18:54 +0530, Balaji Rao wrote:> > > OK. I had copied over this code snippet from > inode.c:btrfs_inode_by_name, > which has the condition ''if (slot >= nritems)'' removed now by this. > > changeset: 631:87490dc3bb59 > user: "Yan Zheng" <yanzheng@21cn.com> > date: Thu Jul 24 12:19:32 2008 -0400 > summary: Fix .. lookup corner case > > I think we should refactor btrfs_inode_by_name into something we can > use ?I''m not entirely sure why btrfs_inode_by_name() handles "." and ".." at all. We haven''t needed that since the 2.2 kernel, have we? I think we should just remove that code completely -- and remove the similar code from btrfs_real_readdir() while we''re at it -- just use parent_ino(filp->f_path.dentry) instead. In _that_ case, we know the dcache is connected. -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 18:54 +0530, Balaji Rao wrote:> > OK. I had copied over this code snippet from > inode.c:btrfs_inode_by_name, > which has the condition ''if (slot >= nritems)'' removed now by this. > > changeset: 631:87490dc3bb59 > user: "Yan Zheng" <yanzheng@21cn.com> > date: Thu Jul 24 12:19:32 2008 -0400 > summary: Fix .. lookup corner caseEr, isn''t that just moving the error case around? That''s commit 3dcd1334c286fa4467219302ff2f9a4a190fbb9c in the git tree: http://git.kernel.org/?p=linux/kernel/git/dwmw2/btrfs-kernel-unstable.git;a=commitdiff;h=3dcd1334 Your version fails if the item we want is in slot 0, because we don''t jump forward to the next leaf. The new version fails if it''s in the _last_ slot -- the return from btrfs_search_slot() now points to the first slot in the next leaf, and we treat that as an error instead of rewinding to the one we want. It''s just the same error, but in reverse. Or am I missing something? -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 14:30 +0100, David Woodhouse wrote:> I think we should just remove that code completely -- and remove the > similar code from btrfs_real_readdir() while we''re at it -- just use > parent_ino(filp->f_path.dentry) instead. In _that_ case, we know the > dcache is connected.>From 6a3710b961c61fef038cca1722176b171697b3da Mon Sep 17 00:00:00 2001From: David Woodhouse <David.Woodhouse@intel.com> Date: Sun, 17 Aug 2008 15:14:48 +0100 Subject: [PATCH] Remove special cases for "." and ".." We never get asked by the VFS to lookup either of them, and we can handle the readdir() case a lot more simply, too. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- inode.c | 52 ++-------------------------------------------------- 1 files changed, 2 insertions(+), 50 deletions(-) diff --git a/inode.c b/inode.c index fe6d4ae..411dfc4 100644 --- a/inode.c +++ b/inode.c @@ -1742,42 +1742,9 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_root *root = BTRFS_I(dir)->root; int ret = 0; - if (namelen == 1 && strcmp(name, ".") == 0) { - location->objectid = dir->i_ino; - location->type = BTRFS_INODE_ITEM_KEY; - location->offset = 0; - return 0; - } path = btrfs_alloc_path(); BUG_ON(!path); - if (namelen == 2 && strcmp(name, "..") == 0) { - struct btrfs_key key; - struct extent_buffer *leaf; - int slot; - - key.objectid = dir->i_ino; - key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - if (ret < 0 || path->slots[0] == 0) - goto out_err; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - BUG_ON(ret == 0); - ret = 0; - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != dir->i_ino || - key.type != BTRFS_INODE_REF_KEY) { - goto out_err; - } - location->objectid = key.offset; - location->type = BTRFS_INODE_ITEM_KEY; - location->offset = 0; - goto out; - } - di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, namelen, 0); if (IS_ERR(di)) @@ -2000,29 +1967,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, /* special case for .., just use the back ref */ if (filp->f_pos == 1) { - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0 || path->slots[0] == 0) { - btrfs_release_path(root, path); - goto read_dir_items; - } - BUG_ON(ret == 0); - leaf = path->nodes[0]; - slot = path->slots[0] - 1; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - btrfs_release_path(root, path); - if (found_key.objectid != key.objectid || - found_key.type != BTRFS_INODE_REF_KEY) - goto read_dir_items; + u64 pino = parent_ino(filp->f_path.dentry); over = filldir(dirent, "..", 2, - 2, found_key.offset, DT_DIR); + 2, pino, DT_DIR); if (over) goto nopos; filp->f_pos = 2; } -read_dir_items: btrfs_set_key_type(&key, key_type); key.offset = filp->f_pos; -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 15:17 +0100, David Woodhouse wrote:> Subject: [PATCH] Remove special cases for "." and ".." > > We never get asked by the VFS to lookup either of them, and we can > handle the readdir() case a lot more simply, too.And a few other minor cleanups... some cosmetic, but note the WARN_ON(). From: David Woodhouse <David.Woodhouse@intel.com> Date: Sun, 17 Aug 2008 17:08:36 +0100 Subject: [PATCH] Minor cleanup of btrfs_real_readdir() Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- inode.c | 42 +++++++++++++++++++++++++++--------------- 1 files changed, 27 insertions(+), 15 deletions(-) diff --git a/inode.c b/inode.c index 411dfc4..187e1f8 100644 --- a/inode.c +++ b/inode.c @@ -1960,34 +1960,34 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, return 0; filp->f_pos = 1; } - - key.objectid = inode->i_ino; - path = btrfs_alloc_path(); - path->reada = 2; - /* special case for .., just use the back ref */ if (filp->f_pos == 1) { u64 pino = parent_ino(filp->f_path.dentry); over = filldir(dirent, "..", 2, 2, pino, DT_DIR); if (over) - goto nopos; + return 0; filp->f_pos = 2; } + path = btrfs_alloc_path(); + path->reada = 2; + btrfs_set_key_type(&key, key_type); key.offset = filp->f_pos; + key.objectid = inode->i_ino; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; advance = 0; - while(1) { + + while (1) { leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; if (advance || slot >= nritems) { - if (slot >= nritems -1) { + if (slot >= nritems - 1) { ret = btrfs_next_leaf(root, path); if (ret) break; @@ -2011,19 +2011,23 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, continue; filp->f_pos = found_key.offset; - advance = 1; + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); di_cur = 0; di_total = btrfs_item_size(leaf, item); - while(di_cur < di_total) { + + while (di_cur < di_total) { struct btrfs_key location; name_len = btrfs_dir_name_len(leaf, di); - if (name_len < 32) { + if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; } else { name_ptr = kmalloc(name_len, GFP_NOFS); - BUG_ON(!name_ptr); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } } read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); @@ -2031,8 +2035,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); over = filldir(dirent, name_ptr, name_len, - found_key.offset, - location.objectid, + found_key.offset, location.objectid, d_type); if (name_ptr != tmp_name) @@ -2040,12 +2043,21 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (over) goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + - btrfs_dir_data_len(leaf, di) +sizeof(*di); + btrfs_dir_data_len(leaf, di) + sizeof(*di); di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); + + /* If there''s more than one directory entry in each + item, then the offset isn''t unique. Seeking in + directories will be broken. Can this ever actually + happen? */ + WARN_ON(di_cur != di_total); } } + + /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); else -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 18:21 +0530, Balaji Rao wrote:> > Note that the first problem you''ll hit is the lack of stable fsid -- > > because btrfs uses an anonymous superblock, it might cause stale file > > handles after a reboot, unless your test setup mounts exactly the same > > anonymous file systems each time. That bit me when I context switched > > from something else and had debugfs mounted before btrfs, then rebooted > > and didn''t mount debugfs first. I''ll deal with the fsid problem > > separately; just be aware of it and avoid it for now. > > > Hmmm.. how do we deal with that ?First we teach nfs-utils to derive a UUID from the ''f_fsid'' field it''s already being given by the kernel when it calls statfs(): git.infradead.org/users/dwmw2/nfs-utils.git Then we make btrfs put something suitable into the f_fsid field in btrfs_statfs(). The patch below works OK, but doesn''t yet handle subvolumes -- it gives the same fsid for all subvolumes. But then, since we''re also returning the same {dev,ino#} for all subvolumes, nfs exporting isn''t the _only_ thing that''s going to get confused... From: David Woodhouse <David.Woodhouse@intel.com> Date: Mon, 18 Aug 2008 12:01:52 +0100 Subject: [PATCH] Fill f_fsid field in btrfs_statfs() Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- super.c | 6 ++++++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/super.c b/super.c index e830e0e..6446ab7 100644 --- a/super.c +++ b/super.c @@ -489,6 +489,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_root *root = btrfs_sb(dentry->d_sb); struct btrfs_super_block *disk_super = &root->fs_info->super_copy; int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; @@ -497,6 +498,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = buf->f_bfree; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + /* We treat it as constant endianness (it doesn''t matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); return 0; } -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 12:51 +0100, David Woodhouse wrote:> The patch below works OK, but doesn''t yet handle > subvolumes -- it gives the same fsid for all subvolumes.Is this the correct fix? diff --git a/super.c b/super.c index 6446ab7..55f4d00 100644 --- a/super.c +++ b/super.c @@ -503,6 +503,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) on a big-endian or little-endian host */ buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + return 0; } -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 17:10 +0100, David Woodhouse wrote:> On Sun, 2008-08-17 at 15:17 +0100, David Woodhouse wrote: > > Subject: [PATCH] Remove special cases for "." and ".." > > > > We never get asked by the VFS to lookup either of them, and we can > > handle the readdir() case a lot more simply, too. > > And a few other minor cleanups... some cosmetic, but note the > WARN_ON().> + > + /* If there''s more than one directory entry in each > + item, then the offset isn''t unique. Seeking in > + directories will be broken. Can this ever actually > + happen? */ > + WARN_ON(di_cur != di_total); > }It can happen today for the tree of tree roots (directory of subvolumes and named snapshots). It happens when the name hashes collide. For real directories we use the sequence number index instead of the name hash index, so it normally won''t happen. Longer term the directory of subvolumes is going to be a real directory and this special case will go away. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 14:46 -0400, Chris Mason wrote:> It can happen today for the tree of tree roots (directory of > subvolumes and named snapshots). It happens when the name hashes > collide. For real directories we use the sequence number index > instead of the name hash index, so it normally won''t happen.Ok, that''s fine then. I''ve removed the WARN_ON() from the patch in my git tree. This is what I have outstanding for you at git://git.infradead.org/users/dwmw2/btrfs-kernel-unstable: Balaji Rao (2): Introduce btrfs_iget helper NFS support for btrfs - v3 David Woodhouse (7): Implement our own copy of the nfsd readdir hack, for older kernels Discard sector data in __free_extent() Remove special cases for "." and ".." Minor cleanup of btrfs_real_readdir() Optimise NFS readdir hack slightly; don''t call readdir() again when done Fill f_fsid field in btrfs_statfs() Mask root object ID into f_fsid in btrfs_statfs() Makefile | 2 +- compat.h | 17 ++++ ctree.h | 2 + export.c | 208 +++++++++++++++++++++++++++++++++++++++++++++++ export.h | 19 +++++ extent-tree.c | 25 ++++++ inode.c | 250 +++++++++++++++++++++++++++++++++++++------------------- super.c | 12 +++ 8 files changed, 449 insertions(+), 86 deletions(-) -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 13:10 +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 12:51 +0100, David Woodhouse wrote: > > The patch below works OK, but doesn''t yet handle > > subvolumes -- it gives the same fsid for all subvolumes. > > Is this the correct fix? >This looks sane: -chris> diff --git a/super.c b/super.c > index 6446ab7..55f4d00 100644 > --- a/super.c > +++ b/super.c > @@ -503,6 +503,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) > on a big-endian or little-endian host */ > buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); > buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); > + /* Mask in the root object ID too, to disambiguate subvols */ > + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; > + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; > + > return 0; > } > >-- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, 2008-08-17 at 14:40 +0100, David Woodhouse wrote:> On Sun, 2008-08-17 at 18:54 +0530, Balaji Rao wrote: > > > > OK. I had copied over this code snippet from > > inode.c:btrfs_inode_by_name, > > which has the condition ''if (slot >= nritems)'' removed now by this. > > > > changeset: 631:87490dc3bb59 > > user: "Yan Zheng" <yanzheng@21cn.com> > > date: Thu Jul 24 12:19:32 2008 -0400 > > summary: Fix .. lookup corner case > > Er, isn''t that just moving the error case around? > > That''s commit 3dcd1334c286fa4467219302ff2f9a4a190fbb9c in the git tree: > http://git.kernel.org/?p=linux/kernel/git/dwmw2/btrfs-kernel-unstable.git;a=commitdiff;h=3dcd1334 > > Your version fails if the item we want is in slot 0, because we don''t > jump forward to the next leaf. > > The new version fails if it''s in the _last_ slot -- the return from > btrfs_search_slot() now points to the first slot in the next leaf, and > we treat that as an error instead of rewinding to the one we want. It''s > just the same error, but in reverse. > > Or am I missing something? >The search key in Yan''s patch changes to (u64)-1. We know that if we''re in slot 0, there''s nothing after us in the tree. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 20:08 +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 14:46 -0400, Chris Mason wrote: > > It can happen today for the tree of tree roots (directory of > > subvolumes and named snapshots). It happens when the name hashes > > collide. For real directories we use the sequence number index > > instead of the name hash index, so it normally won''t happen. > > Ok, that''s fine then. I''ve removed the WARN_ON() from the patch in my > git tree. This is what I have outstanding for you at > git://git.infradead.org/users/dwmw2/btrfs-kernel-unstable: >Ok, just to double check after rereading the thread, are there known bugs in this stuff? They look sane to me. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 15:24 -0400, Chris Mason wrote:> Ok, just to double check after rereading the thread, are there known > bugs in this stuff? They look sane to me.No, no known bugs. This is now working nicely for me -- I can export two separate subvolumes by NFS, change into a deep subdirectory and reboot the server, and it all works fine again when the server comes back. -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 15:23 -0400, Chris Mason wrote:> The search key in Yan''s patch changes to (u64)-1. We know that if > we''re in slot 0, there''s nothing after us in the tree.But because we searched for a reference to (u64)-1, the item we want is actually _earlier_ in the tree, not later. -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 20:33 +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 15:23 -0400, Chris Mason wrote: > > The search key in Yan''s patch changes to (u64)-1. We know that if > > we''re in slot 0, there''s nothing after us in the tree. > > But because we searched for a reference to (u64)-1, the item we want is > actually _earlier_ in the tree, not later. >Lets pretend I had put in commments something like the code below. The important part is that directories have only one link, so they have only one backref. /* * we''re a directory and we have at most 1 back reference * to our one and only parent directory */ if (namelen == 2 && strcmp(name, "..") == 0) { struct btrfs_key key; struct extent_buffer *leaf; int slot; key.objectid = dir->i_ino; /* after the search, we''ll be * one slot after the last back reference for this * inode. */ key.offset = (u64)-1; btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); /* normally, I would check the return values after the * search. But this time I merged the patch wrong and * made a bug. If path->slots[0] == 0 after the * search, there won''t be any keys smaller to ours * in the tree. */ if (ret < 0 || path->slots[0] == 0) goto out_err; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); BUG_ON(ret == 0); ret = 0; leaf = path->nodes[0]; /* go back one slot and find our only backref */ slot = path->slots[0] - 1; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) { goto out_err; } location->objectid = key.offset; location->type = BTRFS_INODE_ITEM_KEY; location->offset = 0; goto out; -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 15:47 -0400, Chris Mason wrote:> Lets pretend I had put in commments something like the code below. > The important part is that directories have only one link, so they > have only one backref.OK. Now can I rip that code out anyway? The VFS will never call btrfs_lookup() for ".." -- not since the 2.2 kernel :) I''m still a little confused about precisely what btrfs_search_slot() returns when it doesn''t find a match -- that''s probably where the documentation would be more useful. What I have in btrfs_get_parent() is correct, I think -- but could probably be simplified to your version, searching for (u64)-1 and then not needing to call btrfs_next_leaf(). Something like this? (And am I fixing a use-after-free bug by moving that btrfs_free_path() down by a line, at the end?) diff --git a/export.c b/export.c index 797b4cb..830e87a 100644 --- a/export.c +++ b/export.c @@ -156,27 +156,27 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.objectid = dir->i_ino; btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - key.offset = 0; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - BUG_ON(ret == 0); - ret = 0; leaf = path->nodes[0]; slot = path->slots[0]; - nritems = btrfs_header_nritems(leaf); - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) { - btrfs_free_path(path); - goto out; - } - leaf = path->nodes[0]; - slot = path->slots[0]; + if (ret < 0 && slot == 0) { + btrfs_free_path(path); + goto out; } + /* + This will be in the slot _before_ the one that btrfs_search_slot() + returns. And for some reason which dwmw2 doesn''t quite understand, + that''ll definitely be in the same leaf that btrfs_search_slot() + returned -- even if btrfs_search_slot() had to look in the _next_ + leaf to find the first key which is greater than what we asked for + */ + slot--; + btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_free_path(path); - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) goto out; -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 21:20 +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 15:47 -0400, Chris Mason wrote: > > Lets pretend I had put in commments something like the code below. > > The important part is that directories have only one link, so they > > have only one backref. > > OK. Now can I rip that code out anyway? The VFS will never call > btrfs_lookup() for ".." -- not since the 2.2 kernel :) > > I''m still a little confused about precisely what btrfs_search_slot() > returns when it doesn''t find a match -- that''s probably where the > documentation would be more useful. >if btrfs_search_slot returns < 0, there was an error if btrfs_search_slot returns > 1, path->slots[0] is the spot in the tree where you''d want to insert the item. In this case, if path->slots[0] == 0, there are no keys in the tree smaller than your search key. If path->slots[0] == btrfs_header_nritems(path->nodes[0]), there are no items in this node that have a key > than your search key.> What I have in btrfs_get_parent() is correct, I think -- but could > probably be simplified to your version, searching for (u64)-1 and then > not needing to call btrfs_next_leaf().Yes.> > Something like this? > > (And am I fixing a use-after-free bug by moving that btrfs_free_path() > down by a line, at the end?) >Most definitely ;) One comment inline below> diff --git a/export.c b/export.c > index 797b4cb..830e87a 100644 > --- a/export.c > +++ b/export.c > @@ -156,27 +156,27 @@ static struct dentry *btrfs_get_parent(struct dentry *child) > > key.objectid = dir->i_ino; > btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); > - key.offset = 0; > + key.offset = (u64)-1; > ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > - BUG_ON(ret == 0); > - ret = 0; > > leaf = path->nodes[0]; > slot = path->slots[0]; > - nritems = btrfs_header_nritems(leaf); > - if (slot >= nritems) { > - ret = btrfs_next_leaf(root, path); > - if (ret) { > - btrfs_free_path(path); > - goto out; > - } > - leaf = path->nodes[0]; > - slot = path->slots[0]; > + if (ret < 0 && slot == 0) {^^^^^^^^^^^^^ should be ||, and should set ret to something bad if slot == 0> + btrfs_free_path(path); > + goto out; > } > + /* > + This will be in the slot _before_ the one that btrfs_search_slot() > + returns. And for some reason which dwmw2 doesn''t quite understand, > + that''ll definitely be in the same leaf that btrfs_search_slot() > + returned -- even if btrfs_search_slot() had to look in the _next_ > + leaf to find the first key which is greater than what we asked for > + */ > + slot--; > > + btrfs_item_key_to_cpu(leaf, &key, slot); > btrfs_free_path(path); > > - btrfs_item_key_to_cpu(leaf, &key, slot); > if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) > goto out; > > >-chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 16:32 -0400, Chris Mason wrote:> On Mon, 2008-08-18 at 21:20 +0100, David Woodhouse wrote: > > On Mon, 2008-08-18 at 15:47 -0400, Chris Mason wrote: > > > Lets pretend I had put in commments something like the code below. > > > The important part is that directories have only one link, so they > > > have only one backref. > > > > OK. Now can I rip that code out anyway? The VFS will never call > > btrfs_lookup() for ".." -- not since the 2.2 kernel :) > > > > I''m still a little confused about precisely what btrfs_search_slot() > > returns when it doesn''t find a match -- that''s probably where the > > documentation would be more useful. > > > > if btrfs_search_slot returns < 0, there was an error > if btrfs_search_slot returns > 1, path->slots[0] is the spot in the tree > where you''d want to insert the item.What if the parent inode actually _is_ inode #0xffffffffffffffff? Can that happen? In that case it would return zero, and I shouldn''t subtract 1 from the slot number -- I''ve actually found what I''m looking for?> In this case, if path->slots[0] == 0, there are no keys in the tree > smaller than your search key. ^^^^OK... what if the place I''d want to insert the item is the first slot in some node? There are keys in the _tree_ which are smaller, but just not in this node? I think that''s where the root of my confusion lies.> If path->slots[0] == btrfs_header_nritems(path->nodes[0]), there are > no items in this node that have a key > than your search key.^^^^ Not in this node, but maybe in the next one? That''s why my own fix for the bug involved using btrfs_next_leaf() and using the first item from that one?> > + if (ret < 0 && slot == 0) { > > ^^^^^^^^^^^^^ should be ||, and should set ret to > something bad if slot == 0Er, yes. Moment of stupidity there :) We were ignoring ''ret'' anyway -- none of this stuff should ever happen, and it''s all just ''return ERR_PTR(-EINVAL)'' at the out: label. I''ve tested this, and added it to my tree: From: David Woodhouse <David.Woodhouse@intel.com> Date: Mon, 18 Aug 2008 22:50:22 +0100 Subject: [PATCH] Simplify btrfs_get_parent(), fix use-after-free bug Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- export.c | 26 +++++++++++--------------- 1 files changed, 11 insertions(+), 15 deletions(-) diff --git a/export.c b/export.c index 797b4cb..a913b9b 100644 --- a/export.c +++ b/export.c @@ -147,7 +147,6 @@ static struct dentry *btrfs_get_parent(struct dentry *child) struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; - u32 nritems; int slot; u64 objectid; int ret; @@ -156,27 +155,24 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.objectid = dir->i_ino; btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - BUG_ON(ret == 0); - ret = 0; + key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); leaf = path->nodes[0]; slot = path->slots[0]; - nritems = btrfs_header_nritems(leaf); - if (slot >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) { - btrfs_free_path(path); - goto out; - } - leaf = path->nodes[0]; - slot = path->slots[0]; + if (ret < 0 || slot == 0) { + btrfs_free_path(path); + goto out; } + /* btrfs_search_slot() returns the slot where we''d want to insert + an INODE_REF_KEY for parent inode #0xFFFFFFFFFFFFFFFF. The _real_ + one, telling us what the parent inode _actually_ is, will be in + the slot _before_ the one that btrfs_search_slot() returns. */ + slot--; + btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_free_path(path); - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) goto out; -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 18, 2008 at 09:20:39PM +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 15:47 -0400, Chris Mason wrote: > > Lets pretend I had put in commments something like the code below. > > The important part is that directories have only one link, so they > > have only one backref. > > OK. Now can I rip that code out anyway? The VFS will never call > btrfs_lookup() for ".." -- not since the 2.2 kernel :)If you get_parent doesn''t call into the lowlevel lookup code it doesn''t need to handle "..". But most filesystems end up reusing the lookup code for get_parent. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 20:16 -0400, Christoph Hellwig wrote:> If you get_parent doesn''t call into the lowlevel lookup code it doesn''t > need to handle "..". But most filesystems end up reusing the lookup > code for get_parent.We don''t do that in btrfs. It doesn''t really seem like a useful thing to do. -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2008-08-18 at 22:52 +0100, David Woodhouse wrote:> On Mon, 2008-08-18 at 16:32 -0400, Chris Mason wrote: > > On Mon, 2008-08-18 at 21:20 +0100, David Woodhouse wrote: > > > On Mon, 2008-08-18 at 15:47 -0400, Chris Mason wrote: > > > > Lets pretend I had put in commments something like the code below. > > > > The important part is that directories have only one link, so they > > > > have only one backref. > > > > > > OK. Now can I rip that code out anyway? The VFS will never call > > > btrfs_lookup() for ".." -- not since the 2.2 kernel :) > > > > > > I''m still a little confused about precisely what btrfs_search_slot() > > > returns when it doesn''t find a match -- that''s probably where the > > > documentation would be more useful. > > > > > > > if btrfs_search_slot returns < 0, there was an error > > if btrfs_search_slot returns > 1, path->slots[0] is the spot in the tree > > where you''d want to insert the item. > > What if the parent inode actually _is_ inode #0xffffffffffffffff? Can > that happen? In that case it would return zero, and I shouldn''t subtract > 1 from the slot number -- I''ve actually found what I''m looking for? >The max inode will be 2^64 - 1> > In this case, if path->slots[0] == 0, there are no keys in the tree > > smaller than your search key. ^^^^ > > OK... what if the place I''d want to insert the item is the first slot in > some node? There are keys in the _tree_ which are smaller, but just not > in this node? I think that''s where the root of my confusion lies.In this case, you''ll always end up in the last slot of the previous node (which is what you were seeing when you put in the fix).> > > If path->slots[0] == btrfs_header_nritems(path->nodes[0]), there are > > no items in this node that have a key > than your search key. > ^^^^ > Not in this node, but maybe in the next one? That''s why my own fix for > the bug involved using btrfs_next_leaf() and using the first item from > that one?Correct.> > > > + if (ret < 0 && slot == 0) { > > > > ^^^^^^^^^^^^^ should be ||, and should set ret to > > something bad if slot == 0 > > Er, yes. Moment of stupidity there :) > > We were ignoring ''ret'' anyway -- none of this stuff should ever happen, > and it''s all just ''return ERR_PTR(-EINVAL)'' at the out: label. > > I''ve tested this, and added it to my tree: >Looks good, thanks. -chris -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2008-08-19 at 07:54 -0400, Chris Mason wrote:> > > What if the parent inode actually _is_ inode #0xffffffffffffffff? Can > > that happen? In that case it would return zero, and I shouldn''t subtract > > 1 from the slot number -- I''ve actually found what I''m looking for? > > > > The max inode will be 2^64 - 1Which is what we''re searching for -- so it''s _possible_, albeit vanishingly unlikely, that btrfs_search_slot() will actually return zero, having found precisely what we wanted? And in that case, path->slots[0] being zero is fine. And we shouldn''t be subtracting one from it to find the slot we want? -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2008-08-19 at 15:49 +0100, David Woodhouse wrote:> On Tue, 2008-08-19 at 07:54 -0400, Chris Mason wrote: > > > > > What if the parent inode actually _is_ inode #0xffffffffffffffff? Can > > > that happen? In that case it would return zero, and I shouldn''t subtract > > > 1 from the slot number -- I''ve actually found what I''m looking for? > > > > > > > The max inode will be 2^64 - 1 > > Which is what we''re searching for -- so it''s _possible_, albeit > vanishingly unlikely, that btrfs_search_slot() will actually return > zero, having found precisely what we wanted? > > And in that case, path->slots[0] being zero is fine. And we shouldn''t be > subtracting one from it to find the slot we want?Subject: [PATCH] Clean up btrfs_get_parent() a little more, fix a free-after-free bug Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- export.c | 33 +++++++++++++++++++-------------- 1 files changed, 19 insertions(+), 14 deletions(-) diff --git a/export.c b/export.c index 36cbc68..5c75cbd 100644 --- a/export.c +++ b/export.c @@ -165,23 +165,32 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - leaf = path->nodes[0]; - slot = path->slots[0]; - if (ret < 0 || slot == 0) { + if (ret < 0) { + /* Error */ btrfs_free_path(path); - goto out; + return ERR_PTR(ret); + } + if (ret) { + leaf = path->nodes[0]; + slot = path->slots[0]; + /* btrfs_search_slot() returns the slot where we''d want to + insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. + The _real_ backref, telling us what the parent inode + _actually_ is, will be in the slot _before_ the one + that btrfs_search_slot() returns. */ + if (!slot) { + /* Unless there is _no_ key in the tree before... */ + btrfs_free_path(path); + return ERR_PTR(-EIO); + } + slot--; } - /* btrfs_search_slot() returns the slot where we''d want to insert - an INODE_REF_KEY for parent inode #0xFFFFFFFFFFFFFFFF. The _real_ - one, telling us what the parent inode _actually_ is, will be in - the slot _before_ the one that btrfs_search_slot() returns. */ - slot--; btrfs_item_key_to_cpu(leaf, &key, slot); btrfs_free_path(path); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) - goto out; + return ERR_PTR(-EINVAL); objectid = key.offset; @@ -201,10 +210,6 @@ static struct dentry *btrfs_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); return parent; - -out: - btrfs_free_path(path); - return ERR_PTR(-EINVAL); } const struct export_operations btrfs_export_ops = { -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html