Hello, Here''s a quick implementation of NFS support for btrfs. It works mostly. But its still a bit buggy. I suspect it has to do something with inode locking. When I unmount and mount a couple of times, it stops working i.e, when I do an ''ls'', it keeps waiting indefinitely. This is seen when accessing the filesystem from a local mount point as well. Can anybody please review and tell me what stupid mistake I''m committing ? Thanks and Regards, Balaji Rao National Institute of Technology, Karnataka --- #include <linux/fs.h> #include <linux/types.h> #include "ctree.h" #include "disk-io.h" #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" static int btrfs_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, int connectable) { struct inode *inode = dentry->d_inode; int len = *max_len; u64 objectid, root_objectid; u32 generation; __le32 *fh = (__force __le32 *) fh_in; if ((len < 5 )|| (connectable && len < 8)) { return 255; } objectid = BTRFS_I(inode)->location.objectid; root_objectid = BTRFS_I(inode)->root->objectid; generation = inode->i_generation; len = 5; fh[0] = cpu_to_le32((u32)(objectid >> 32)); fh[1] = cpu_to_le32((u32)(objectid & 0xffffffff)); fh[2] = cpu_to_le32((u32)(root_objectid >> 32)); fh[3] = cpu_to_le32((u32)(root_objectid & 0xffffffff)); fh[4] = cpu_to_le32(generation); if (connectable && !S_ISDIR(inode->i_mode)) { struct inode *parent; spin_lock(&dentry->d_lock); parent = dentry->d_parent->d_inode; objectid = BTRFS_I(parent)->location.objectid; generation = parent->i_generation; fh[5] = cpu_to_le32((u32)(objectid >> 32)); fh[6] = cpu_to_le32((u32)(objectid & 0xffffffff)); fh[7] = cpu_to_le32(generation); spin_unlock(&dentry->d_lock); len += 3; } *max_len = len; return len; } static struct dentry * btrfs_get_dentry(struct super_block *sb, u64 objectid, u64 root_objectid, u32 generation) { struct inode *inode; struct dentry *result; inode = btrfs_ilookup(sb, objectid, root_objectid); if (IS_ERR(inode)) return (void *)inode; if(generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } result = d_alloc_anon(inode); if(!result) { iput(inode); return ERR_PTR(-ENOMEM); } return result; } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { u64 objectid, root_objectid; u32 generation; if (fh_len < 8 || fh_type != 8) { return NULL; } root_objectid = (u64)le32_to_cpu(fid->raw[2]) << 32; root_objectid |= (u64)le32_to_cpu(fid->raw[3]); objectid = (u64)le32_to_cpu(fid->raw[5]) << 32; objectid |= (u64)le32_to_cpu(fid->raw[6]); generation = le32_to_cpu(fid->raw[7]); return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { u64 objectid, root_objectid; u32 generation; if (fh_len < 5 || fh_type > 5) { return NULL; } objectid = (u64)le32_to_cpu(fid->raw[0]) << 32; objectid |= (u64)le32_to_cpu(fid->raw[1]); root_objectid = (u64)le32_to_cpu(fid->raw[2]) << 32; root_objectid |= (u64)le32_to_cpu(fid->raw[3]); generation = le32_to_cpu(fid->raw[4]); return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; struct inode *inode; struct dentry *parent; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; u32 nritems; int slot; u64 objectid; int ret; path = btrfs_alloc_path(); key.objectid = dir->i_ino; btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); BUG_ON(ret == 0); ret = 0; leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); if (slot >= nritems) return ERR_PTR(-EINVAL); btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) { return ERR_PTR(-EINVAL); } btrfs_free_path(path); objectid = key.offset; inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); parent = d_alloc_anon(inode); if (!parent) { iput(inode); parent = ERR_PTR(-ENOMEM); } return parent; } const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, }; -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jun 29, 2008 at 05:05:31AM +0530, Balaji Rao wrote:> Hello, > > Here''s a quick implementation of NFS support for btrfs. It works mostly. But > its still a bit buggy. I suspect it has to do something with inode locking. > When I unmount and mount a couple of times, it stops working i.e, when I do an > ''ls'', it keeps waiting indefinitely. This is seen when accessing the filesystem > from a local mount point as well. > > Can anybody please review and tell me what stupid mistake I''m committing ? >You may want to send things in patch form. I''d recommend using the hg quilt plugin, its nice for adding patches easily.> > #include <linux/fs.h> > #include <linux/types.h> > #include "ctree.h" > #include "disk-io.h" > #include "btrfs_inode.h" > #include "print-tree.h" > #include "export.h" > > static int btrfs_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, > int connectable) > { struct inode *inode = dentry->d_inode; > int len = *max_len; > u64 objectid, root_objectid; > u32 generation; > __le32 *fh = (__force __le32 *) fh_in; > > if ((len < 5 )|| (connectable && len < 8)) { > return 255; > }Magic numbers are a no-no.> > objectid = BTRFS_I(inode)->location.objectid; > root_objectid = BTRFS_I(inode)->root->objectid; > generation = inode->i_generation; > > len = 5; > fh[0] = cpu_to_le32((u32)(objectid >> 32)); > fh[1] = cpu_to_le32((u32)(objectid & 0xffffffff)); > fh[2] = cpu_to_le32((u32)(root_objectid >> 32)); > fh[3] = cpu_to_le32((u32)(root_objectid & 0xffffffff)); > fh[4] = cpu_to_le32(generation); > > if (connectable && !S_ISDIR(inode->i_mode)) { > struct inode *parent; > > spin_lock(&dentry->d_lock); > > parent = dentry->d_parent->d_inode; > objectid = BTRFS_I(parent)->location.objectid; > generation = parent->i_generation; > > fh[5] = cpu_to_le32((u32)(objectid >> 32)); > fh[6] = cpu_to_le32((u32)(objectid & 0xffffffff)); > fh[7] = cpu_to_le32(generation); > > spin_unlock(&dentry->d_lock); > > len += 3;there is no need to take the d_lock here.> } > > *max_len = len; > return len; > > } > > static struct dentry * btrfs_get_dentry(struct super_block *sb, > u64 objectid, u64 root_objectid, u32 generation) > { > struct inode *inode; > struct dentry *result; > > inode = btrfs_ilookup(sb, objectid, root_objectid); > if (IS_ERR(inode)) > return (void *)inode; > > if(generation != inode->i_generation) { > iput(inode); > return ERR_PTR(-ESTALE); > } > > result = d_alloc_anon(inode); > if(!result) { > iput(inode); > return ERR_PTR(-ENOMEM); > } > > return result; > } > > static struct dentry *btrfs_fh_to_parent(struct super_block *sb, > struct fid *fid, int fh_len, int fh_type) > { > u64 objectid, root_objectid; > u32 generation; > > if (fh_len < 8 || fh_type != 8) { > return NULL; > } > > > root_objectid = (u64)le32_to_cpu(fid->raw[2]) << 32; > root_objectid |= (u64)le32_to_cpu(fid->raw[3]); > objectid = (u64)le32_to_cpu(fid->raw[5]) << 32; > objectid |= (u64)le32_to_cpu(fid->raw[6]); > generation = le32_to_cpu(fid->raw[7]); > > return btrfs_get_dentry(sb, objectid, root_objectid, generation); > } > > static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, > struct fid *fid, int fh_len, int fh_type) > { > u64 objectid, root_objectid; > u32 generation; > > if (fh_len < 5 || fh_type > 5) { > return NULL; > } > > objectid = (u64)le32_to_cpu(fid->raw[0]) << 32; > objectid |= (u64)le32_to_cpu(fid->raw[1]); > root_objectid = (u64)le32_to_cpu(fid->raw[2]) << 32; > root_objectid |= (u64)le32_to_cpu(fid->raw[3]); > generation = le32_to_cpu(fid->raw[4]); > > return btrfs_get_dentry(sb, objectid, root_objectid, generation); > } > > static struct dentry *btrfs_get_parent(struct dentry *child) > { > struct inode *dir = child->d_inode; > struct inode *inode; > struct dentry *parent; > struct btrfs_root *root = BTRFS_I(dir)->root; > struct btrfs_key key; > struct btrfs_path *path; > struct extent_buffer *leaf; > u32 nritems; > int slot; > u64 objectid; > int ret; > > path = btrfs_alloc_path(); > > key.objectid = dir->i_ino; > btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); > key.offset = 0; > ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); > BUG_ON(ret == 0); > ret = 0; > > leaf = path->nodes[0]; > slot = path->slots[0]; > nritems = btrfs_header_nritems(leaf); > if (slot >= nritems) > return ERR_PTR(-EINVAL); > > btrfs_item_key_to_cpu(leaf, &key, slot); > if (key.objectid != dir->i_ino || > key.type != BTRFS_INODE_REF_KEY) { > return ERR_PTR(-EINVAL); > } >in both of the above error cases you don''t free the path, you need to do that.> btrfs_free_path(path); > objectid = key.offset; > inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); > > parent = d_alloc_anon(inode); > if (!parent) { > iput(inode); > parent = ERR_PTR(-ENOMEM); > } > > return parent; > } > > const struct export_operations btrfs_export_ops = { > .encode_fh = btrfs_encode_fh, > .fh_to_dentry = btrfs_fh_to_dentry, > .fh_to_parent = btrfs_fh_to_parent, > .get_parent = btrfs_get_parent, > };I don''t know much about NFS so I can''t really comment on much of anything else other than the obvious problems, but that should point you in a general direction. Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 30, 2008 at 08:50:46PM +0530, Balaji Rao R wrote:> On Mon, Jun 30, 2008 at 8:20 PM, Josef Bacik <jbacik@redhat.com> wrote: > > On Sun, Jun 29, 2008 at 05:05:31AM +0530, Balaji Rao wrote: > >> Hello, > >> > >> Here''s a quick implementation of NFS support for btrfs. It works mostly. But > >> its still a bit buggy. I suspect it has to do something with inode locking. > >> When I unmount and mount a couple of times, it stops working i.e, when I do an > >> ''ls'', it keeps waiting indefinitely. This is seen when accessing the filesystem > >> from a local mount point as well. > >> > >> Can anybody please review and tell me what stupid mistake I''m committing ? > >> > > > > You may want to send things in patch form. I''d recommend using the hg quilt > > plugin, its nice for adding patches easily. > Yes, since this was just an RFC, i thought it would be better to send > just the file I created. > > > >> > >> #include <linux/fs.h> > >> #include <linux/types.h> > >> #include "ctree.h" > >> #include "disk-io.h" > >> #include "btrfs_inode.h" > >> #include "print-tree.h" > >> #include "export.h" > >> > >> static int btrfs_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, > >> int connectable) > >> { struct inode *inode = dentry->d_inode; > >> int len = *max_len; > >> u64 objectid, root_objectid; > >> u32 generation; > >> __le32 *fh = (__force __le32 *) fh_in; > >> > >> if ((len < 5 )|| (connectable && len < 8)) { > >> return 255; > >> } > > > > Magic numbers are a no-no. > OK :) Will change > > > >> > >> objectid = BTRFS_I(inode)->location.objectid; > >> root_objectid = BTRFS_I(inode)->root->objectid; > >> generation = inode->i_generation; > >> > >> len = 5; > >> fh[0] = cpu_to_le32((u32)(objectid >> 32)); > >> fh[1] = cpu_to_le32((u32)(objectid & 0xffffffff)); > >> fh[2] = cpu_to_le32((u32)(root_objectid >> 32)); > >> fh[3] = cpu_to_le32((u32)(root_objectid & 0xffffffff)); > >> fh[4] = cpu_to_le32(generation); > >> > >> if (connectable && !S_ISDIR(inode->i_mode)) { > >> struct inode *parent; > >> > >> spin_lock(&dentry->d_lock); > >> > >> parent = dentry->d_parent->d_inode; > >> objectid = BTRFS_I(parent)->location.objectid; > >> generation = parent->i_generation; > >> > >> fh[5] = cpu_to_le32((u32)(objectid >> 32)); > >> fh[6] = cpu_to_le32((u32)(objectid & 0xffffffff)); > >> fh[7] = cpu_to_le32(generation); > >> > >> spin_unlock(&dentry->d_lock); > >> > >> len += 3; > > > > there is no need to take the d_lock here. > OK. Will remove it. It was done at fs/ocfs2/export.c. So probably i > thought it was necessary. > > <snip> >Ahh see this is where me not being familiar with NFS gets me in trouble, you do need the d_lock in that case. Sorry about that. Josef -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 30, 2008 at 8:20 PM, Josef Bacik <jbacik@redhat.com> wrote:> On Sun, Jun 29, 2008 at 05:05:31AM +0530, Balaji Rao wrote: >> Hello, >> >> Here''s a quick implementation of NFS support for btrfs. It works mostly. But >> its still a bit buggy. I suspect it has to do something with inode locking. >> When I unmount and mount a couple of times, it stops working i.e, when I do an >> ''ls'', it keeps waiting indefinitely. This is seen when accessing the filesystem >> from a local mount point as well. >> >> Can anybody please review and tell me what stupid mistake I''m committing ? >> > > You may want to send things in patch form. I''d recommend using the hg quilt > plugin, its nice for adding patches easily.Yes, since this was just an RFC, i thought it would be better to send just the file I created.> >> >> #include <linux/fs.h> >> #include <linux/types.h> >> #include "ctree.h" >> #include "disk-io.h" >> #include "btrfs_inode.h" >> #include "print-tree.h" >> #include "export.h" >> >> static int btrfs_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, >> int connectable) >> { struct inode *inode = dentry->d_inode; >> int len = *max_len; >> u64 objectid, root_objectid; >> u32 generation; >> __le32 *fh = (__force __le32 *) fh_in; >> >> if ((len < 5 )|| (connectable && len < 8)) { >> return 255; >> } > > Magic numbers are a no-no.OK :) Will change> >> >> objectid = BTRFS_I(inode)->location.objectid; >> root_objectid = BTRFS_I(inode)->root->objectid; >> generation = inode->i_generation; >> >> len = 5; >> fh[0] = cpu_to_le32((u32)(objectid >> 32)); >> fh[1] = cpu_to_le32((u32)(objectid & 0xffffffff)); >> fh[2] = cpu_to_le32((u32)(root_objectid >> 32)); >> fh[3] = cpu_to_le32((u32)(root_objectid & 0xffffffff)); >> fh[4] = cpu_to_le32(generation); >> >> if (connectable && !S_ISDIR(inode->i_mode)) { >> struct inode *parent; >> >> spin_lock(&dentry->d_lock); >> >> parent = dentry->d_parent->d_inode; >> objectid = BTRFS_I(parent)->location.objectid; >> generation = parent->i_generation; >> >> fh[5] = cpu_to_le32((u32)(objectid >> 32)); >> fh[6] = cpu_to_le32((u32)(objectid & 0xffffffff)); >> fh[7] = cpu_to_le32(generation); >> >> spin_unlock(&dentry->d_lock); >> >> len += 3; > > there is no need to take the d_lock here.OK. Will remove it. It was done at fs/ocfs2/export.c. So probably i thought it was necessary. <snip>>> static struct dentry *btrfs_get_parent(struct dentry *child) >> { >> struct inode *dir = child->d_inode; >> struct inode *inode; >> struct dentry *parent; >> struct btrfs_root *root = BTRFS_I(dir)->root; >> struct btrfs_key key; >> struct btrfs_path *path; >> struct extent_buffer *leaf; >> u32 nritems; >> int slot; >> u64 objectid; >> int ret; >> >> path = btrfs_alloc_path(); >> >> key.objectid = dir->i_ino; >> btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); >> key.offset = 0; >> ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); >> BUG_ON(ret == 0); >> ret = 0; >> >> leaf = path->nodes[0]; >> slot = path->slots[0]; >> nritems = btrfs_header_nritems(leaf); >> if (slot >= nritems) >> return ERR_PTR(-EINVAL); >> >> btrfs_item_key_to_cpu(leaf, &key, slot); >> if (key.objectid != dir->i_ino || >> key.type != BTRFS_INODE_REF_KEY) { >> return ERR_PTR(-EINVAL); >> } >> > > in both of the above error cases you don''t free the path, you need to do that.Yes. Will do. Thanks for the review! -- warm regards Balaji Rao -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Jun 29, 2008 at 05:05:31AM +0530, Balaji Rao wrote:> len = 5; > fh[0] = cpu_to_le32((u32)(objectid >> 32)); > fh[1] = cpu_to_le32((u32)(objectid & 0xffffffff)); > fh[2] = cpu_to_le32((u32)(root_objectid >> 32)); > fh[3] = cpu_to_le32((u32)(root_objectid & 0xffffffff)); > fh[4] = cpu_to_le32(generation);Please use C struct types for the file handles. Take a look at fs/xfs/linux-2.6/xfs_export.[ch] for an example.> static struct dentry * btrfs_get_dentry(struct super_block *sb, > u64 objectid, u64 root_objectid, u32 generation) > { > struct inode *inode; > struct dentry *result; > > inode = btrfs_ilookup(sb, objectid, root_objectid); > if (IS_ERR(inode)) > return (void *)inode;This one will only find inodes when they already are in memory. You need to something similar to btrfs_lookup minus the btrfs_inode_by_name call here. In fact it would make a lot of sense to factor that part of btrfs_lookup out into a btrfs_iget helper that takes a struct btrfs_key pointer and returns the inode for you. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jun 30, 2008 at 11:08:11AM -0400, Josef Bacik wrote:> > >> > > >> spin_unlock(&dentry->d_lock); > > >> > > >> len += 3; > > > > > > there is no need to take the d_lock here. > > OK. Will remove it. It was done at fs/ocfs2/export.c. So probably i > > thought it was necessary. > > > > <snip> > > > > Ahh see this is where me not being familiar with NFS gets me in trouble, you do > need the d_lock in that case. Sorry about that.It''s not really about nfs, dereferencing ->d_parent needs d_lock. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Balaji Rao <balajirrao@gmail.com> This patch introduces a btrfs_iget helper to be used in NFS support. Signed-off-by: Balaji Rao <balajirrao@gmail.com> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- ctree.h | 2 ++ inode.c | 55 +++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/ctree.h b/ctree.h index c88f1e1..411d576 100644 --- a/ctree.h +++ b/ctree.h @@ -1698,6 +1698,8 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, struct btrfs_root *root); struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, u64 root_objectid); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, diff --git a/inode.c b/inode.c index a26d365..5344526 100644 --- a/inode.c +++ b/inode.c @@ -1881,6 +1881,33 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, return inode; } +/* Get an inode object given its location and corresponding root. + * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-EACCES); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; + } + + return inode; +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -1889,7 +1916,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret, do_orphan = 0; + int ret, new, do_orphan = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1907,23 +1934,15 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - - inode = btrfs_iget_locked(dir->i_sb, location.objectid, - sub_root); - if (!inode) - return ERR_PTR(-EACCES); - if (inode->i_state & I_NEW) { - /* the inode and parent dir are two different roots */ - if (sub_root != root) { - igrab(inode); - sub_root->inode = inode; - do_orphan = 1; - } - BTRFS_I(inode)->root = sub_root; - memcpy(&BTRFS_I(inode)->location, &location, - sizeof(location)); - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + /* the inode and parent dir are two different roots */ + if (new && root != sub_root) { + igrab(inode); + sub_root->inode = inode; + do_orphan = 1; } } -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Balaji Rao <balajirrao@gmail.com> Here''s an implementation of NFS support for btrfs. It relies on the fixes which are going in to 2.6.28 for the NFS readdir/lookup deadlock. This uses the btrfs_iget helper introduced previously. [dwmw2: Tidy up a little, switch to d_obtain_alias() w/compat.] Signed-off-by: Balaji Rao <balajirrao@gmail.com> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- Makefile | 2 +- compat.h | 10 ++++ export.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ export.h | 17 ++++++ super.c | 2 + 5 files changed, 204 insertions(+), 1 deletions(-) create mode 100644 export.c create mode 100644 export.h diff --git a/Makefile b/Makefile index a4b3817..75f8818 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o + ref-cache.o export.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/compat.h b/compat.h index b3349a6..1040f07 100644 --- a/compat.h +++ b/compat.h @@ -5,6 +5,16 @@ #define trylock_page(page) (!TestSetPageLocked(page)) #endif +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) +static inline struct dentry *d_obtain_alias(struct inode *inode) +{ + struct dentry *d = d_alloc_anon(inode); + if (!d) + iput(inode); + return d; +} +#endif + /* * Even if AppArmor isn''t enabled, it still has different prototypes. * Add more distro/version pairs here to declare which has AppArmor applied. diff --git a/export.c b/export.c new file mode 100644 index 0000000..253080a --- /dev/null +++ b/export.c @@ -0,0 +1,174 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +/* The size of encoded fh is the type number of the fh itself */ +#define BTRFS_FID_NON_CONNECTABLE 5 +#define BTRFS_FID_CONNECTABLE 8 + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + + if ((len < BTRFS_FID_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_CONNECTABLE)) + return 255; + + len = BTRFS_FID_NON_CONNECTABLE; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + + spin_unlock(&dentry->d_lock); + len = BTRFS_FID_CONNECTABLE; + } + *max_len = len; + + /* We return length itself for the type */ + return len; + +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct dentry *result; + struct btrfs_key key; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + + root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return result; +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type != BTRFS_FID_CONNECTABLE) + return NULL; + + root_objectid = fid->root_objectid; + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type > BTRFS_FID_CONNECTABLE) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct inode *inode; + struct dentry *parent; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 nritems; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + BUG_ON(ret == 0); + ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + goto out; + + btrfs_free_path(path); + objectid = key.offset; + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + + parent = d_obtain_alias(inode); + if (!parent) + parent = ERR_PTR(-ENOMEM); + + return parent; + +out: + btrfs_free_path(path); + return ERR_PTR(-EINVAL); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/export.h b/export.h new file mode 100644 index 0000000..019ca3a --- /dev/null +++ b/export.h @@ -0,0 +1,17 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; +} __attribute__ ((packed)); + +#endif diff --git a/super.c b/super.c index eb4b357..e830e0e 100644 --- a/super.c +++ b/super.c @@ -46,6 +46,7 @@ #include "xattr.h" #include "volumes.h" #include "version.h" +#include "export.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -303,6 +304,7 @@ static int btrfs_fill_super(struct super_block * sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_POSIXACL; -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
David Woodhouse
2008-Aug-12 13:46 UTC
[PATCH 3/3] Implement our own copy of the nfsd readdir hack, for older kernels
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- inode.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 102 insertions(+), 2 deletions(-) diff --git a/inode.c b/inode.c index 5344526..faa5543 100644 --- a/inode.c +++ b/inode.c @@ -1956,7 +1956,8 @@ static unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; -static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2105,6 +2106,101 @@ err: return ret; } +/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd + will call the file system''s ->lookup() method from within its + filldir callback, which in turn was called from the file system''s + ->readdir() method. And will deadlock for many file systems. */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) + +struct nfshack_dirent { + u64 ino; + loff_t offset; + int namlen; + unsigned int d_type; + char name[]; +}; + +struct nfshack_readdir { + char *dirent; + size_t used; +}; + + + +static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct nfshack_readdir *buf = __buf; + struct nfshack_dirent *de = (void *)(buf->dirent + buf->used); + unsigned int reclen; + + reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64)); + if (buf->used + reclen > PAGE_SIZE) + return -EINVAL; + + de->namlen = namlen; + de->offset = offset; + de->ino = ino; + de->d_type = d_type; + memcpy(de->name, name, namlen); + buf->used += reclen; + + return 0; +} + +static int btrfs_nfshack_readdir(struct file *file, void *dirent, + filldir_t filldir) +{ + struct nfshack_readdir buf; + struct nfshack_dirent *de; + int err; + int size; + loff_t offset; + + buf.dirent = (void *)__get_free_page(GFP_KERNEL); + if (!buf.dirent) + return -ENOMEM; + + offset = file->f_pos; + + while (1) { + unsigned int reclen; + + buf.used = 0; + + err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir); + if (err) + break; + + size = buf.used; + + if (!size) + break; + + de = (struct nfshack_dirent *)buf.dirent; + while (size > 0) { + offset = de->offset; + + if (filldir(dirent, de->name, de->namlen, de->offset, + de->ino, de->d_type)) + goto done; + offset = file->f_pos; + + reclen = ALIGN(sizeof(*de) + de->namlen, + sizeof(u64)); + size -= reclen; + de = (struct nfshack_dirent *)((char *)de + reclen); + } + } + + done: + free_page((unsigned long)buf.dirent); + file->f_pos = offset; + + return err; +} +#endif + int btrfs_write_inode(struct inode *inode, int wait) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3661,7 +3757,11 @@ static struct inode_operations btrfs_dir_ro_inode_operations = { static struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = btrfs_readdir, +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) + .readdir = btrfs_nfshack_readdir, +#else /* NFSd readdir/lookup deadlock is fixed */ + .readdir = btrfs_real_readdir, +#endif .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, > + struct btrfs_root *root, int *is_new) > +{ > + struct inode *inode; > + > + inode = btrfs_iget_locked(s, location->objectid, root); > + if (!inode) > + return ERR_PTR(-EACCES); > + > + if (inode->i_state & I_NEW) { > + BTRFS_I(inode)->root = root; > + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); > + btrfs_read_locked_inode(inode); > + unlock_new_inode(inode); > + if (is_new) > + *is_new = 1; > + } else { > + if (is_new) > + *is_new = 0; > + } > + > + return inode;Please leave the unlock_new_inode to the caller and kill the is_new parameter. Gives much nicer to read code, and generally making sure the inode is 100% ready and correct before unlocking it is a good idea, too so that e.g. no one can actually access an inode marked bad due to a generation number mismatch before it''s marked bad.> +} > + > static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, > struct nameidata *nd) > { > @@ -1889,7 +1916,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, > struct btrfs_root *root = bi->root; > struct btrfs_root *sub_root = root; > struct btrfs_key location; > - int ret, do_orphan = 0; > + int ret, new, do_orphan = 0; > > if (dentry->d_name.len > BTRFS_NAME_LEN) > return ERR_PTR(-ENAMETOOLONG); > @@ -1907,23 +1934,15 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, > return ERR_PTR(ret); > if (ret > 0) > return ERR_PTR(-ENOENT); > - > - inode = btrfs_iget_locked(dir->i_sb, location.objectid, > - sub_root); > - if (!inode) > - return ERR_PTR(-EACCES); > - if (inode->i_state & I_NEW) { > - /* the inode and parent dir are two different roots */ > - if (sub_root != root) { > - igrab(inode); > - sub_root->inode = inode; > - do_orphan = 1; > - } > - BTRFS_I(inode)->root = sub_root; > - memcpy(&BTRFS_I(inode)->location, &location, > - sizeof(location)); > - btrfs_read_locked_inode(inode); > - unlock_new_inode(inode); > + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); > + if (IS_ERR(inode)) > + return ERR_CAST(inode); > + > + /* the inode and parent dir are two different roots */ > + if (new && root != sub_root) { > + igrab(inode); > + sub_root->inode = inode; > + do_orphan = 1; > } > } > > -- > 1.5.5.1 > > > -- > David Woodhouse Open Source Technology Centre > David.Woodhouse@intel.com Intel Corporation > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html---end quoted text--- -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Aug 12, 2008 at 02:46:46PM +0100, David Woodhouse wrote:> +static inline struct dentry *d_obtain_alias(struct inode *inode) > +{ > + struct dentry *d = d_alloc_anon(inode); > + if (!d) > + iput(inode); > + return d; > +} > +#endifI''m not sure when al wants to merge with Linus, but the for-next naming makes it sound like the tree is the .28 queue. Also please take the full implementation of d_obtain_alias from http://git.kernel.org/?p=linux/kernel/git/viro/vfs-2.6.git;a=commitdiff;h=10cdb734be3c4175b977ba18eafbaba8e5716291 please. Your light implementation will crash and burn when btrfs_iget returns an error.> +/* The size of encoded fh is the type number of the fh itself */ > +#define BTRFS_FID_NON_CONNECTABLE 5 > +#define BTRFS_FID_CONNECTABLE 8That was an assumption in the very old code, but I think it''s a very bad idea. Just give your filehandles a uniqueue type number, e.g. in the 0x4? range so that people looking at nfs traffic using a packet analyzer know what kind of fhs they are actually dealing with.> + if (IS_ERR(inode)) > + return (void *)inode; > + > + if (generation != inode->i_generation) { > + iput(inode); > + return ERR_PTR(-ESTALE); > + } > + > + result = d_alloc_anon(inode); > + if (!result) { > + iput(inode); > + return ERR_PTR(-ENOMEM); > + }Didn''t you intend to use d_obtain_alias? -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2008-08-12 at 14:51 -0400, Christoph Hellwig wrote:> On Tue, Aug 12, 2008 at 02:46:46PM +0100, David Woodhouse wrote: > > +static inline struct dentry *d_obtain_alias(struct inode *inode) > > +{ > > + struct dentry *d = d_alloc_anon(inode); > > + if (!d) > > + iput(inode); > > + return d; > > +} > > +#endif > > I''m not sure when al wants to merge with Linus, but the for-next naming > makes it sound like the tree is the .28 queue. Also please take the > full implementation of d_obtain_alias from > > http://git.kernel.org/?p=linux/kernel/git/viro/vfs-2.6.git;a=commitdiff;h=10cdb734be3c4175b977ba18eafbaba8e5716291 > > please. Your light implementation will crash and burn when btrfs_iget > returns an error.Well spotted; thanks.> > +/* The size of encoded fh is the type number of the fh itself */ > > +#define BTRFS_FID_NON_CONNECTABLE 5 > > +#define BTRFS_FID_CONNECTABLE 8 > > That was an assumption in the very old code, but I think it''s a very > bad idea. Just give your filehandles a uniqueue type number, e.g. in > the 0x4? range so that people looking at nfs traffic using a packet > analyzer know what kind of fhs they are actually dealing with.Ok, I''ll submit the patch to include/linux/exportfs.h separately -- the numbers might as well be reserved immediately. Btw, I''m not convinced nfsd does the right thing when we return 255 from ->encode_fh(). It seems to leak a refcount somewhere in nfs code, so I can''t stop the server and can''t unmount the file system.> > + if (IS_ERR(inode)) > > + return (void *)inode; > > + > > + if (generation != inode->i_generation) { > > + iput(inode); > > + return ERR_PTR(-ESTALE); > > + } > > + > > + result = d_alloc_anon(inode); > > + if (!result) { > > + iput(inode); > > + return ERR_PTR(-ENOMEM); > > + } > > Didn''t you intend to use d_obtain_alias?Oops, yes. I did use it once; I missed the second one though. I''ll merge this into the patch in question... diff --git a/compat.h b/compat.h index 1040f07..d45fb37 100644 --- a/compat.h +++ b/compat.h @@ -8,7 +8,14 @@ #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) static inline struct dentry *d_obtain_alias(struct inode *inode) { - struct dentry *d = d_alloc_anon(inode); + struct dentry *d; + + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d = d_alloc_anon(inode); if (!d) iput(inode); return d; diff --git a/export.c b/export.c index 253080a..34e4631 100644 --- a/export.c +++ b/export.c @@ -7,9 +7,13 @@ #include "export.h" #include "compat.h" -/* The size of encoded fh is the type number of the fh itself */ -#define BTRFS_FID_NON_CONNECTABLE 5 -#define BTRFS_FID_CONNECTABLE 8 +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) +#define FILEID_BTRFS_WITHOUT_PARENT 0x4e +#define FILEID_BTRFS_WITH_PARENT 0x4f +#endif + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE (sizeof(struct btrfs_fid)/4) static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int connectable) @@ -17,12 +21,14 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, struct btrfs_fid *fid = (struct btrfs_fid *)fh; struct inode *inode = dentry->d_inode; int len = *max_len; + int type; - if ((len < BTRFS_FID_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_CONNECTABLE)) + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) return 255; - len = BTRFS_FID_NON_CONNECTABLE; + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; fid->objectid = BTRFS_I(inode)->location.objectid; fid->root_objectid = BTRFS_I(inode)->root->objectid; @@ -38,13 +44,12 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, fid->parent_gen = parent->i_generation; spin_unlock(&dentry->d_lock); - len = BTRFS_FID_CONNECTABLE; + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; } - *max_len = len; - - /* We return length itself for the type */ - return len; + *max_len = len; + return type; } static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, @@ -69,11 +74,9 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); } - result = d_alloc_anon(inode); - if (!result) { - iput(inode); + result = d_obtain_alias(inode); + if (!result) return ERR_PTR(-ENOMEM); - } return result; } @@ -85,7 +88,8 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, u64 objectid, root_objectid; u32 generation; - if (fh_type != BTRFS_FID_CONNECTABLE) + if (fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) return NULL; root_objectid = fid->root_objectid; @@ -102,7 +106,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, u64 objectid, root_objectid; u32 generation; - if (fh_type > BTRFS_FID_CONNECTABLE) + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) return NULL; objectid = fid->objectid; -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2008-08-12 at 14:46 -0400, Christoph Hellwig wrote:> Please leave the unlock_new_inode to the caller and kill the is_new > parameter. Gives much nicer to read code, and generally making sure the > inode is 100% ready and correct before unlocking it is a good idea, too > so that e.g. no one can actually access an inode marked bad due to a > generation number mismatch before it''s marked bad.Unconvinced -- I don''t like the idea that we''d sometimes return a locked inode, sometimes not, and force the callers to handle that. And the inode _is_ ready and correct. We don''t mark inodes bad because of a generation number mismatch -- we just return -ESTALE and drop our refcount on the inode. The inode itself is fine and there''s no issue with it being unlocked. The code in btrfs_lookup() which goes... /* the inode and parent dir are two different roots */ if (new && root != sub_root) { igrab(inode); sub_root->inode = inode; do_orphan = 1; } ... should also be fine when the inode is unlocked, too. -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2008-08-13 at 10:07 +0100, David Woodhouse wrote:> /* the inode and parent dir are two different roots */ > if (new && root != sub_root) { > igrab(inode); > sub_root->inode = inode; > do_orphan = 1; > }Hmmmm.... looking at that, don''t we need to store the _parent_ root object ID in the file handle too, if it can be different from the child? Something like this, perhaps. Given that I think we''ve already given up on exporting by NFSv2, do we care about another 8 bytes in the fh size? diff --git a/export.c b/export.c index 34e4631..1b8875c 100644 --- a/export.c +++ b/export.c @@ -8,12 +8,14 @@ #include "compat.h" #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) -#define FILEID_BTRFS_WITHOUT_PARENT 0x4e -#define FILEID_BTRFS_WITH_PARENT 0x4f +#define FILEID_BTRFS_WITHOUT_PARENT 0x4d +#define FILEID_BTRFS_WITH_PARENT 0x4e +#define FILEID_BTRFS_WITH_PARENT_ROOT 0x4f #endif #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4) -#define BTRFS_FID_SIZE_CONNECTABLE (sizeof(struct btrfs_fid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4) static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int connectable) @@ -36,16 +38,25 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, if (connectable && !S_ISDIR(inode->i_mode)) { struct inode *parent; + u64 parent_root_id; spin_lock(&dentry->d_lock); parent = dentry->d_parent->d_inode; fid->parent_objectid = BTRFS_I(parent)->location.objectid; fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; spin_unlock(&dentry->d_lock); - len = BTRFS_FID_SIZE_CONNECTABLE; - type = FILEID_BTRFS_WITH_PARENT; + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } } *max_len = len; @@ -88,11 +99,17 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, u64 objectid, root_objectid; u32 generation; - if (fh_type != FILEID_BTRFS_WITH_PARENT || - fh_len != BTRFS_FID_SIZE_CONNECTABLE) + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else return NULL; - root_objectid = fid->root_objectid; objectid = fid->parent_objectid; generation = fid->parent_gen; @@ -108,6 +125,8 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, if ((fh_type != FILEID_BTRFS_WITH_PARENT || fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && (fh_type != FILEID_BTRFS_WITHOUT_PARENT || fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) return NULL; diff --git a/export.h b/export.h index 019ca3a..074348a 100644 --- a/export.h +++ b/export.h @@ -12,6 +12,8 @@ struct btrfs_fid { u64 parent_objectid; u32 parent_gen; + + u64 parent_root_objectid; } __attribute__ ((packed)); #endif -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Balaji Rao <balajirrao@gmail.com> Here''s an implementation of NFS support for btrfs. It relies on the fixes which are going in to 2.6.28 for the NFS readdir/lookup deadlock. This uses the btrfs_iget helper introduced previously. [dwmw2: Tidy up a little, switch to d_obtain_alias() w/compat routine, change fh_type, store parent''s root object ID where needed] Signed-off-by: Balaji Rao <balajirrao@gmail.com> Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> --- This replaces patch 2 of the sequence I sent out yesterday; the other two patches remain the same. Makefile | 2 +- compat.h | 17 +++++ export.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ export.h | 19 ++++++ super.c | 2 + 5 files changed, 239 insertions(+), 1 deletions(-) create mode 100644 export.c create mode 100644 export.h diff --git a/Makefile b/Makefile index a4b3817..75f8818 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o bit-radix.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o + ref-cache.o export.o btrfs-$(CONFIG_FS_POSIX_ACL) += acl.o else diff --git a/compat.h b/compat.h index b3349a6..d45fb37 100644 --- a/compat.h +++ b/compat.h @@ -5,6 +5,23 @@ #define trylock_page(page) (!TestSetPageLocked(page)) #endif +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27) +static inline struct dentry *d_obtain_alias(struct inode *inode) +{ + struct dentry *d; + + if (!inode) + return NULL; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + d = d_alloc_anon(inode); + if (!d) + iput(inode); + return d; +} +#endif + /* * Even if AppArmor isn''t enabled, it still has different prototypes. * Add more distro/version pairs here to declare which has AppArmor applied. diff --git a/export.c b/export.c new file mode 100644 index 0000000..1b8875c --- /dev/null +++ b/export.c @@ -0,0 +1,200 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) +#define FILEID_BTRFS_WITHOUT_PARENT 0x4d +#define FILEID_BTRFS_WITH_PARENT 0x4e +#define FILEID_BTRFS_WITH_PARENT_ROOT 0x4f +#endif + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, parent_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, parent_root_objectid)/4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid)/4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct dentry *result; + struct btrfs_key key; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + + root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid); + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + result = d_obtain_alias(inode); + if (!result) + return ERR_PTR(-ENOMEM); + + return result; +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct inode *inode; + struct dentry *parent; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 nritems; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + BUG_ON(ret == 0); + ret = 0; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + goto out; + + btrfs_free_path(path); + objectid = key.offset; + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + + parent = d_obtain_alias(inode); + if (!parent) + parent = ERR_PTR(-ENOMEM); + + return parent; + +out: + btrfs_free_path(path); + return ERR_PTR(-EINVAL); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/export.h b/export.h new file mode 100644 index 0000000..074348a --- /dev/null +++ b/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/super.c b/super.c index eb4b357..e830e0e 100644 --- a/super.c +++ b/super.c @@ -46,6 +46,7 @@ #include "xattr.h" #include "volumes.h" #include "version.h" +#include "export.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -303,6 +304,7 @@ static int btrfs_fill_super(struct super_block * sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_POSIXACL; -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@intel.com Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html