wengang wang
2009-Feb-20 09:23 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V4)
changes from v3:
1, move codes that checks inode allocation bit to subfunction
ocfs2_test_inode_bit().
2, release the suballoc lock just after we get it. we should release it asap
and doing so doesn't affect functionility.
3, add inode alloc slot validation.
Signed-off-by: Wengang Wang <wen.gang.wang at oracle.com>
--
dlmglue.c | 45 +++++++++++++++++
dlmglue.h | 2
export.c | 75 +++++++++++++++++++++++++---
inode.c | 24 ++++++++-
inode.h | 1
ocfs2.h | 1
ocfs2_lockid.h | 4 +
suballoc.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
suballoc.h | 2
9 files changed, 295 insertions(+), 9 deletions(-)
Index: dlmglue.h
==================================================================--- dlmglue.h
(revision 139)
+++ dlmglue.h (working copy)
@@ -115,6 +115,8 @@ void ocfs2_super_unlock(struct ocfs2_sup
int ex);
int ocfs2_rename_lock(struct ocfs2_super *osb);
void ocfs2_rename_unlock(struct ocfs2_super *osb);
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
Index: export.c
==================================================================--- export.c
(revision 139)
+++ export.c (working copy)
@@ -38,6 +38,7 @@
#include "inode.h"
#include "buffer_head_io.h"
+#include "suballoc.h"
struct ocfs2_inode_handle
{
@@ -49,29 +50,87 @@ static struct dentry *ocfs2_get_dentry(s
struct ocfs2_inode_handle *handle)
{
struct inode *inode;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ u64 blkno = handle->ih_blkno;
+ int status, set;
struct dentry *result;
mlog_entry("(0x%p, 0x%p)\n", sb, handle);
- if (handle->ih_blkno == 0) {
- mlog_errno(-ESTALE);
- return ERR_PTR(-ESTALE);
+ if (blkno == 0) {
+ mlog(0, "nfs wants inode with blkno: 0\n");
+ result = ERR_PTR(-ESTALE);
+ goto bail;
+ }
+
+ inode = ocfs2_ilookup(sb, blkno);
+ /* found in-memory inode, goes to check generation */
+ if (inode)
+ goto check_gen;
+
+ /* takes nfs_sync_lock in EX mode */
+ status = ocfs2_nfs_sync_lock(osb, 1);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+ goto check_err;
+ }
+
+ status = ocfs2_test_inode_bit(osb, blkno, &set);
+ if (status < 0) {
+ if (status == -EINVAL) {
+ /* meta block never be re-allocated as data block.
+ * nfsd gives us wrong blkno */
+ status = -EEXIST;
+ } else {
+ mlog(ML_ERROR, "test inode bit failed %d\n", status);
+ }
+ goto unlock_nfs_sync;
+ }
+
+ /* allocate bit is clear, inode is a stale inode */
+ if (!set) {
+ status = -ESTALE;
+ goto unlock_nfs_sync;
}
- inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0);
+ inode = ocfs2_iget(osb, blkno, 0, 0);
- if (IS_ERR(inode))
- return (void *)inode;
+unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(osb, 1);
+check_err:
+ if (status < 0) {
+ if (status == -ESTALE) {
+ mlog(0, "stale inode ino: %llu generation: %u\n",
+ blkno, handle->ih_generation);
+ }
+ result = ERR_PTR(status);
+ goto bail;
+ }
+
+ if (IS_ERR(inode)) {
+ mlog_errno((int)inode);
+ result = (void *)inode;
+ goto bail;
+ }
+
+check_gen:
if (handle->ih_generation != inode->i_generation) {
iput(inode);
- return ERR_PTR(-ESTALE);
+ mlog(0, "stale inode ino: %llu generation: %u\n", blkno,
+ handle->ih_generation);
+ result = ERR_PTR(-ESTALE);
+ goto bail;
}
result = d_obtain_alias(inode);
- if (!IS_ERR(result))
+ if (!IS_ERR(result)) {
result->d_op = &ocfs2_dentry_ops;
+ } else {
+ mlog_errno((int)result);
+ }
+bail:
mlog_exit_ptr(result);
return result;
}
Index: inode.c
==================================================================--- inode.c
(revision 139)
+++ inode.c (working copy)
@@ -112,6 +112,17 @@ void ocfs2_get_inode_flags(struct ocfs2_
oi->ip_attr |= OCFS2_DIRSYNC_FL;
}
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno)
+{
+ struct ocfs2_find_inode_args args;
+
+ args.fi_blkno = blkno;
+ args.fi_flags = 0;
+ args.fi_ino = ino_from_blkno(sb, blkno);
+ args.fi_sysfile_type = 0;
+
+ return ilookup5(sb, blkno, ocfs2_find_actor, &args);
+}
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
int sysfile_type)
{
@@ -949,6 +960,13 @@ void ocfs2_delete_inode(struct inode *in
goto bail;
}
+ /* Lock down the nfs_sync lock in PR mode */
+ status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status);
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unblock;
+ }
/* Lock down the inode. This gives us an up to date view of
* it's metadata (for verification), and allows us to
* serialize delete_inode on multiple nodes.
@@ -962,7 +980,7 @@ void ocfs2_delete_inode(struct inode *in
if (status != -ENOENT)
mlog_errno(status);
ocfs2_cleanup_delete_inode(inode, 0);
- goto bail_unblock;
+ goto bail_unlock_nfs_sync;
}
/* Query the cluster. This will be the final decision made
@@ -1005,6 +1023,10 @@ void ocfs2_delete_inode(struct inode *in
bail_unlock_inode:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+
+bail_unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0);
+
bail_unblock:
status = sigprocmask(SIG_SETMASK, &oldset, NULL);
if (status < 0)
Index: inode.h
==================================================================--- inode.h
(revision 139)
+++ inode.h (working copy)
@@ -124,6 +124,7 @@ void ocfs2_drop_inode(struct inode *inod
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_SYSFILE 0x1
#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2
+struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
Index: ocfs2_lockid.h
==================================================================---
ocfs2_lockid.h (revision 139)
+++ ocfs2_lockid.h (working copy)
@@ -47,6 +47,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
OCFS2_LOCK_TYPE_QINFO,
+ OCFS2_LOCK_TYPE_NFS_SYNC,
OCFS2_NUM_LOCK_TYPES
};
@@ -81,6 +82,9 @@ static inline char ocfs2_lock_type_char(
case OCFS2_LOCK_TYPE_QINFO:
c = 'Q';
break;
+ case OCFS2_LOCK_TYPE_NFS_SYNC:
+ c = 'Y';
+ break;
default:
c = '\0';
}
Index: suballoc.c
==================================================================--- suballoc.c
(revision 139)
+++ suballoc.c (working copy)
@@ -2116,3 +2116,153 @@ out:
return ret;
}
+
+/* reads(hit disk) the inode specified by blkno to get suballoc_slot
+ * and suballoc_bit
+ * */
+static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
+ u16 *suballoc_slot, u16 *suballoc_bit)
+{
+ int status;
+ struct buffer_head *inode_bh = NULL;
+ struct ocfs2_dinode *inode_fe;
+
+ mlog_entry("blkno: %llu\n", blkno);
+
+ /* dirty read disk */
+ status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
+ if (status < 0)
+ goto bail;
+
+ inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
+ if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
+ status = -EINVAL;
+ goto bail;
+ }
+
+ if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT &&
+ (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots -1)
{
+ mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u"
+ "this may be caused by file system crash", blkno,
+ (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
+ status = -EINVAL;
+ goto bail;
+ }
+ if (suballoc_slot)
+ *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
+ if (suballoc_bit)
+ *suballoc_bit= le16_to_cpu(inode_fe->i_suballoc_bit);
+
+bail:
+ brelse(inode_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/* test whether bit is SET in allocator bitmap or not.
+ * on success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ * when fails, errno is returned and *res is meaningless.
+ * calls this after you have cluster locked against suballoc, or you may
+ * get a result based on non-up2date contents
+ * */
+static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode
*suballoc,
+ struct buffer_head *alloc_bh, u64 blkno, u16 bit,
+ int *res)
+{
+ struct ocfs2_dinode *alloc_fe;
+ struct ocfs2_group_desc *group;
+ struct buffer_head *group_bh = NULL;
+ u64 bg_blkno;
+ int status;
+
+ mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit);
+
+ alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
+ BUG_ON((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
+
+ bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
+ status = ocfs2_read_blocks_sync(osb, bg_blkno, 1, &group_bh);
+ if (status < 0)
+ goto bail;
+
+ status = ocfs2_check_group_descriptor(osb->sb, alloc_fe, group_bh);
+ if (status < 0)
+ goto bail;
+
+ group = (struct ocfs2_group_desc *) group_bh->b_data;
+ *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
+
+bail:
+ brelse(group_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/* test if the bit, which is for the inode specified by blkno, in suballoc is
+ * set or not.
+ * on success, 0 is returned and *res is 1 for SET; 0 otherwise.
+ * when fails, errno is returned and *res is meaningless.
+ * MAKE SURE to hold nfs_sync_lock to revent ocfs2_delete_inode() on another
+ * node from accessing the same suballoc concurrently.
+ * */
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
+{
+ int status;
+ u16 suballoc_bit = 0, suballoc_slot = 0;
+ struct inode *inode_alloc_inode;
+ struct buffer_head *alloc_bh = NULL;
+
+ /* MAKE SURE nfs_sync_lock is held */
+
+ mlog_entry("blkno: %llu", blkno);
+
+ status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
+ &suballoc_bit);
+ if (status < 0) {
+ mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
+ goto bail;
+ }
+
+ inode_alloc_inode + ocfs2_get_system_file_inode(osb,
INODE_ALLOC_SYSTEM_INODE,
+ suballoc_slot);
+ if (!inode_alloc_inode) {
+ /* the error code could be inaccurate, but we are not able to
+ * get the correct one. */
+ status = -EEXIST;
+ mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
+ (u32)suballoc_slot);
+ goto bail;
+ }
+
+ mutex_lock(&inode_alloc_inode->i_mutex);
+ /* lock down the suballoc lock it to cause other nodes flush disk and
+ * then release it immediately to let allocation on other nodes to go
+ * asap. the reason we can read the group without lock is that we just
+ * want to know if the bit is clear and there can't be a concurrent
+ * clearing action because we hold the nfs_sync_lock. sure that there
+ * can be a concurrent setting action then. it doesn't matter, we can
+ * check generation later. */
+ status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
+ if (status < 0) {
+ mutex_unlock(&inode_alloc_inode->i_mutex);
+ mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
+ (u32)suballoc_slot, status);
+ goto bail;
+ }
+ ocfs2_inode_unlock(inode_alloc_inode, 0);
+ mutex_unlock(&inode_alloc_inode->i_mutex);
+
+ status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
+ blkno, suballoc_bit, res);
+ if (status < 0)
+ mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
+
+ iput(inode_alloc_inode);
+ brelse(alloc_bh);
+bail:
+ mlog_exit(status);
+ return status;
+}
Index: suballoc.h
==================================================================--- suballoc.h
(revision 139)
+++ suballoc.h (working copy)
@@ -186,4 +186,6 @@ int ocfs2_lock_allocators(struct inode *
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
+
+int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res);
#endif /* _CHAINALLOC_H_ */
Index: ocfs2.h
==================================================================--- ocfs2.h
(revision 139)
+++ ocfs2.h (working copy)
@@ -305,6 +305,7 @@ struct ocfs2_super
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
+ struct ocfs2_lock_res osb_nfs_sync_lockres;
struct ocfs2_dlm_debug *osb_dlm_debug;
struct dentry *osb_debug_root;
Index: dlmglue.c
==================================================================--- dlmglue.c
(revision 139)
+++ dlmglue.c (working copy)
@@ -244,6 +244,10 @@ static struct ocfs2_lock_res_ops ocfs2_r
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
+ .flags = 0,
+};
+
static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
.get_osb = ocfs2_get_dentry_osb,
.post_unlock = ocfs2_dentry_post_unlock,
@@ -617,6 +621,17 @@ static void ocfs2_rename_lock_res_init(s
&ocfs2_rename_lops, osb);
}
+static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
+ struct ocfs2_super *osb)
+{
+ /* nfs_sync lockres doesn't come from a slab so we call init
+ * once on it manually. */
+ ocfs2_lock_res_init_once(res);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
+ ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
+ &ocfs2_nfs_sync_lops, osb);
+}
+
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp)
{
@@ -2412,6 +2427,33 @@ void ocfs2_rename_unlock(struct ocfs2_su
ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
}
+int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
+{
+ int status;
+ struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+ if (ocfs2_is_hard_readonly(osb))
+ return -EROFS;
+
+ if (ocfs2_mount_local(osb))
+ return 0;
+
+ status = ocfs2_cluster_lock(osb, lockres, ex?LKM_EXMODE:LKM_PRMODE, 0,
+ 0);
+ if (status < 0)
+ mlog_errno(status);
+
+ return status;
+}
+
+void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
+{
+ struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
+
+ if (!ocfs2_mount_local(osb))
+ ocfs2_cluster_unlock(osb, lockres, ex?LKM_EXMODE:LKM_PRMODE);
+}
+
int ocfs2_dentry_lock(struct dentry *dentry, int ex)
{
int ret;
@@ -2793,6 +2835,7 @@ int ocfs2_dlm_init(struct ocfs2_super *o
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
osb->cconn = conn;
@@ -2828,6 +2871,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_sup
ocfs2_lock_res_free(&osb->osb_super_lockres);
ocfs2_lock_res_free(&osb->osb_rename_lockres);
+ ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
osb->cconn = NULL;
@@ -3010,6 +3054,7 @@ static void ocfs2_drop_osb_locks(struct
{
ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
+ ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
}
int ocfs2_drop_inode_locks(struct inode *inode)
Joel Becker
2009-Feb-25 02:08 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V4)
On Fri, Feb 20, 2009 at 05:23:50PM +0800, wengang wang wrote:> changes from v3: > 1, move codes that checks inode allocation bit to subfunction > ocfs2_test_inode_bit(). > > 2, release the suballoc lock just after we get it. we should release it asap > and doing so doesn't affect functionility. > > 3, add inode alloc slot validation.Almost there!> + /* takes nfs_sync_lock in EX mode */ > + status = ocfs2_nfs_sync_lock(osb, 1); > + if (status < 0) { > + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); > + goto check_err; > + } > + > + status = ocfs2_test_inode_bit(osb, blkno, &set); > + if (status < 0) { > + if (status == -EINVAL) { > + /* meta block never be re-allocated as data block. > + * nfsd gives us wrong blkno */ > + status = -EEXIST; > + } else { > + mlog(ML_ERROR, "test inode bit failed %d\n", status); > + } > + goto unlock_nfs_sync; > + }This looks very nice, but it should return -ESTALE instead of -EEXIST; it most likely does not exist, and nfs knows how to handle that error.> result = d_obtain_alias(inode); > - if (!IS_ERR(result)) > + if (!IS_ERR(result)) { > result->d_op = &ocfs2_dentry_ops; > + } else { > + mlog_errno((int)result); > + }Use PTR_ERR(result), not (int)result.> Index: suballoc.c > ==================================================================> --- suballoc.c (revision 139) > +++ suballoc.c (working copy) > @@ -2116,3 +2116,153 @@ out: > > return ret; > } > + > +/* reads(hit disk) the inode specified by blkno to get suballoc_slot > + * and suballoc_bit > + * */ > +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, > + u16 *suballoc_slot, u16 *suballoc_bit) > +{ > + int status; > + struct buffer_head *inode_bh = NULL; > + struct ocfs2_dinode *inode_fe; > + > + mlog_entry("blkno: %llu\n", blkno); > + > + /* dirty read disk */ > + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); > + if (status < 0) > + goto bail; > + > + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; > + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { > + status = -EINVAL; > + goto bail; > + }Let's add an ML_ERROR here: "invalid inode %llu requested".> + if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && > + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots -1) { > + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u" > + "this may be caused by file system crash", blkno, > + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); > + status = -EINVAL; > + goto bail; > + }Don't print "this may be caused by file system crash", as it is much more likely that nfs is asking for a bad block. Let's jsut say "inode %llu has an invalid suballoc slot %u".> +/* test if the bit, which is for the inode specified by blkno, in suballoc is > + * set or not. > + * on success, 0 is returned and *res is 1 for SET; 0 otherwise. > + * when fails, errno is returned and *res is meaningless. > + * MAKE SURE to hold nfs_sync_lock to revent ocfs2_delete_inode() on another > + * node from accessing the same suballoc concurrently. > + * */ > +int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) > +{ > + int status; > + u16 suballoc_bit = 0, suballoc_slot = 0; > + struct inode *inode_alloc_inode; > + struct buffer_head *alloc_bh = NULL; > + > + /* MAKE SURE nfs_sync_lock is held */ > + > + mlog_entry("blkno: %llu", blkno); > + > + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, > + &suballoc_bit); > + if (status < 0) { > + mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); > + goto bail; > + } > + > + inode_alloc_inode > + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, > + suballoc_slot); > + if (!inode_alloc_inode) { > + /* the error code could be inaccurate, but we are not able to > + * get the correct one. */ > + status = -EEXIST; > + mlog(ML_ERROR, "unable to get alloc inode in slot %u\n", > + (u32)suballoc_slot); > + goto bail; > + } > + > + mutex_lock(&inode_alloc_inode->i_mutex); > + /* lock down the suballoc lock it to cause other nodes flush disk and > + * then release it immediately to let allocation on other nodes to go > + * asap. the reason we can read the group without lock is that we just > + * want to know if the bit is clear and there can't be a concurrent > + * clearing action because we hold the nfs_sync_lock. sure that there > + * can be a concurrent setting action then. it doesn't matter, we can > + * check generation later. */ > + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); > + if (status < 0) { > + mutex_unlock(&inode_alloc_inode->i_mutex); > + mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", > + (u32)suballoc_slot, status); > + goto bail; > + }You need to test the suballoc bit inside the lock. The current code is unsafe because while the inode set value won't change in a way that matters, the structure of the alloc inode's groups may. So move the ocfs2_test_suballoc_bit() call inside the lock here: + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + blkno, suballoc_bit, res); + if (status < 0) + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + ocfs2_inode_unlock(inode_alloc_inode, 0); + mutex_unlock(&inode_alloc_inode->i_mutex); Update the comment, of course. Your points about the nfs_sync_lock are correct. The only change is that we read the group under the lock.> + iput(inode_alloc_inode); > + brelse(alloc_bh); > +bail: > + mlog_exit(status); > + return status; > +}Otherwise this looks good. Joel -- #!/bin/perl -sp0777i<X+d*lMLa^*lN%0]dsXx++lMlN/dsM0<j]dsj $/=unpack('H*',$_);$_=`echo 16dio\U$k"SK$/SM$n\EsN0p[lN*1 lK[d2%Sa2/d0$^Ixp"|dc`;s/\W//g;$_=pack('H*',/((..)*)$/) Joel Becker Principal Software Developer Oracle E-mail: joel.becker at oracle.com Phone: (650) 506-8127
Joel Becker
2009-Feb-25 02:10 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V4)
On Fri, Feb 20, 2009 at 05:23:50PM +0800, wengang wang wrote:> changes from v3: > 1, move codes that checks inode allocation bit to subfunction > ocfs2_test_inode_bit(). > > 2, release the suballoc lock just after we get it. we should release it asap > and doing so doesn't affect functionility. > > 3, add inode alloc slot validation.One last thing, sorry I missed it.> +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, > + struct buffer_head *alloc_bh, u64 blkno, u16 bit, > + int *res) > +{ > + struct ocfs2_dinode *alloc_fe; > + struct ocfs2_group_desc *group; > + struct buffer_head *group_bh = NULL; > + u64 bg_blkno; > + int status; > + > + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); > + > + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; > + BUG_ON((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); > + > + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); > + status = ocfs2_read_blocks_sync(osb, bg_blkno, 1, &group_bh); > + if (status < 0) > + goto bail; > + > + status = ocfs2_check_group_descriptor(osb->sb, alloc_fe, group_bh); > + if (status < 0) > + goto bail;Just use ocfs2_read_group_descriptor() here. The locking code will make sure it reads from disk if necessary. Joel -- One look at the From: understanding has blossomed .procmailrc grows - Alexander Viro Joel Becker Principal Software Developer Oracle E-mail: joel.becker at oracle.com Phone: (650) 506-8127
Tao Ma
2009-Feb-25 02:36 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V4)
Hi wengang, thanks for the work. Just one comment. See below. wengang wang wrote:> changes from v3: > 1, move codes that checks inode allocation bit to subfunction > ocfs2_test_inode_bit(). > > 2, release the suballoc lock just after we get it. we should release it asap > and doing so doesn't affect functionility. > > 3, add inode alloc slot validation. > > Signed-off-by: Wengang Wang <wen.gang.wang at oracle.com><snip>> +/* reads(hit disk) the inode specified by blkno to get suballoc_slot > + * and suballoc_bit > + * */ > +static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, > + u16 *suballoc_slot, u16 *suballoc_bit) > +{ > + int status; > + struct buffer_head *inode_bh = NULL; > + struct ocfs2_dinode *inode_fe; > + > + mlog_entry("blkno: %llu\n", blkno); > + > + /* dirty read disk */ > + status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); > + if (status < 0) > + goto bail; > + > + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; > + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { > + status = -EINVAL; > + goto bail; > + } > + > + if (le16_to_cpu(inode_fe->i_suballoc_slot) != OCFS2_INVALID_SLOT && > + (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots -1) { > + mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u" > + "this may be caused by file system crash", blkno, > + (u32)le16_to_cpu(inode_fe->i_suballoc_slot)); > + status = -EINVAL; > + goto bail; > + } > + if (suballoc_slot) > + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); > + if (suballoc_bit) > + *suballoc_bit= le16_to_cpu(inode_fe->i_suballoc_bit); > + > +bail: > + brelse(inode_bh); > + > + mlog_exit(status); > + return status; > +} > + > +/* test whether bit is SET in allocator bitmap or not. > + * on success, 0 is returned and *res is 1 for SET; 0 otherwise. > + * when fails, errno is returned and *res is meaningless. > + * calls this after you have cluster locked against suballoc, or you may > + * get a result based on non-up2date contents > + * */ > +static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, > + struct buffer_head *alloc_bh, u64 blkno, u16 bit, > + int *res) > +{ > + struct ocfs2_dinode *alloc_fe; > + struct ocfs2_group_desc *group; > + struct buffer_head *group_bh = NULL; > + u64 bg_blkno; > + int status; > + > + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); > + > + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; > + BUG_ON((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain));here, we shouldn't BUG_ON. It actually isn't a kernel bug. the 'bit' is got from the disk by function ocfs2_get_suballoc_slot_bit. So maybe a corrupt inode, maybe something else, but never a bug of ocfs2. ;) So just return error please. Regards, Tao