wengang wang
2008-Dec-12 09:00 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V2).
Changes from V1: 1) separate lines into sub-functions. 2) use mlog(0, .. instead of mlog(ML_NOTICE, .. for stale error. #patch based on ocfs2 1.4 git. Signed-off-by: Wengang Wang <wen.gang.wang at oracle.com> -- cluster/masklog.h | 2 - dlmglue.c | 31 +++++++++------ export.c | 92 +++++++++++++++++++++++++++++++++++++++++----- inode.c | 107 +++++++++++++++++++++++++++++++++++------------------- inode.h | 1 suballoc.c | 80 ++++++++++++++++++++++++++++++++++++++++ suballoc.h | 9 ++++ 7 files changed, 262 insertions(+), 60 deletions(-) Index: fs/ocfs2/cluster/masklog.h ==================================================================--- fs/ocfs2/cluster/masklog.h (revision 38) +++ fs/ocfs2/cluster/masklog.h (working copy) @@ -114,7 +114,7 @@ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ +#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) Index: fs/ocfs2/export.c ==================================================================--- fs/ocfs2/export.c (revision 38) +++ fs/ocfs2/export.c (working copy) @@ -38,6 +38,8 @@ #include "inode.h" #include "buffer_head_io.h" +#include "sysfile.h" +#include "suballoc.h" struct ocfs2_inode_handle { @@ -48,24 +50,92 @@ struct ocfs2_inode_handle static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) { struct ocfs2_inode_handle *handle = vobjp; - struct inode *inode; + struct ocfs2_super *osb = OCFS2_SB(sb); struct dentry *result; - + struct inode *inode, *inode_alloc_inode; + struct buffer_head *alloc_bh = NULL; + u64 blkno = handle->ih_blkno; + unsigned short suballoc_bit, suballoc_slot; + int status, set; + mlog_entry("(0x%p, 0x%p)\n", sb, handle); - if (handle->ih_blkno == 0) { - mlog_errno(-ESTALE); - return ERR_PTR(-ESTALE); + if (blkno == 0) { + mlog(0, "nfs wants inode with blkno: 0\n"); + result = ERR_PTR(-ESTALE); + goto bail; + } + + inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno)); + /* found in-memory inode, goes to check generation */ + if (inode) + goto check_gen; + + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, + &suballoc_bit, 1); + if (status < 0) + goto check_err; + + inode_alloc_inode + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + status = -EEXIST; + goto check_err; } - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); + if (status < 0) + goto unlock_mutex; + + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, + blkno, suballoc_bit, &set); + if (status < 0) + goto inode_unlock; + + /* allocate bit is clear, inode is a stale inode */ + if (!set) { + status = -ESTALE; + goto inode_unlock; + } + + inode = ocfs2_iget(OCFS2_SB(sb), blkno, 0, 0); + +inode_unlock: + ocfs2_inode_unlock(inode_alloc_inode, 0); +unlock_mutex: + mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); + brelse(alloc_bh); + +check_err: + + if (status < 0) { + if (status == -ESTALE) { + mlog(0, "stale inode ino: %llu generation: %u\n", + blkno, handle->ih_generation); + } else { + mlog_errno(status); + } - if (IS_ERR(inode)) - return (void *)inode; + result = ERR_PTR(status); + goto bail; + } + if (IS_ERR(inode)) { + mlog_errno((int)inode); + result = (void *)inode; + goto bail; + } + +check_gen: if (handle->ih_generation != inode->i_generation) { iput(inode); - return ERR_PTR(-ESTALE); + mlog(0, "stale inode ino: %llu generation: %u\n", blkno, + handle->ih_generation); + result = ERR_PTR(-ESTALE); + goto bail; } result = d_alloc_anon(inode); @@ -73,10 +143,12 @@ static struct dentry *ocfs2_get_dentry(s if (!result) { iput(inode); mlog_errno(-ENOMEM); - return ERR_PTR(-ENOMEM); + result = ERR_PTR(-ENOMEM); + goto bail; } result->d_op = &ocfs2_dentry_ops; +bail: mlog_exit_ptr(result); return result; } Index: fs/ocfs2/inode.c ==================================================================--- fs/ocfs2/inode.c (revision 38) +++ fs/ocfs2/inode.c (working copy) @@ -111,6 +111,17 @@ void ocfs2_get_inode_flags(struct ocfs2_ oi->ip_attr |= OCFS2_DIRSYNC_FL; } +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) +{ + struct ocfs2_find_inode_args args; + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + args.fi_sysfile_type = 0; + + return ilookup5(sb, blkno, ocfs2_find_actor, &args); +} struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { @@ -571,41 +582,26 @@ out: return status; } -static int ocfs2_remove_inode(struct inode *inode, +/* callers must have cluster lock inode_alloc_inode taken before calling + * ocfs2_remove_inode + * */ +static int ocfs2_remove_inode(struct inode *inode_alloc_inode, + struct buffer_head *suballoc_bh, + struct inode *inode, struct buffer_head *di_bh, struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh) { int status; - struct inode *inode_alloc_inode = NULL; - struct buffer_head *inode_alloc_bh = NULL; handle_t *handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - inode_alloc_inode - ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, - le16_to_cpu(di->i_suballoc_slot)); - if (!inode_alloc_inode) { - status = -EEXIST; - mlog_errno(status); - goto bail; - } - - mutex_lock(&inode_alloc_inode->i_mutex); - status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); - if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); - - mlog_errno(status); - goto bail; - } - handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); - goto bail_unlock; + goto bail; } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, @@ -635,19 +631,13 @@ static int ocfs2_remove_inode(struct ino ocfs2_remove_from_cache(inode, di_bh); status = ocfs2_free_dinode(handle, inode_alloc_inode, - inode_alloc_bh, di); + suballoc_bh, di); if (status < 0) mlog_errno(status); bail_commit: ocfs2_commit_trans(osb, handle); -bail_unlock: - ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); - brelse(inode_alloc_bh); bail: - iput(inode_alloc_inode); - return status; } @@ -687,7 +677,9 @@ static void ocfs2_signal_wipe_completion wake_up(&osb->osb_wipe_event); } -static int ocfs2_wipe_inode(struct inode *inode, +static int ocfs2_wipe_inode(struct inode *inode_alloc_inode, + struct buffer_head *suballoc_bh, + struct inode *inode, struct buffer_head *di_bh) { int status, orphaned_slot; @@ -734,8 +726,8 @@ static int ocfs2_wipe_inode(struct inode goto bail_unlock_dir; } - status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, - orphan_dir_bh); + status = ocfs2_remove_inode(inode_alloc_inode, suballoc_bh, inode, + di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) mlog_errno(status); @@ -904,7 +896,9 @@ void ocfs2_delete_inode(struct inode *in { int wipe, status; sigset_t blocked, oldset; - struct buffer_head *di_bh = NULL; + struct buffer_head *di_bh = NULL, *suballoc_bh = NULL; + struct inode *inode_alloc_inode = NULL; + unsigned short suballoc_slot; mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); @@ -933,6 +927,40 @@ void ocfs2_delete_inode(struct inode *in goto bail; } + /* lock the suballoc inode first before locking against inode + * its self + * */ + + /* here we don't have cluster locked against inode, so we may + * reads out non-up2date contents. + * but because suballoc_slot never changes, reading out non-up2date + * contents is no problem. + * */ + status = ocfs2_get_suballoc_slot_bit(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + &suballoc_slot, NULL, 0); + if (status < 0) { + mlog_errno(status); + goto bail_sigmask; + } + + inode_alloc_inode + ocfs2_get_system_file_inode(OCFS2_SB(inode->i_sb), + INODE_ALLOC_SYSTEM_INODE, + suballoc_slot); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail_sigmask; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_inode_lock(inode_alloc_inode, &suballoc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail_mutex; + } + /* Lock down the inode. This gives us an up to date view of * it's metadata (for verification), and allows us to * serialize delete_inode on multiple nodes. @@ -946,7 +974,7 @@ void ocfs2_delete_inode(struct inode *in if (status != -ENOENT) mlog_errno(status); ocfs2_cleanup_delete_inode(inode, 0); - goto bail_unblock; + goto bail_unlock_alloc_inode; } /* Query the cluster. This will be the final decision made @@ -968,7 +996,8 @@ void ocfs2_delete_inode(struct inode *in ocfs2_cleanup_delete_inode(inode, 0); - status = ocfs2_wipe_inode(inode, di_bh); + status = ocfs2_wipe_inode(inode_alloc_inode, suballoc_bh, inode, + di_bh); if (status < 0) { if (status != -EDEADLK) mlog_errno(status); @@ -989,7 +1018,13 @@ void ocfs2_delete_inode(struct inode *in bail_unlock_inode: ocfs2_inode_unlock(inode, 1); brelse(di_bh); -bail_unblock: +bail_unlock_alloc_inode: + ocfs2_inode_unlock(inode_alloc_inode, 1); + brelse(suballoc_bh); +bail_mutex: + mutex_unlock(&inode_alloc_inode->i_mutex); + iput(inode_alloc_inode); +bail_sigmask: status = sigprocmask(SIG_SETMASK, &oldset, NULL); if (status < 0) mlog_errno(status); Index: fs/ocfs2/suballoc.c ==================================================================--- fs/ocfs2/suballoc.c (revision 38) +++ fs/ocfs2/suballoc.c (working copy) @@ -1891,3 +1891,83 @@ static inline void ocfs2_debug_suballoc_ (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* reads(hit disk when dirty_read is true) the inode specified by blkno to get + * suballoc_slot and suballoc_bit + * */ +int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + unsigned short *suballoc_slot, + unsigned short *suballoc_bit, + int dirty_read) +{ + int status; + struct buffer_head *inode_bh = NULL; + struct ocfs2_dinode *inode_fe; + int flags = 0; + + mlog_entry("blkno: %llu\n", blkno); + if (!dirty_read) + flags |= OCFS2_BH_CACHED; + status = ocfs2_read_block(osb, blkno, &inode_bh, flags, NULL); + if (status < 0) + goto bail; + + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { + status = -EINVAL; + goto bail; + } + + if (suballoc_slot) + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); + if (suballoc_bit) + *suballoc_bit= le16_to_cpu(inode_fe->i_suballoc_bit); + +bail: + brelse(inode_bh); + + mlog_exit(status); + return status; +} + +/* test bit bit is SET in allocator bitmap or not. + * on success, 0 is returned and *res is 1 for SET; 0 otherwise. + * when fails, errno is returned and *res is meaningless. + * calls this after you have cluster locked against suballoc, or you may + * get a result based on non-up2date contents + * */ +int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, + struct buffer_head *alloc_bh, u64 blkno, + unsigned short bit, int *res) +{ + struct ocfs2_dinode *alloc_fe; + struct ocfs2_group_desc *group; + struct buffer_head *group_bh = NULL; + u64 bg_blkno; + int status; + + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); + BUG_ON(!res); + + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; + BUG_ON((bit + 1) > + ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); + + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, + suballoc); + if (status < 0) + goto bail; + + group = (struct ocfs2_group_desc *) group_bh->b_data; + status = ocfs2_check_group_descriptor(osb->sb, alloc_fe, group); + if (status < 0) + goto bail; + + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); +bail: + brelse(group_bh); + + mlog_exit(status); + return status; +} Index: fs/ocfs2/suballoc.h ==================================================================--- fs/ocfs2/suballoc.h (revision 38) +++ fs/ocfs2/suballoc.h (working copy) @@ -156,4 +156,13 @@ u64 ocfs2_which_cluster_group(struct ino int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct ocfs2_group_desc *gd); + +int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, + unsigned short *suballoc_slot, + unsigned short *suballoc_bit, + int dirty_read); + +int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, + struct buffer_head *alloc_bh, u64 blkno, + unsigned short bit, int *res); #endif /* _CHAINALLOC_H_ */ Index: fs/ocfs2/dlmglue.c ==================================================================--- fs/ocfs2/dlmglue.c (revision 38) +++ fs/ocfs2/dlmglue.c (working copy) @@ -1940,19 +1940,24 @@ static int ocfs2_inode_lock_update(struc status = -EIO; goto bail_refresh; } - mlog_bug_on_msg(inode->i_generation !- le32_to_cpu(fe->i_generation), - "Invalid dinode %llu disk generation: %u " - "inode->i_generation: %u\n", - (unsigned long long)oi->ip_blkno, - le32_to_cpu(fe->i_generation), - inode->i_generation); - mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), - "Stale dinode %llu dtime: %llu flags: 0x%x\n", - (unsigned long long)oi->ip_blkno, - (unsigned long long)le64_to_cpu(fe->i_dtime), - le32_to_cpu(fe->i_flags)); + if (inode->i_generation != le32_to_cpu(fe->i_generation)) { + mlog(0, "Stale inode %llu disk generation: %u " + "inode generation: %u\n", + (unsigned long long)oi->ip_blkno, + le32_to_cpu(fe->i_generation), + inode->i_generation); + status = -ESTALE; + goto bail_refresh; + } + if (le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(0, "Stale inode %llu dtime: %llu flags: 0x%x\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + status = -ESTALE; + goto bail_refresh; + } ocfs2_refresh_inode(inode, fe); ocfs2_track_lock_refresh(lockres); Index: fs/ocfs2/inode.h ==================================================================--- fs/ocfs2/inode.h (revision 38) +++ fs/ocfs2/inode.h (working copy) @@ -126,6 +126,7 @@ void ocfs2_drop_inode(struct inode *inod /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); int ocfs2_inode_init_private(struct inode *inode);
wengang wang
2008-Dec-25 01:31 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V2).
How about this patch? thanks, wengang. wengang wang wrote:> Changes from V1: > 1) separate lines into sub-functions. > 2) use mlog(0, .. instead of mlog(ML_NOTICE, .. for stale error. > > #patch based on ocfs2 1.4 git. > > Signed-off-by: Wengang Wang <wen.gang.wang at oracle.com> > -- > cluster/masklog.h | 2 - > dlmglue.c | 31 +++++++++------ > export.c | 92 +++++++++++++++++++++++++++++++++++++++++----- > inode.c | 107 +++++++++++++++++++++++++++++++++++------------------- > inode.h | 1 > suballoc.c | 80 ++++++++++++++++++++++++++++++++++++++++ > suballoc.h | 9 ++++ > 7 files changed, 262 insertions(+), 60 deletions(-) > > Index: fs/ocfs2/cluster/masklog.h > ==================================================================> --- fs/ocfs2/cluster/masklog.h (revision 38) > +++ fs/ocfs2/cluster/masklog.h (working copy) > @@ -114,7 +114,7 @@ > #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ > /* bits that are infrequently given and frequently matched in the high word */ > #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ > -#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ > +#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ > #define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ > > #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) > Index: fs/ocfs2/export.c > ==================================================================> --- fs/ocfs2/export.c (revision 38) > +++ fs/ocfs2/export.c (working copy) > @@ -38,6 +38,8 @@ > #include "inode.h" > > #include "buffer_head_io.h" > +#include "sysfile.h" > +#include "suballoc.h" > > struct ocfs2_inode_handle > { > @@ -48,24 +50,92 @@ struct ocfs2_inode_handle > static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) > { > struct ocfs2_inode_handle *handle = vobjp; > - struct inode *inode; > + struct ocfs2_super *osb = OCFS2_SB(sb); > struct dentry *result; > - > + struct inode *inode, *inode_alloc_inode; > + struct buffer_head *alloc_bh = NULL; > + u64 blkno = handle->ih_blkno; > + unsigned short suballoc_bit, suballoc_slot; > + int status, set; > + > mlog_entry("(0x%p, 0x%p)\n", sb, handle); > > - if (handle->ih_blkno == 0) { > - mlog_errno(-ESTALE); > - return ERR_PTR(-ESTALE); > + if (blkno == 0) { > + mlog(0, "nfs wants inode with blkno: 0\n"); > + result = ERR_PTR(-ESTALE); > + goto bail; > + } > + > + inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno)); > + /* found in-memory inode, goes to check generation */ > + if (inode) > + goto check_gen; > + > + status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, > + &suballoc_bit, 1); > + if (status < 0) > + goto check_err; > + > + inode_alloc_inode > + ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, > + suballoc_slot); > + if (!inode_alloc_inode) { > + status = -EEXIST; > + goto check_err; > } > > - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0, 0); > + mutex_lock(&inode_alloc_inode->i_mutex); > + status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); > + if (status < 0) > + goto unlock_mutex; > + > + status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, > + blkno, suballoc_bit, &set); > + if (status < 0) > + goto inode_unlock; > + > + /* allocate bit is clear, inode is a stale inode */ > + if (!set) { > + status = -ESTALE; > + goto inode_unlock; > + } > + > + inode = ocfs2_iget(OCFS2_SB(sb), blkno, 0, 0); > + > +inode_unlock: > + ocfs2_inode_unlock(inode_alloc_inode, 0); > +unlock_mutex: > + mutex_unlock(&inode_alloc_inode->i_mutex); > + iput(inode_alloc_inode); > + brelse(alloc_bh); > + > +check_err: > + > + if (status < 0) { > + if (status == -ESTALE) { > + mlog(0, "stale inode ino: %llu generation: %u\n", > + blkno, handle->ih_generation); > + } else { > + mlog_errno(status); > + } > > - if (IS_ERR(inode)) > - return (void *)inode; > + result = ERR_PTR(status); > + goto bail; > + } > > + if (IS_ERR(inode)) { > + mlog_errno((int)inode); > + result = (void *)inode; > + goto bail; > + } > + > +check_gen: > if (handle->ih_generation != inode->i_generation) { > iput(inode); > - return ERR_PTR(-ESTALE); > + mlog(0, "stale inode ino: %llu generation: %u\n", blkno, > + handle->ih_generation); > + result = ERR_PTR(-ESTALE); > + goto bail; > } > > result = d_alloc_anon(inode); > @@ -73,10 +143,12 @@ static struct dentry *ocfs2_get_dentry(s > if (!result) { > iput(inode); > mlog_errno(-ENOMEM); > - return ERR_PTR(-ENOMEM); > + result = ERR_PTR(-ENOMEM); > + goto bail; > } > result->d_op = &ocfs2_dentry_ops; > > +bail: > mlog_exit_ptr(result); > return result; > } > Index: fs/ocfs2/inode.c > ==================================================================> --- fs/ocfs2/inode.c (revision 38) > +++ fs/ocfs2/inode.c (working copy) > @@ -111,6 +111,17 @@ void ocfs2_get_inode_flags(struct ocfs2_ > oi->ip_attr |= OCFS2_DIRSYNC_FL; > } > > +struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) > +{ > + struct ocfs2_find_inode_args args; > + > + args.fi_blkno = blkno; > + args.fi_flags = 0; > + args.fi_ino = ino_from_blkno(sb, blkno); > + args.fi_sysfile_type = 0; > + > + return ilookup5(sb, blkno, ocfs2_find_actor, &args); > +} > struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, > int sysfile_type) > { > @@ -571,41 +582,26 @@ out: > return status; > } > > -static int ocfs2_remove_inode(struct inode *inode, > +/* callers must have cluster lock inode_alloc_inode taken before calling > + * ocfs2_remove_inode > + * */ > +static int ocfs2_remove_inode(struct inode *inode_alloc_inode, > + struct buffer_head *suballoc_bh, > + struct inode *inode, > struct buffer_head *di_bh, > struct inode *orphan_dir_inode, > struct buffer_head *orphan_dir_bh) > { > int status; > - struct inode *inode_alloc_inode = NULL; > - struct buffer_head *inode_alloc_bh = NULL; > handle_t *handle; > struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; > > - inode_alloc_inode > - ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, > - le16_to_cpu(di->i_suballoc_slot)); > - if (!inode_alloc_inode) { > - status = -EEXIST; > - mlog_errno(status); > - goto bail; > - } > - > - mutex_lock(&inode_alloc_inode->i_mutex); > - status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); > - if (status < 0) { > - mutex_unlock(&inode_alloc_inode->i_mutex); > - > - mlog_errno(status); > - goto bail; > - } > - > handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); > if (IS_ERR(handle)) { > status = PTR_ERR(handle); > mlog_errno(status); > - goto bail_unlock; > + goto bail; > } > > status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, > @@ -635,19 +631,13 @@ static int ocfs2_remove_inode(struct ino > ocfs2_remove_from_cache(inode, di_bh); > > status = ocfs2_free_dinode(handle, inode_alloc_inode, > - inode_alloc_bh, di); > + suballoc_bh, di); > if (status < 0) > mlog_errno(status); > > bail_commit: > ocfs2_commit_trans(osb, handle); > -bail_unlock: > - ocfs2_inode_unlock(inode_alloc_inode, 1); > - mutex_unlock(&inode_alloc_inode->i_mutex); > - brelse(inode_alloc_bh); > bail: > - iput(inode_alloc_inode); > - > return status; > } > > @@ -687,7 +677,9 @@ static void ocfs2_signal_wipe_completion > wake_up(&osb->osb_wipe_event); > } > > -static int ocfs2_wipe_inode(struct inode *inode, > +static int ocfs2_wipe_inode(struct inode *inode_alloc_inode, > + struct buffer_head *suballoc_bh, > + struct inode *inode, > struct buffer_head *di_bh) > { > int status, orphaned_slot; > @@ -734,8 +726,8 @@ static int ocfs2_wipe_inode(struct inode > goto bail_unlock_dir; > } > > - status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, > - orphan_dir_bh); > + status = ocfs2_remove_inode(inode_alloc_inode, suballoc_bh, inode, > + di_bh, orphan_dir_inode, orphan_dir_bh); > if (status < 0) > mlog_errno(status); > > @@ -904,7 +896,9 @@ void ocfs2_delete_inode(struct inode *in > { > int wipe, status; > sigset_t blocked, oldset; > - struct buffer_head *di_bh = NULL; > + struct buffer_head *di_bh = NULL, *suballoc_bh = NULL; > + struct inode *inode_alloc_inode = NULL; > + unsigned short suballoc_slot; > > mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); > > @@ -933,6 +927,40 @@ void ocfs2_delete_inode(struct inode *in > goto bail; > } > > + /* lock the suballoc inode first before locking against inode > + * its self > + * */ > + > + /* here we don't have cluster locked against inode, so we may > + * reads out non-up2date contents. > + * but because suballoc_slot never changes, reading out non-up2date > + * contents is no problem. > + * */ > + status = ocfs2_get_suballoc_slot_bit(OCFS2_SB(inode->i_sb), > + OCFS2_I(inode)->ip_blkno, > + &suballoc_slot, NULL, 0); > + if (status < 0) { > + mlog_errno(status); > + goto bail_sigmask; > + } > + > + inode_alloc_inode > + ocfs2_get_system_file_inode(OCFS2_SB(inode->i_sb), > + INODE_ALLOC_SYSTEM_INODE, > + suballoc_slot); > + if (!inode_alloc_inode) { > + status = -EEXIST; > + mlog_errno(status); > + goto bail_sigmask; > + } > + > + mutex_lock(&inode_alloc_inode->i_mutex); > + status = ocfs2_inode_lock(inode_alloc_inode, &suballoc_bh, 1); > + if (status < 0) { > + mlog_errno(status); > + goto bail_mutex; > + } > + > /* Lock down the inode. This gives us an up to date view of > * it's metadata (for verification), and allows us to > * serialize delete_inode on multiple nodes. > @@ -946,7 +974,7 @@ void ocfs2_delete_inode(struct inode *in > if (status != -ENOENT) > mlog_errno(status); > ocfs2_cleanup_delete_inode(inode, 0); > - goto bail_unblock; > + goto bail_unlock_alloc_inode; > } > > /* Query the cluster. This will be the final decision made > @@ -968,7 +996,8 @@ void ocfs2_delete_inode(struct inode *in > > ocfs2_cleanup_delete_inode(inode, 0); > > - status = ocfs2_wipe_inode(inode, di_bh); > + status = ocfs2_wipe_inode(inode_alloc_inode, suballoc_bh, inode, > + di_bh); > if (status < 0) { > if (status != -EDEADLK) > mlog_errno(status); > @@ -989,7 +1018,13 @@ void ocfs2_delete_inode(struct inode *in > bail_unlock_inode: > ocfs2_inode_unlock(inode, 1); > brelse(di_bh); > -bail_unblock: > +bail_unlock_alloc_inode: > + ocfs2_inode_unlock(inode_alloc_inode, 1); > + brelse(suballoc_bh); > +bail_mutex: > + mutex_unlock(&inode_alloc_inode->i_mutex); > + iput(inode_alloc_inode); > +bail_sigmask: > status = sigprocmask(SIG_SETMASK, &oldset, NULL); > if (status < 0) > mlog_errno(status); > Index: fs/ocfs2/suballoc.c > ==================================================================> --- fs/ocfs2/suballoc.c (revision 38) > +++ fs/ocfs2/suballoc.c (working copy) > @@ -1891,3 +1891,83 @@ static inline void ocfs2_debug_suballoc_ > (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); > } > } > + > +/* reads(hit disk when dirty_read is true) the inode specified by blkno to get > + * suballoc_slot and suballoc_bit > + * */ > +int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, > + unsigned short *suballoc_slot, > + unsigned short *suballoc_bit, > + int dirty_read) > +{ > + int status; > + struct buffer_head *inode_bh = NULL; > + struct ocfs2_dinode *inode_fe; > + int flags = 0; > + > + mlog_entry("blkno: %llu\n", blkno); > + if (!dirty_read) > + flags |= OCFS2_BH_CACHED; > + status = ocfs2_read_block(osb, blkno, &inode_bh, flags, NULL); > + if (status < 0) > + goto bail; > + > + inode_fe = (struct ocfs2_dinode *) inode_bh->b_data; > + if (!OCFS2_IS_VALID_DINODE(inode_fe)) { > + status = -EINVAL; > + goto bail; > + } > + > + if (suballoc_slot) > + *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); > + if (suballoc_bit) > + *suballoc_bit= le16_to_cpu(inode_fe->i_suballoc_bit); > + > +bail: > + brelse(inode_bh); > + > + mlog_exit(status); > + return status; > +} > + > +/* test bit bit is SET in allocator bitmap or not. > + * on success, 0 is returned and *res is 1 for SET; 0 otherwise. > + * when fails, errno is returned and *res is meaningless. > + * calls this after you have cluster locked against suballoc, or you may > + * get a result based on non-up2date contents > + * */ > +int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, > + struct buffer_head *alloc_bh, u64 blkno, > + unsigned short bit, int *res) > +{ > + struct ocfs2_dinode *alloc_fe; > + struct ocfs2_group_desc *group; > + struct buffer_head *group_bh = NULL; > + u64 bg_blkno; > + int status; > + > + mlog_entry("blkno: %llu bit: %u\n", blkno, (unsigned int)bit); > + BUG_ON(!res); > + > + alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data; > + BUG_ON((bit + 1) > > + ocfs2_bits_per_group(&alloc_fe->id2.i_chain)); > + > + bg_blkno = ocfs2_which_suballoc_group(blkno, bit); > + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, > + suballoc); > + if (status < 0) > + goto bail; > + > + group = (struct ocfs2_group_desc *) group_bh->b_data; > + status = ocfs2_check_group_descriptor(osb->sb, alloc_fe, group); > + if (status < 0) > + goto bail; > + > + *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap); > +bail: > + brelse(group_bh); > + > + mlog_exit(status); > + return status; > +} > Index: fs/ocfs2/suballoc.h > ==================================================================> --- fs/ocfs2/suballoc.h (revision 38) > +++ fs/ocfs2/suballoc.h (working copy) > @@ -156,4 +156,13 @@ u64 ocfs2_which_cluster_group(struct ino > int ocfs2_check_group_descriptor(struct super_block *sb, > struct ocfs2_dinode *di, > struct ocfs2_group_desc *gd); > + > +int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, > + unsigned short *suballoc_slot, > + unsigned short *suballoc_bit, > + int dirty_read); > + > +int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, > + struct buffer_head *alloc_bh, u64 blkno, > + unsigned short bit, int *res); > #endif /* _CHAINALLOC_H_ */ > Index: fs/ocfs2/dlmglue.c > ==================================================================> --- fs/ocfs2/dlmglue.c (revision 38) > +++ fs/ocfs2/dlmglue.c (working copy) > @@ -1940,19 +1940,24 @@ static int ocfs2_inode_lock_update(struc > status = -EIO; > goto bail_refresh; > } > - mlog_bug_on_msg(inode->i_generation !> - le32_to_cpu(fe->i_generation), > - "Invalid dinode %llu disk generation: %u " > - "inode->i_generation: %u\n", > - (unsigned long long)oi->ip_blkno, > - le32_to_cpu(fe->i_generation), > - inode->i_generation); > - mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || > - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), > - "Stale dinode %llu dtime: %llu flags: 0x%x\n", > - (unsigned long long)oi->ip_blkno, > - (unsigned long long)le64_to_cpu(fe->i_dtime), > - le32_to_cpu(fe->i_flags)); > + if (inode->i_generation != le32_to_cpu(fe->i_generation)) { > + mlog(0, "Stale inode %llu disk generation: %u " > + "inode generation: %u\n", > + (unsigned long long)oi->ip_blkno, > + le32_to_cpu(fe->i_generation), > + inode->i_generation); > + status = -ESTALE; > + goto bail_refresh; > + } > + if (le64_to_cpu(fe->i_dtime) || > + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + mlog(0, "Stale inode %llu dtime: %llu flags: 0x%x\n", > + (unsigned long long)oi->ip_blkno, > + (unsigned long long)le64_to_cpu(fe->i_dtime), > + le32_to_cpu(fe->i_flags)); > + status = -ESTALE; > + goto bail_refresh; > + } > > ocfs2_refresh_inode(inode, fe); > ocfs2_track_lock_refresh(lockres); > Index: fs/ocfs2/inode.h > ==================================================================> --- fs/ocfs2/inode.h (revision 38) > +++ fs/ocfs2/inode.h (working copy) > @@ -126,6 +126,7 @@ void ocfs2_drop_inode(struct inode *inod > /* Flags for ocfs2_iget() */ > #define OCFS2_FI_FLAG_SYSFILE 0x1 > #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 > +struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); > struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, > int sysfile_type); > int ocfs2_inode_init_private(struct inode *inode); > > _______________________________________________ > Ocfs2-devel mailing list > Ocfs2-devel at oss.oracle.com > http://oss.oracle.com/mailman/listinfo/ocfs2-devel >
Joel Becker
2009-Jan-30 01:11 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V2).
On Fri, Dec 12, 2008 at 05:00:44PM +0800, wengang wang wrote:> Changes from V1: > 1) separate lines into sub-functions. > 2) use mlog(0, .. instead of mlog(ML_NOTICE, .. for stale error.Wengang, Sorry it took me so long to review this. You're going to be slightly amused - we're going to be using some of your original ideas, though with modification.> Index: fs/ocfs2/export.c > ==================================================================> --- fs/ocfs2/export.c (revision 38) > +++ fs/ocfs2/export.c (working copy) > @@ -38,6 +38,8 @@ > #include "inode.h" > > #include "buffer_head_io.h" > +#include "sysfile.h" > +#include "suballoc.h" > > struct ocfs2_inode_handle > { > @@ -48,24 +50,92 @@ struct ocfs2_inode_handle > static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) > { > struct ocfs2_inode_handle *handle = vobjp; > - struct inode *inode; > + struct ocfs2_super *osb = OCFS2_SB(sb); > struct dentry *result; > - > + struct inode *inode, *inode_alloc_inode; > + struct buffer_head *alloc_bh = NULL; > + u64 blkno = handle->ih_blkno; > + unsigned short suballoc_bit, suballoc_slot; > + int status, set; > + > mlog_entry("(0x%p, 0x%p)\n", sb, handle); > > - if (handle->ih_blkno == 0) { > - mlog_errno(-ESTALE); > - return ERR_PTR(-ESTALE); > + if (blkno == 0) { > + mlog(0, "nfs wants inode with blkno: 0\n"); > + result = ERR_PTR(-ESTALE); > + goto bail; > + } > + > + inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno));You need to call ocfs2_ilookup() with the real blkno, not the ino_from_blkno() value. You'll find that ocfs2_ilookup() is doing ino_from_blkno for args.fi_ino. Otherwise, your changes to ocfs2_get_dentry() fit your patch nicely.> @@ -933,6 +927,40 @@ void ocfs2_delete_inode(struct inode *in > goto bail; > } > > + /* lock the suballoc inode first before locking against inode > + * its self > + * */ > + > + /* here we don't have cluster locked against inode, so we may > + * reads out non-up2date contents. > + * but because suballoc_slot never changes, reading out non-up2date > + * contents is no problem. > + * */ > + status = ocfs2_get_suballoc_slot_bit(OCFS2_SB(inode->i_sb), > + OCFS2_I(inode)->ip_blkno, > + &suballoc_slot, NULL, 0); > + if (status < 0) { > + mlog_errno(status); > + goto bail_sigmask; > + } > + > + inode_alloc_inode > + ocfs2_get_system_file_inode(OCFS2_SB(inode->i_sb), > + INODE_ALLOC_SYSTEM_INODE, > + suballoc_slot); > + if (!inode_alloc_inode) { > + status = -EEXIST; > + mlog_errno(status); > + goto bail_sigmask; > + } > + > + mutex_lock(&inode_alloc_inode->i_mutex); > + status = ocfs2_inode_lock(inode_alloc_inode, &suballoc_bh, 1); > + if (status < 0) { > + mlog_errno(status); > + goto bail_mutex; > + } > +Here is where the current patch runs into problems. You moved the locking of the alloc inode up to the top of ocfs2_delete_inode() in order to avoid the ABBA deadlock with ocfs2_get_dentry(). Unfortunately, this is a performance disaster. When a node closes a file that needs deletion, it gets the inode lock and checks to see if it can wipe. Only the last closer can wipe - the others do nothing. But with this change, even nodes that will skip wiping have to lock the alloc inode. Thus, the alloc inode lock is pinged around the cluster for every delete. This will really hurt. What we'd like to do is keep the lock of the alloc_inode inside ocfs2_remove_inode(), and yet avoid the ABBA deadlock. Mark proposed the answer, which is a little bit like your original solution. If you recall, you originally wanted to add a dealloc lock to every inode. This isn't workable, because it's extra work for the "normal" case of deletion, and it adds space usage for every inode. In addition, your old code forced a checkpoint for every deletion, regardless of whether it was needed. Now, the change to ocfs2_get_dentry() to get the suballoc lock provides the appropriate journal flushing. It only happens when ocfs2_get_dentry() is caled, and doesn't hurt the "normal" case. But what about the deadlock? We'd like you to add a lock on the ocfs2_super, the "nfs_sync_lock". This will be a new lock type, and there is only one per filesystem. During ocfs2_delete_inode(), we hold this lock as a PR. This means all nodes can do ocfs2_delete_inode() concurrently. In fact, due to lock caching, all nodes will acquire the lock once and never release it unless ocfs2_get_dentry() is called. Thus, the normal case has no performance impact. In ocfs2_get_dentry(), if ocfs2_ilookup() fails, we take the nfs_sync_lock in EX mode before we take the alloc_inode lock. Because the nfs_sync_lock blocks out ocfs2_delete_inode(), there is no ordering concern with the alloc_inode and inode locks. There can be no ABBA deadlock. Becasue we only take the lock when ocfs2_ilookup() fails, we only will slow down when the inode is not in cache. That only happens when an NFS server reboots or a client is silent for a long time. Thus, the normal NFS case is not impacted by this change either.> Index: fs/ocfs2/dlmglue.c > ==================================================================> --- fs/ocfs2/dlmglue.c (revision 38) > +++ fs/ocfs2/dlmglue.c (working copy) > @@ -1940,19 +1940,24 @@ static int ocfs2_inode_lock_update(struc > status = -EIO; > goto bail_refresh; > } > - mlog_bug_on_msg(inode->i_generation !> - le32_to_cpu(fe->i_generation), > - "Invalid dinode %llu disk generation: %u " > - "inode->i_generation: %u\n", > - (unsigned long long)oi->ip_blkno, > - le32_to_cpu(fe->i_generation), > - inode->i_generation); > - mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || > - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), > - "Stale dinode %llu dtime: %llu flags: 0x%x\n", > - (unsigned long long)oi->ip_blkno, > - (unsigned long long)le64_to_cpu(fe->i_dtime), > - le32_to_cpu(fe->i_flags)); > + if (inode->i_generation != le32_to_cpu(fe->i_generation)) { > + mlog(0, "Stale inode %llu disk generation: %u " > + "inode generation: %u\n", > + (unsigned long long)oi->ip_blkno, > + le32_to_cpu(fe->i_generation), > + inode->i_generation); > + status = -ESTALE; > + goto bail_refresh; > + } > + if (le64_to_cpu(fe->i_dtime) || > + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + mlog(0, "Stale inode %llu dtime: %llu flags: 0x%x\n", > + (unsigned long long)oi->ip_blkno, > + (unsigned long long)le64_to_cpu(fe->i_dtime), > + le32_to_cpu(fe->i_flags)); > + status = -ESTALE; > + goto bail_refresh; > + }I don't think we need this change anymore. Because you called ilookup(), we know that the inode is not in the inode cache. We'll create it new when you call iget. Joel -- "Always give your best, never get discouraged, never be petty; always remember, others may hate you. Those who hate you don't win unless you hate them. And then you destroy yourself." - Richard M. Nixon Joel Becker Principal Software Developer Oracle E-mail: joel.becker at oracle.com Phone: (650) 506-8127
Wengang Wang
2009-Feb-10 01:33 UTC
[Ocfs2-devel] [PATCH 1/1] OCFS2: anti stale inode for nfs (V2).
Hi Joel, Joel Becker wrote:> On Fri, Dec 12, 2008 at 05:00:44PM +0800, wengang wang wrote: >> Changes from V1: >> 1) separate lines into sub-functions. >> 2) use mlog(0, .. instead of mlog(ML_NOTICE, .. for stale error. > > Wengang, > Sorry it took me so long to review this. You're going to be > slightly amused - we're going to be using some of your original ideas, > though with modification.very good :)> >> Index: fs/ocfs2/export.c >> ==================================================================>> --- fs/ocfs2/export.c (revision 38) >> +++ fs/ocfs2/export.c (working copy) >> @@ -38,6 +38,8 @@ >> #include "inode.h" >> >> #include "buffer_head_io.h" >> +#include "sysfile.h" >> +#include "suballoc.h" >> >> struct ocfs2_inode_handle >> { >> @@ -48,24 +50,92 @@ struct ocfs2_inode_handle >> static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) >> { >> struct ocfs2_inode_handle *handle = vobjp; >> - struct inode *inode; >> + struct ocfs2_super *osb = OCFS2_SB(sb); >> struct dentry *result; >> - >> + struct inode *inode, *inode_alloc_inode; >> + struct buffer_head *alloc_bh = NULL; >> + u64 blkno = handle->ih_blkno; >> + unsigned short suballoc_bit, suballoc_slot; >> + int status, set; >> + >> mlog_entry("(0x%p, 0x%p)\n", sb, handle); >> >> - if (handle->ih_blkno == 0) { >> - mlog_errno(-ESTALE); >> - return ERR_PTR(-ESTALE); >> + if (blkno == 0) { >> + mlog(0, "nfs wants inode with blkno: 0\n"); >> + result = ERR_PTR(-ESTALE); >> + goto bail; >> + } >> + >> + inode = ocfs2_ilookup(sb, ino_from_blkno(sb, blkno)); > > You need to call ocfs2_ilookup() with the real blkno, not the > ino_from_blkno() value. You'll find that ocfs2_ilookup() is doing > ino_from_blkno for args.fi_ino. > Otherwise, your changes to ocfs2_get_dentry() fit your patch > nicely. > >> @@ -933,6 +927,40 @@ void ocfs2_delete_inode(struct inode *in >> goto bail; >> } >> >> + /* lock the suballoc inode first before locking against inode >> + * its self >> + * */ >> + >> + /* here we don't have cluster locked against inode, so we may >> + * reads out non-up2date contents. >> + * but because suballoc_slot never changes, reading out non-up2date >> + * contents is no problem. >> + * */ >> + status = ocfs2_get_suballoc_slot_bit(OCFS2_SB(inode->i_sb), >> + OCFS2_I(inode)->ip_blkno, >> + &suballoc_slot, NULL, 0); >> + if (status < 0) { >> + mlog_errno(status); >> + goto bail_sigmask; >> + } >> + >> + inode_alloc_inode >> + ocfs2_get_system_file_inode(OCFS2_SB(inode->i_sb), >> + INODE_ALLOC_SYSTEM_INODE, >> + suballoc_slot); >> + if (!inode_alloc_inode) { >> + status = -EEXIST; >> + mlog_errno(status); >> + goto bail_sigmask; >> + } >> + >> + mutex_lock(&inode_alloc_inode->i_mutex); >> + status = ocfs2_inode_lock(inode_alloc_inode, &suballoc_bh, 1); >> + if (status < 0) { >> + mlog_errno(status); >> + goto bail_mutex; >> + } >> + > > Here is where the current patch runs into problems. You moved > the locking of the alloc inode up to the top of ocfs2_delete_inode() in > order to avoid the ABBA deadlock with ocfs2_get_dentry(). Unfortunately, > this is a performance disaster. > When a node closes a file that needs deletion, it gets the inode > lock and checks to see if it can wipe. Only the last closer can wipe - > the others do nothing. But with this change, even nodes that will skip > wiping have to lock the alloc inode. Thus, the alloc inode lock is > pinged around the cluster for every delete. This will really hurt. > What we'd like to do is keep the lock of the alloc_inode inside > ocfs2_remove_inode(), and yet avoid the ABBA deadlock. Mark proposed the > answer, which is a little bit like your original solution. > If you recall, you originally wanted to add a dealloc lock to > every inode. This isn't workable, because it's extra work for the > "normal" case of deletion, and it adds space usage for every inode. In > addition, your old code forced a checkpoint for every deletion, > regardless of whether it was needed. > Now, the change to ocfs2_get_dentry() to get the suballoc lock > provides the appropriate journal flushing. It only happens when > ocfs2_get_dentry() is caled, and doesn't hurt the "normal" case. But > what about the deadlock? > We'd like you to add a lock on the ocfs2_super, the > "nfs_sync_lock". This will be a new lock type, and there is only one > per filesystem. During ocfs2_delete_inode(), we hold this lock as a PR. > This means all nodes can do ocfs2_delete_inode() concurrently. In fact, > due to lock caching, all nodes will acquire the lock once and never > release it unless ocfs2_get_dentry() is called. Thus, the normal case > has no performance impact. > In ocfs2_get_dentry(), if ocfs2_ilookup() fails, we take the > nfs_sync_lock in EX mode before we take the alloc_inode lock. Because > the nfs_sync_lock blocks out ocfs2_delete_inode(), there is no ordering > concern with the alloc_inode and inode locks. There can be no ABBA > deadlock. Becasue we only take the lock when ocfs2_ilookup() fails, we > only will slow down when the inode is not in cache. That only happens > when an NFS server reboots or a client is silent for a long time. Thus, > the normal NFS case is not impacted by this change either. >I see. thanks for your patience and detail! one thing is that, the "nfs_sync_lock" will be one in count or more than one? I think having lots of this kind of lock(one for each meta block) will bring us least performance impact for deletions. well so much locks eat memory. for balancing, how about setting up 16(or 32) such locks? different inode goes to different lock according to its block number. and 16(or 32) such locks won't take much memory. --just like the idea in my original patch.>> Index: fs/ocfs2/dlmglue.c >> ==================================================================>> --- fs/ocfs2/dlmglue.c (revision 38) >> +++ fs/ocfs2/dlmglue.c (working copy) >> @@ -1940,19 +1940,24 @@ static int ocfs2_inode_lock_update(struc >> status = -EIO; >> goto bail_refresh; >> } >> - mlog_bug_on_msg(inode->i_generation !>> - le32_to_cpu(fe->i_generation), >> - "Invalid dinode %llu disk generation: %u " >> - "inode->i_generation: %u\n", >> - (unsigned long long)oi->ip_blkno, >> - le32_to_cpu(fe->i_generation), >> - inode->i_generation); >> - mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || >> - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), >> - "Stale dinode %llu dtime: %llu flags: 0x%x\n", >> - (unsigned long long)oi->ip_blkno, >> - (unsigned long long)le64_to_cpu(fe->i_dtime), >> - le32_to_cpu(fe->i_flags)); >> + if (inode->i_generation != le32_to_cpu(fe->i_generation)) { >> + mlog(0, "Stale inode %llu disk generation: %u " >> + "inode generation: %u\n", >> + (unsigned long long)oi->ip_blkno, >> + le32_to_cpu(fe->i_generation), >> + inode->i_generation); >> + status = -ESTALE; >> + goto bail_refresh; >> + } >> + if (le64_to_cpu(fe->i_dtime) || >> + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { >> + mlog(0, "Stale inode %llu dtime: %llu flags: 0x%x\n", >> + (unsigned long long)oi->ip_blkno, >> + (unsigned long long)le64_to_cpu(fe->i_dtime), >> + le32_to_cpu(fe->i_flags)); >> + status = -ESTALE; >> + goto bail_refresh; >> + } > > I don't think we need this change anymore. Because you called > ilookup(), we know that the inode is not in the inode cache. We'll > create it new when you call iget.for now I'm not very sure about this. I will do as you suggested until I find out it's not true :P thanks wengang.