xuejiufei
2016-Mar-02 09:29 UTC
[Ocfs2-devel] [PATCH] ocfs2: fix a tiny race that leads file system read-only
When o2hb detects that a node is down, it first sets the dead node in the
recovery map and creates ocfs2rec, which will replay the journal of the dead
node. The o2hb thread then calls dlm_do_local_recovery_cleanup() to delete
the locks held by the dead node. Once the locks of the dead node are gone,
locks can be granted to other nodes, which may then modify the metadata
without the journal of the dead node having been replayed. The details are
described below.

N1                         N2                          N3 (master)
modify the extent tree of
an inode, commit the dirty
metadata to the journal,
then go down.
                                                       o2hb thread detects that
                                                       N1 is down, sets the
                                                       recovery map and deletes
                                                       the locks of N1.

                                                       dlm_thread flushes the
                                                       ast for the lock of N2.
                           does not detect the death
                           of N1, so its recovery
                           map is empty.

                           reads the inode from disk
                           without replaying the
                           journal of N1 and modifies
                           the extent tree of the
                           inode that N1 had
                           modified.
                                                       ocfs2rec replays the
                                                       journal of N1; the
                                                       modification made by N2
                                                       is lost.

The modifications of N1 and N2 are not serialized, which leads to a
read-only file system. We can set the recovery_waiting flag on the lock
resource after deleting the locks of the dead node to prevent other nodes
from getting the lock before dlm recovery completes. After dlm recovery, the
recovery map on N2 is no longer empty, so ocfs2_inode_lock_full_nested()
will wait for ocfs2 recovery.

Signed-off-by: Jiufei Xue <xuejiufei at huawei.com>
---
 fs/ocfs2/dlm/dlmcommon.h   | 5 ++++-
 fs/ocfs2/dlm/dlmmaster.c   | 3 ++-
 fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++
 fs/ocfs2/dlm/dlmthread.c   | 6 ++++--
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 68c607e..1276d91 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
 #define DLM_LOCK_RES_DROPPING_REF         0x00000040
 #define DLM_LOCK_RES_BLOCK_DIRTY          0x00001000
 #define DLM_LOCK_RES_SETREF_INPROG        0x00002000
+#define DLM_LOCK_RES_RECOVERY_WAITING     0x00004000
 
 /* max milliseconds to wait to sync up a network failure with a node death */
 #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -789,7 +790,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
 
 	assert_spin_locked(&res->spinlock);
 
-	if (res->state & DLM_LOCK_RES_RECOVERING)
+	if (res->state & (DLM_LOCK_RES_RECOVERING|
+			DLM_LOCK_RES_RECOVERY_WAITING))
 		status = DLM_RECOVERING;
 	else if (res->state & DLM_LOCK_RES_MIGRATING)
 		status = DLM_MIGRATING;
@@ -1009,6 +1011,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 {
 	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
 					  DLM_LOCK_RES_RECOVERING|
+					  DLM_LOCK_RES_RECOVERY_WAITING|
 					  DLM_LOCK_RES_MIGRATING));
 }
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9477d6e..12d385b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2432,7 +2432,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
 		return 0;
 
 	/* delay migration when the lockres is in RECOCERING state */
-	if (res->state & DLM_LOCK_RES_RECOVERING)
+	if (res->state & (DLM_LOCK_RES_RECOVERING|
+			DLM_LOCK_RES_RECOVERY_WAITING))
 		return 0;
 
 	if (res->owner != dlm->node_num)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b94a425..099714e 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2163,6 +2163,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
 		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, bucket, hash_node) {
+			if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
+				spin_lock(&res->spinlock);
+				res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
+				spin_unlock(&res->spinlock);
+				wake_up(&res->wq);
+			}
+
 			if (!(res->state & DLM_LOCK_RES_RECOVERING))
 				continue;
 
@@ -2300,6 +2307,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
 			     res->lockname.len, res->lockname.name, freed, dead_node);
 			__dlm_print_one_lock_resource(res);
 		}
+		res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
 		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
 	} else if (test_bit(dead_node, res->refmap)) {
 		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c5f6c24..73c03be 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
 	if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
 		return 0;
 
-	if (res->state & DLM_LOCK_RES_RECOVERING)
+	if (res->state & (DLM_LOCK_RES_RECOVERING|
+			DLM_LOCK_RES_RECOVERY_WAITING))
 		return 0;
 
 	/* Another node has this resource with this node as the master */
@@ -700,7 +701,8 @@ static int dlm_thread(void *data)
 			 * dirty for a short while. */
 			BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
 			if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
-					  DLM_LOCK_RES_RECOVERING)) {
+					  DLM_LOCK_RES_RECOVERING |
+					  DLM_LOCK_RES_RECOVERY_WAITING)) {
 				/* move it to the tail and keep going */
 				res->state &= ~DLM_LOCK_RES_DIRTY;
 				spin_unlock(&res->spinlock);
-- 
1.8.4.3
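The core of the fix is the window between dlm_free_dead_locks(), which raises
the new flag, and dlm_finish_local_lockres_recovery(), which clears it. The
sketch below is a minimal, self-contained model of that window, not kernel
code: the RECOVERING/MIGRATING flag values and all helper names here are
assumptions made only for this illustration; only DLM_LOCK_RES_RECOVERY_WAITING
(0x00004000) comes from the patch hunks above.

/* flag_window.c - standalone illustration of the RECOVERY_WAITING window.
 * Not kernel code: flag values other than RECOVERY_WAITING and the helper
 * names are invented for this sketch. */
#include <stdio.h>

#define DLM_LOCK_RES_RECOVERING       0x00000002u /* assumed value */
#define DLM_LOCK_RES_MIGRATING        0x00000020u /* assumed value */
#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000u /* from the patch */

enum status { GRANTABLE, RECOVERING, MIGRATING };

/* Mirrors the idea of __dlm_lockres_state_to_status(): RECOVERY_WAITING is
 * reported exactly like RECOVERING, so lock requests from other nodes keep
 * being deferred until dlm recovery has finished. */
static enum status state_to_status(unsigned int state)
{
	if (state & (DLM_LOCK_RES_RECOVERING | DLM_LOCK_RES_RECOVERY_WAITING))
		return RECOVERING;
	if (state & DLM_LOCK_RES_MIGRATING)
		return MIGRATING;
	return GRANTABLE;
}

int main(void)
{
	unsigned int state = 0;

	/* dlm_free_dead_locks(): the dead node's locks are freed, but grants
	 * must wait for dlm recovery, so the new flag is raised here. */
	state |= DLM_LOCK_RES_RECOVERY_WAITING;
	printf("after dead-node cleanup: %s\n",
	       state_to_status(state) == RECOVERING ? "RECOVERING" : "grantable");

	/* dlm_finish_local_lockres_recovery(): recovery is done, the flag is
	 * cleared and (in the kernel) wake_up(&res->wq) unblocks waiters. */
	state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
	printf("after dlm recovery:      %s\n",
	       state_to_status(state) == RECOVERING ? "RECOVERING" : "grantable");
	return 0;
}

A request arriving during the window is answered with DLM_RECOVERING and is
not granted until recovery completes; by then the recovery map on the
requesting node (N2 in the commit message) is populated, so
ocfs2_inode_lock_full_nested() waits for ocfs2 recovery instead of touching
stale metadata.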
Joseph Qi
2016-Mar-03 01:07 UTC
[Ocfs2-devel] [PATCH] ocfs2: fix a tiny race that leads file system read-only
On 2016/3/2 17:29, xuejiufei wrote:
> The modifications of N1 and N2 are not serialized, which leads to a
> read-only file system. We can set the recovery_waiting flag on the lock
> resource after deleting the locks of the dead node to prevent other
> nodes from getting the lock before dlm recovery completes. After dlm
> recovery, the recovery map on N2 is no longer empty, so
> ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery.
>
> Signed-off-by: Jiufei Xue <xuejiufei at huawei.com>

The $RECOVERY lockres must never be given this flag, so I agree with setting
it in dlm_free_dead_locks(), which has already excluded the $RECOVERY
lockres. This fix looks good to me.

Reviewed-by: Joseph Qi <joseph.qi at huawei.com>
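For context on the reviewer's remark: the ocfs2 dlm has a special lockres
named "$RECOVERY", and that lockres must never carry the new flag; since the
flag is only set in dlm_free_dead_locks(), which the dead-node cleanup does
not reach for the $RECOVERY lockres, the placement is safe. The snippet below
is a simplified, hypothetical sketch of that guard: all types and helper
names are made up for illustration; only the "$RECOVERY" name and the flag
value come from the ocfs2 dlm and the patch.

/* recovery_guard.c - illustration only; the real logic lives in
 * dlm_do_local_recovery_cleanup() and dlm_free_dead_locks() in
 * fs/ocfs2/dlm/dlmrecovery.c. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define RECOVERY_LOCK_NAME            "$RECOVERY"
#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000u

struct toy_lockres {
	const char *name;
	unsigned int state;
};

static bool is_recovery_lock(const struct toy_lockres *res)
{
	return strcmp(res->name, RECOVERY_LOCK_NAME) == 0;
}

/* Toy version of the dead-node cleanup: the $RECOVERY lockres is handled
 * separately and never reaches the branch that raises the waiting flag. */
static void cleanup_for_dead_node(struct toy_lockres *res)
{
	if (is_recovery_lock(res))
		return;
	res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
}

int main(void)
{
	struct toy_lockres inode_res    = { "some_inode_meta_lock", 0 };
	struct toy_lockres recovery_res = { RECOVERY_LOCK_NAME, 0 };

	cleanup_for_dead_node(&inode_res);
	cleanup_for_dead_node(&recovery_res);

	printf("ordinary lockres waiting:  %d\n",
	       !!(inode_res.state & DLM_LOCK_RES_RECOVERY_WAITING));
	printf("$RECOVERY lockres waiting: %d\n",
	       !!(recovery_res.state & DLM_LOCK_RES_RECOVERY_WAITING));
	return 0;
}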