piaojun
2018-Mar-05 03:44 UTC
[Ocfs2-devel] [PATCH v3] ocfs2/dlm: don't handle migrate lockres if already in shutdown
We should not handle migrate lockres if we are already in 'DLM_CTXT_IN_SHUTDOWN', as that will cause lockres to remain after leaving the dlm domain. Eventually, other nodes will get stuck in an infinite loop when requesting locks from us. The problem is caused by concurrent umount between nodes. Before receiving N1's DLM_BEGIN_EXIT_DOMAIN_MSG, N2 has picked up N1 as the migrate target. So N2 will continue sending lockres to N1 even though N1 has left the domain. Timeline (N1 vs. N2, the owner): [N2] touch file; [N1] access the file, and get pr lock; [N2] begin leave domain and pick up N1 as new owner; [N1] begin leave domain and migrate all lockres done; [N2] begin migrate lockres to N1; [N1] end leave domain, but the lockres left unexpectedly, because migrate task has passed. Signed-off-by: Jun Piao <piaojun at huawei.com> Reviewed-by: Yiwen Jiang <jiangyiwen at huawei.com> Reviewed-by: Joseph Qi <jiangqi903 at gmail.com> --- fs/ocfs2/dlm/dlmdomain.c | 14 -------------- fs/ocfs2/dlm/dlmdomain.h | 25 ++++++++++++++++++++++++- fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index e1fea14..25b76f0 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -675,20 +675,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_shutting_down(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - void dlm_unregister_domain(struct dlm_ctxt *dlm) { int leave = 0; diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index fd6122a..8a92814 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,30 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_shutting_down(struct dlm_ctxt *dlm); +static inline int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + 
if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + spin_unlock(&dlm_domain_lock); + + return ret; +} + void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ec8f758..505ab42 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1378,6 +1378,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, if (!dlm_grab(dlm)) return -EINVAL; + if (!dlm_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not joined! " + "lockres %.*s, master %u\n", + dlm->name, mres->lockname_len, + mres->lockname, mres->master); + dlm_put(dlm); + return -EINVAL; + } + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); real_master = mres->master; --
Changwei Ge
2018-Mar-05 08:36 UTC
[Ocfs2-devel] [PATCH v3] ocfs2/dlm: don't handle migrate lockres if already in shutdown
Looks good to me. Reviewed-by: Changwei Ge <ge.changwei at h3c.com> On 2018/3/5 11:45, piaojun wrote:> We should not handle migrate lockres if we are already in > 'DLM_CTXT_IN_SHUTDOWN', as that will cause lockres remains after leaving > dlm domain. At last other nodes will get stuck into infinite loop when > requsting lock from us. > > The problem is caused by concurrency umount between nodes. Before > receiveing N1's DLM_BEGIN_EXIT_DOMAIN_MSG, N2 has picked up N1 as the > migrate target. So N2 will continue sending lockres to N1 even though N1 > has left domain. > > N1 N2 (owner) > touch file > > access the file, > and get pr lock > > begin leave domain and > pick up N1 as new owner > > begin leave domain and > migrate all lockres done > > begin migrate lockres to N1 > > end leave domain, but > the lockres left > unexpectedly, because > migrate task has passed > > Signed-off-by: Jun Piao <piaojun at huawei.com> > Reviewed-by: Yiwen Jiang <jiangyiwen at huawei.com> > Reviewed-by: Joseph Qi <jiangqi903 at gmail.com> > --- > fs/ocfs2/dlm/dlmdomain.c | 14 -------------- > fs/ocfs2/dlm/dlmdomain.h | 25 ++++++++++++++++++++++++- > fs/ocfs2/dlm/dlmrecovery.c | 9 +++++++++ > 3 files changed, 33 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c > index e1fea14..25b76f0 100644 > --- a/fs/ocfs2/dlm/dlmdomain.c > +++ b/fs/ocfs2/dlm/dlmdomain.c > @@ -675,20 +675,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) > spin_unlock(&dlm->spinlock); > } > > -int dlm_shutting_down(struct dlm_ctxt *dlm) > -{ > - int ret = 0; > - > - spin_lock(&dlm_domain_lock); > - > - if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) > - ret = 1; > - > - spin_unlock(&dlm_domain_lock); > - > - return ret; > -} > - > void dlm_unregister_domain(struct dlm_ctxt *dlm) > { > int leave = 0; > diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h > index fd6122a..8a92814 100644 > --- a/fs/ocfs2/dlm/dlmdomain.h > +++ b/fs/ocfs2/dlm/dlmdomain.h > @@ 
-28,7 +28,30 @@ > extern spinlock_t dlm_domain_lock; > extern struct list_head dlm_domains; > > -int dlm_shutting_down(struct dlm_ctxt *dlm); > +static inline int dlm_joined(struct dlm_ctxt *dlm) > +{ > + int ret = 0; > + > + spin_lock(&dlm_domain_lock); > + if (dlm->dlm_state == DLM_CTXT_JOINED) > + ret = 1; > + spin_unlock(&dlm_domain_lock); > + > + return ret; > +} > + > +static inline int dlm_shutting_down(struct dlm_ctxt *dlm) > +{ > + int ret = 0; > + > + spin_lock(&dlm_domain_lock); > + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) > + ret = 1; > + spin_unlock(&dlm_domain_lock); > + > + return ret; > +} > + > void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, > int node_num); > > diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c > index ec8f758..505ab42 100644 > --- a/fs/ocfs2/dlm/dlmrecovery.c > +++ b/fs/ocfs2/dlm/dlmrecovery.c > @@ -1378,6 +1378,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, > if (!dlm_grab(dlm)) > return -EINVAL; > > + if (!dlm_joined(dlm)) { > + mlog(ML_ERROR, "Domain %s not joined! " > + "lockres %.*s, master %u\n", > + dlm->name, mres->lockname_len, > + mres->lockname, mres->master); > + dlm_put(dlm); > + return -EINVAL; > + } > + > BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); > > real_master = mres->master; >