Sunil Mushran
2011-Feb-21 22:25 UTC
[Ocfs2-devel] [PATCH 1/2] ocfs2/cluster: Heartbeat mismatch message improved
If o2hb finds unexpected values in the heartbeat slot, it prints a message "ERROR: Device "dm-6": another node is heartbeating in our slot!" This patch adds more information allowing us to see the actual mismatch. The new message reads "ERROR: Heartbeat mismatch on "dm-6": expected(92:0x234543, 0x76567) ondisk(92:0x234543, 0x76565)" Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> --- fs/ocfs2/cluster/heartbeat.c | 34 +++++++++++++++++----------------- 1 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index b108e86..d633df0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -543,25 +543,27 @@ static int o2hb_verify_crc(struct o2hb_region *reg, /* We want to make sure that nobody is heartbeating on top of us -- * this will help detect an invalid configuration. */ -static int o2hb_check_last_timestamp(struct o2hb_region *reg) +static void o2hb_check_last_timestamp(struct o2hb_region *reg) { - int node_num, ret; struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - node_num = o2nm_this_node(); - - ret = 1; - slot = ®->hr_slots[node_num]; + slot = ®->hr_slots[o2nm_this_node()]; /* Don't check on our 1st timestamp */ - if (slot->ds_last_time) { - hb_block = slot->ds_raw_block; + if (!slot->ds_last_time) + return; - if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time) - ret = 0; - } + hb_block = slot->ds_raw_block; + if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time) + return; - return ret; + mlog(ML_ERROR, "Heartbeat mismatch on \"%s\": " + "expected(%u:0x%llx, 0x%llx), ondisk(%u:0x%llx, 0x%llx)\n", + reg->hr_dev_name, slot->ds_node_num, + (unsigned long long)slot->ds_last_generation, + (unsigned long long)slot->ds_last_time, hb_block->hb_node, + (unsigned long long)le64_to_cpu(hb_block->hb_generation), + (unsigned long long)le64_to_cpu(hb_block->hb_seq)); } static inline void o2hb_prepare_block(struct o2hb_region *reg, @@ -987,9 +989,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* With an up to date view of the slots, we can check that no * other node has been improperly configured to heartbeat in * our slot. */ - if (!o2hb_check_last_timestamp(reg)) - mlog(ML_ERROR, "Device \"%s\": another node is heartbeating " - "in our slot!\n", reg->hr_dev_name); + o2hb_check_last_timestamp(reg); /* fill in the proper info for our next heartbeat */ o2hb_prepare_block(reg, reg->hr_generation); @@ -1003,8 +1003,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } i = -1; - while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { - + while((i = find_next_bit(configured_nodes, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { change |= o2hb_check_slot(reg, ®->hr_slots[i]); } -- 1.7.1
Sunil Mushran
2011-Feb-21 22:25 UTC
[Ocfs2-devel] [PATCH 2/2] ocfs2/dlm: Move kmalloc() outside the spinlock
In dlm_query_region_handler(), move the kmalloc outside the spinlock. This allows us to use GFP_KERNEL instead of GFP_ATOMIC. Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> --- fs/ocfs2/dlm/dlmdomain.c | 28 ++++++++++++++++------------ 1 files changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7e38a07..99805d5 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -926,9 +926,10 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, } static int dlm_match_regions(struct dlm_ctxt *dlm, - struct dlm_query_region *qr) + struct dlm_query_region *qr, + char *local, int locallen) { - char *local = NULL, *remote = qr->qr_regions; + char *remote = qr->qr_regions; char *l, *r; int localnr, i, j, foundit; int status = 0; @@ -957,13 +958,8 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, r += O2HB_MAX_REGION_NAME_LEN; } - local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); - if (!local) { - status = -ENOMEM; - goto bail; - } - - localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS); + localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN); + localnr = o2hb_get_all_regions(local, (u8)localnr); /* compare local regions with remote */ l = local; @@ -1012,8 +1008,6 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, } bail: - kfree(local); - return status; } @@ -1075,6 +1069,7 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_region *qr; struct dlm_ctxt *dlm = NULL; + char *local = NULL; int status = 0; int locked = 0; @@ -1083,6 +1078,13 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, qr->qr_domain); + /* buffer used in dlm_mast_regions() */ + local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + if (!local) { + status = -ENOMEM; + goto bail; + } + status = -EINVAL; spin_lock(&dlm_domain_lock); @@ -1112,13 +1114,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, goto bail; } - status = dlm_match_regions(dlm, qr); + status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions)); bail: if (locked) spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); + kfree(local); + return status; } -- 1.7.1
Joel Becker
2011-Feb-22 09:34 UTC
[Ocfs2-devel] [PATCH 1/2] ocfs2/cluster: Heartbeat mismatch message improved
On Mon, Feb 21, 2011 at 02:25:24PM -0800, Sunil Mushran wrote:> If o2hb finds unexpected values in the heartbeat slot, it prints a message > "ERROR: Device "dm-6": another node is heartbeating in our slot!" > > This patch adds more information allowing us to see the actual mismatch. > The new message reads "ERROR: Heartbeat mismatch on "dm-6": expected(92:0x234543, > 0x76567) ondisk(92:0x234543, 0x76565)"Can you still say 'another node is heartbeating in our slot', just with the detail? The new message doesn't give a clue as to what is happening. Joel -- "The first thing we do, let's kill all the lawyers." -Henry VI, IV:ii http://www.jlbec.org/ jlbec at evilplan.org