This patch separates o2net and o2quo from knowing about one another as much
as possible. This is the first in a series of patches that will allow
userspace cluster interaction. Quorum is separated out first, and will
ultimately only be associated with the disk heartbeat as a separate module.
To do so, this patch performs the following changes:
* o2hb_notify() is added to handle injection of events in a synchronous
manner. All locking is preserved as expected.
* disk heartbeat timeouts now inject an event for this node being down. This
event is handled specially by o2quo, which fences the node.
* o2quo callbacks are now called directly by heartbeat rather than going
through o2net. Previously, o2net callbacks called o2quo callbacks
immediately. This ordering is preserved by increasing o2quo's priority over
o2net.
* Two new heartbeat event types are added: O2HB_CONN_{UP,DOWN}_CB, which
correspond to tcp connections being established and terminated.
* Outside of callbacks, where o2net used to call o2quo functions directly,
it now injects the O2HB_CONN_{UP,DOWN}_CB events.
* o2net knowledge of o2quo in header files has been moved to quorum.h
* o2net's handling of quorum decisions on connection failure has been
moved to o2quo.
* o2quo is initialized by the nodemanager rather than by o2net.
*******
Unfortunately, this code is actually broken. Unmounting the last file system
will deadlock on o2hb_callback_sem.
Don't actually use this code; it's just posted for review.
*******
fs/ocfs2/cluster/heartbeat.c | 14 ++++++
fs/ocfs2/cluster/heartbeat.h | 5 ++
fs/ocfs2/cluster/nodemanager.c | 3 +
fs/ocfs2/cluster/quorum.c | 82 +++++++++++++++++++++++++++++++++++++---
fs/ocfs2/cluster/quorum.h | 13 ++----
fs/ocfs2/cluster/tcp.c | 36 +++++------------
fs/ocfs2/cluster/tcp_internal.h | 12 -----
7 files changed, 117 insertions(+), 48 deletions(-)
Signed-off-by: Jeff Mahoney <jeffm at suse.com>
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c
linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.c 2006-01-08
18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.c 2006-01-08
18:15:23.647564032 -0500
@@ -158,6 +158,7 @@ struct o2hb_bio_wait_ctxt {
static void o2hb_write_timeout(void *arg)
{
struct o2hb_region *reg = arg;
+ struct o2nm_node *node = o2nm_get_node_by_num(o2nm_this_node());
mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
"milliseconds\n", reg->hr_dev_name,
@@ -588,6 +589,7 @@ static void o2hb_queue_node_event(struct
{
assert_spin_locked(&o2hb_live_lock);
+ INIT_LIST_HEAD(&event->hn_item);
event->hn_event_type = type;
event->hn_node = node;
event->hn_node_num = node_num;
@@ -598,6 +600,18 @@ static void o2hb_queue_node_event(struct
list_add_tail(&event->hn_item, &o2hb_node_events);
}
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+ int node_num)
+{
+ struct o2hb_node_event event;
+
+ spin_lock(&o2hb_live_lock);
+ o2hb_queue_node_event(&event, type, node, node_num);
+ spin_unlock(&o2hb_live_lock);
+ o2hb_run_event_list(&event);
+}
+EXPORT_SYMBOL_GPL(o2hb_notify);
+
static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
{
struct o2hb_node_event event diff -ruNpX dontdiff
linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h
linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/heartbeat.h 2006-01-08
18:23:29.376721976 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/heartbeat.h 2006-01-08
18:13:52.643398768 -0500
@@ -46,6 +46,8 @@ extern unsigned int o2hb_dead_threshold;
enum o2hb_callback_type {
O2HB_NODE_DOWN_CB = 0,
O2HB_NODE_UP_CB,
+ O2HB_CONN_DOWN_CB, /* When a TCP connection fails */
+ O2HB_CONN_UP_CB, /* When a TCP connection is made */
O2HB_NUM_CB
};
@@ -78,5 +80,8 @@ int o2hb_check_node_heartbeating(u8 node
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
int o2hb_check_local_node_heartbeating(void);
void o2hb_stop_all_regions(void);
+void o2hb_notify(enum o2hb_callback_type type, struct o2nm_node *node,
+ int node_num);
+
#endif /* O2CLUSTER_HEARTBEAT_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c
linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/nodemanager.c 2006-01-08
18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/nodemanager.c 2006-01-08
18:13:52.644398616 -0500
@@ -27,6 +27,7 @@
#include "endian.h"
#include "tcp.h"
#include "nodemanager.h"
+#include "quorum.h"
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
@@ -740,6 +741,7 @@ static void __exit exit_o2nm(void)
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
o2cb_sys_shutdown();
+ o2quo_exit();
o2net_exit();
}
@@ -750,6 +752,7 @@ static int __init init_o2nm(void)
cluster_print_version();
o2hb_init();
+ o2quo_init();
o2net_init();
ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c
linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.c 2006-01-08
18:23:29.377721824 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.c 2006-01-08
18:17:37.908153320 -0500
@@ -63,8 +63,14 @@ static struct o2quo_state {
unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
int qs_holds;
unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
+ struct work_struct qs_node_work[O2NM_MAX_NODES];
} o2quo_state;
+static struct o2hb_callback_func o2quo_hb_up_cb, o2quo_hb_down_cb;
+static struct o2hb_callback_func o2quo_hb_conn_up, o2quo_hb_conn_down;
+#define O2QUO_HB_PRI 0x1
+#define O2QUO_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)
+
/* this is horribly heavy-handed. It should instead flip the file
* system RO and call some userspace script. */
static void o2quo_fence_self(void)
@@ -184,7 +190,7 @@ static void o2quo_clear_hold(struct o2qu
* the connection. the hold will be droped in conn_up or hb_down. it might be
* perpetuated by con_err until hb_down. if we already have a conn, we might
* be dropping a hold that conn_up got. */
-void o2quo_hb_up(u8 node)
+void o2quo_hb_up(struct o2nm_node *_node, int node, void *data)
{
struct o2quo_state *qs = &o2quo_state;
@@ -208,7 +214,7 @@ void o2quo_hb_up(u8 node)
/* hb going down releases any holds we might have had due to this node from
* conn_up, conn_err, or hb_up */
-void o2quo_hb_down(u8 node)
+void o2quo_hb_down(struct o2nm_node *_node, int node, void *data)
{
struct o2quo_state *qs = &o2quo_state;
@@ -226,6 +237,8 @@ void o2quo_hb_down(u8 node)
o2quo_clear_hold(qs, node);
spin_unlock(&qs->qs_lock);
+
+ cancel_delayed_work(&qs->qs_node_work[node]);
}
/* this tells us that we've decided that the node is still heartbeating
@@ -233,9 +246,10 @@ void o2quo_hb_down(u8 node)
* and indicates that we must now make a quorum decision in the future,
* though we might be doing so after waiting for holds to drain. Here
* we'll be dropping the hold from conn_err. */
-void o2quo_hb_still_up(u8 node)
+void o2quo_hb_still_up(void *arg)
{
struct o2quo_state *qs = &o2quo_state;
+ u8 node = (u8)(long)arg;
spin_lock(&qs->qs_lock);
@@ -252,7 +266,7 @@ void o2quo_hb_still_up(u8 node)
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
* it's already heartbeating we we might be dropping a hold that conn_up
got.
* */
-void o2quo_conn_up(u8 node)
+void o2quo_conn_up(struct o2nm_node *_node, int node, void *data)
{
struct o2quo_state *qs = &o2quo_state;
@@ -278,7 +292,7 @@ void o2quo_conn_up(u8 node)
* still heartbeating we grab a hold that will delay decisions until either the
* node stops heartbeating from hb_down or the caller decides that the node is
* still up and calls still_up */
-void o2quo_conn_err(u8 node)
+void o2quo_conn_err(struct o2nm_node *_node, int node, void *data)
{
struct o2quo_state *qs = &o2quo_state;
@@ -299,17 +313,78 @@ void o2quo_conn_err(u8 node)
o2quo_set_hold(qs, node);
spin_unlock(&qs->qs_lock);
+
+ schedule_delayed_work(&qs->qs_node_work[node],
+ msecs_to_jiffies(O2QUO_DELAY_MS));
}
-void o2quo_init(void)
+static void o2quo_unregister_hb_callbacks(void)
+{
+ int ret;
+
+ ret = o2hb_unregister_callback(&o2quo_hb_conn_up);
+ if (ret < 0)
+ mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+ "conn up callback!\n", ret);
+
+ ret = o2hb_unregister_callback(&o2quo_hb_conn_down);
+ if (ret < 0)
+ mlog(ML_ERROR, "Status return %d unregistering heartbeat "
+ "conn down callback!\n", ret);
+ ret = o2hb_unregister_callback(&o2quo_hb_up_cb);
+ if (ret < 0)
+ mlog(ML_ERROR, "Status return %d unregistering heartbeat up "
+ "callback!\n", ret);
+
+ ret = o2hb_unregister_callback(&o2quo_hb_down_cb);
+ if (ret < 0)
+ mlog(ML_ERROR, "Status return %d unregistering heartbeat down "
+ "callback!\n", ret);
+}
+
+static int o2quo_register_hb_callbacks(void)
+{
+ int ret;
+
+ o2hb_setup_callback(&o2quo_hb_down_cb, O2HB_NODE_DOWN_CB,
+ o2quo_hb_down, NULL, O2QUO_HB_PRI);
+ o2hb_setup_callback(&o2quo_hb_up_cb, O2HB_NODE_UP_CB,
+ o2quo_hb_up, NULL, O2QUO_HB_PRI);
+ o2hb_setup_callback(&o2quo_hb_conn_down, O2HB_CONN_DOWN_CB,
+ o2quo_conn_err, NULL, O2QUO_HB_PRI);
+ o2hb_setup_callback(&o2quo_hb_conn_up, O2HB_CONN_UP_CB,
+ o2quo_conn_up, NULL, O2QUO_HB_PRI);
+
+ ret = o2hb_register_callback(&o2quo_hb_up_cb);
+ if (ret == 0)
+ ret = o2hb_register_callback(&o2quo_hb_down_cb);
+ if (ret == 0)
+ ret = o2hb_register_callback(&o2quo_hb_conn_up);
+ if (ret == 0)
+ ret = o2hb_register_callback(&o2quo_hb_conn_down);
+
+ if (ret)
+ o2quo_unregister_hb_callbacks();
+
+ return ret;
+}
+
+
+int o2quo_init(void)
{
struct o2quo_state *qs = &o2quo_state;
+ int i;
spin_lock_init(&qs->qs_lock);
INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
+ for (i = 0; i < O2NM_MAX_NODES; i++)
+ INIT_WORK(&qs->qs_node_work[i], o2quo_hb_still_up, (void *)i);
+
+ return o2quo_register_hb_callbacks();
}
void o2quo_exit(void)
{
flush_scheduled_work();
+ o2quo_unregister_hb_callbacks();
}
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h
linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/quorum.h 2006-01-08
18:23:29.378721672 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/quorum.h 2006-01-08
18:23:55.863695344 -0500
@@ -23,14 +23,13 @@
#ifndef O2CLUSTER_QUORUM_H
#define O2CLUSTER_QUORUM_H
-void o2quo_init(void);
+int o2quo_init(void);
void o2quo_exit(void);
-
-void o2quo_hb_up(u8 node);
-void o2quo_hb_down(u8 node);
-void o2quo_hb_still_up(u8 node);
-void o2quo_conn_up(u8 node);
-void o2quo_conn_err(u8 node);
void o2quo_disk_timeout(void);
+/* we're delaying our quorum decision so that heartbeat will have timed
+ * out truly dead nodes by the time we come around to making decisions
+ * on their number */
+#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) *
O2HB_REGION_TIMEOUT_MS)
+
#endif /* O2CLUSTER_QUORUM_H */
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c
linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp.c 2006-01-08 18:23:29.379721520
-0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp.c 2006-01-08 18:13:52.646398312
-0500
@@ -67,7 +67,6 @@
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_TCP
#include "masklog.h"
-#include "quorum.h"
#include "tcp_internal.h"
@@ -128,7 +127,7 @@ static struct workqueue_struct *o2net_wq
static struct work_struct o2net_listen_work;
static struct o2hb_callback_func o2net_hb_up, o2net_hb_down;
-#define O2NET_HB_PRI 0x1
+#define O2NET_HB_PRI 0x2
static struct o2net_handshake *o2net_hand;
static struct o2net_msg *o2net_keep_req, *o2net_keep_resp;
@@ -390,9 +389,9 @@ static void o2net_set_nn_state(struct o2
wake_up(&nn->nn_sc_wq);
if (!was_err && nn->nn_persistent_error) {
- o2quo_conn_err(o2net_num_from_nn(nn));
- queue_delayed_work(o2net_wq, &nn->nn_still_up,
- msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+ u8 node_num = o2net_num_from_nn(nn);
+ struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+ o2hb_notify(O2HB_CONN_DOWN_CB, node, node_num);
}
if (was_valid && !valid) {
@@ -402,7 +401,11 @@ static void o2net_set_nn_state(struct o2
}
if (!was_valid && valid) {
- o2quo_conn_up(o2net_num_from_nn(nn));
+ u8 node_num = o2net_num_from_nn(nn);
+ struct o2nm_node *node = o2nm_get_node_by_num(node_num);
+
+ o2hb_notify(O2HB_CONN_UP_CB, node, node_num);
+
/* this is a bit of a hack. we only try reconnecting
* when heartbeating starts until we get a connection.
* if that connection then dies we don't try reconnecting.
@@ -1424,13 +1427,6 @@ static void o2net_connect_expired(void *
spin_unlock(&nn->nn_lock);
}
-static void o2net_still_up(void *arg)
-{
- struct o2net_node *nn = arg;
-
- o2quo_hb_still_up(o2net_num_from_nn(nn));
-}
-
/* ------------------------------------------------------------ */
void o2net_disconnect_node(struct o2nm_node *node)
@@ -1445,7 +1441,6 @@ void o2net_disconnect_node(struct o2nm_n
if (o2net_wq) {
cancel_delayed_work(&nn->nn_connect_expired);
cancel_delayed_work(&nn->nn_connect_work);
- cancel_delayed_work(&nn->nn_still_up);
flush_workqueue(o2net_wq);
}
}
@@ -1453,8 +1448,6 @@ void o2net_disconnect_node(struct o2nm_n
static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
void *data)
{
- o2quo_hb_down(node_num);
-
if (node_num != o2nm_this_node())
o2net_disconnect_node(node);
}
@@ -1464,8 +1457,6 @@ static void o2net_hb_node_up_cb(struct o
{
struct o2net_node *nn = o2net_nn_from_num(node_num);
- o2quo_hb_up(node_num);
-
/* ensure an immediate connect attempt */
nn->nn_last_connect_attempt = jiffies -
(msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1);
@@ -1739,7 +1730,7 @@ int o2net_start_listening(struct o2nm_no
destroy_workqueue(o2net_wq);
o2net_wq = NULL;
} else
- o2quo_conn_up(node->nd_num);
+ o2hb_notify(O2HB_CONN_UP_CB, node, node->nd_num);
return ret;
}
@@ -1776,7 +1767,7 @@ void o2net_stop_listening(struct o2nm_no
sock_release(o2net_listen_sock);
o2net_listen_sock = NULL;
- o2quo_conn_err(node->nd_num);
+ o2hb_notify(O2HB_CONN_DOWN_CB, node, node->nd_num);
}
/* ------------------------------------------------------------ */
@@ -1785,8 +1776,6 @@ int o2net_init(void)
{
unsigned long i;
- o2quo_init();
-
o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL);
o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1805,11 +1794,11 @@ int o2net_init(void)
for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
struct o2net_node *nn = o2net_nn_from_num(i);
+ memset(nn, 0, sizeof (*nn));
spin_lock_init(&nn->nn_lock);
INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn);
INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn);
- INIT_WORK(&nn->nn_still_up, o2net_still_up, nn);
/* until we see hb from a node we'll return einval */
nn->nn_persistent_error = -ENOTCONN;
init_waitqueue_head(&nn->nn_sc_wq);
@@ -1822,7 +1811,6 @@ int o2net_init(void)
void o2net_exit(void)
{
- o2quo_exit();
kfree(o2net_hand);
kfree(o2net_keep_req);
kfree(o2net_keep_resp);
diff -ruNpX dontdiff linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h
linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h
--- linux-2.6.15-staging1/fs/ocfs2/cluster/tcp_internal.h 2006-01-08
18:23:29.379721520 -0500
+++ linux-2.6.15-staging2/fs/ocfs2/cluster/tcp_internal.h 2006-01-08
18:13:52.646398312 -0500
@@ -28,12 +28,7 @@
#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
/* same as hb delay, we're waiting for another node to recognize our hb */
-#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS
-
-/* we're delaying our quorum decision so that heartbeat will have timed
- * out truly dead nodes by the time we come around to making decisions
- * on their number */
-#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) *
O2HB_REGION_TIMEOUT_MS)
+#define O2NET_RECONNECT_DELAY_MS 2000
#define O2NET_KEEPALIVE_DELAY_SECS 5
#define O2NET_IDLE_TIMEOUT_SECS 10
@@ -87,11 +82,6 @@ struct o2net_node {
* established. this expiring gives up on the node and errors out
* transmits */
struct work_struct nn_connect_expired;
-
- /* after we give up on a socket we wait a while before deciding
- * that it is still heartbeating and that we should do some
- * quorum work */
- struct work_struct nn_still_up;
};
struct o2net_sock_container {