Sunil Mushran
2009-Nov-17 21:07 UTC
[Ocfs2-devel] [PATCH 1/1] ocfs2/cluster: Make fence method configurable
By default, o2cb fences the box by calling emergency_restart(). While this scheme works well in production, it comes in the way during testing as it does not let the tester take stack/core dumps for analysis. This patch allows user to dynamically change the fence method to panic() by: # echo "panic" > /sys/kernel/config/cluster/<clustername>/fence_method Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com> --- fs/ocfs2/cluster/nodemanager.c | 50 ++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/cluster/nodemanager.h | 7 +++++ fs/ocfs2/cluster/quorum.c | 14 +++++++++- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7ee6188..77cdc09 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,6 +35,11 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS + 1] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ + "unknown", +}; struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -579,6 +584,41 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( return o2nm_cluster_attr_write(page, count, &cluster->cl_reconnect_delay_ms); } + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + unsigned int i = O2NM_FENCE_METHODS; + + if (cluster && cluster->cl_fence_method < O2NM_FENCE_METHODS) + i = cluster->cl_fence_method; + + return sprintf(page, "%s\n", o2nm_fence_method_desc[i]); +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (!strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) { + printk(KERN_INFO "ocfs2: 
Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + return count; + } + } + +bail: + return -EINVAL; +} + static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "idle_timeout_ms", @@ -603,10 +643,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { .store = o2nm_cluster_attr_reconnect_delay_ms_write, }; +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + static struct configfs_attribute *o2nm_cluster_attrs[] = { &o2nm_cluster_attr_idle_timeout_ms.attr, &o2nm_cluster_attr_keepalive_delay_ms.attr, &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, NULL, }; static ssize_t o2nm_cluster_show(struct config_item *item, @@ -778,6 +827,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index c992ea0..09ea2d3 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,6 +33,12 @@ #include <linux/configfs.h> #include <linux/rbtree.h> +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; @@ -58,6 +64,7 @@ struct o2nm_cluster { unsigned int cl_idle_timeout_ms; unsigned int cl_keepalive_delay_ms; unsigned int 
cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index bbacf7d..cc6ed4e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -74,8 +74,18 @@ static void o2quo_fence_self(void) * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - printk("ocfs2 is very sorry to be fencing this system by restarting\n"); - emergency_restart(); + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + case O2NM_FENCE_RESET: + default: + printk("*** ocfs2 is very sorry to be fencing this system by " + "restarting ***\n"); + emergency_restart(); + break; + }; } /* Indicate that a timeout occured on a hearbeat region write. The -- 1.5.6.5
Joel Becker
2009-Nov-17 21:43 UTC
[Ocfs2-devel] [PATCH 1/1] ocfs2/cluster: Make fence method configurable
On Tue, Nov 17, 2009 at 01:07:58PM -0800, Sunil Mushran wrote:
> By default, o2cb fences the box by calling emergency_restart(). While this
> scheme works well in production, it comes in the way during testing as it
> does not let the tester take stack/core dumps for analysis.
>
> This patch allows user to dynamically change the fence method to panic() by:
> # echo "panic" > /sys/kernel/config/cluster/<clustername>/fence_method
>
> Signed-off-by: Sunil Mushran <sunil.mushran at oracle.com>
> ---
> fs/ocfs2/cluster/nodemanager.c | 50 ++++++++++++++++++++++++++++++++++++++++
> fs/ocfs2/cluster/nodemanager.h | 7 +++++
> fs/ocfs2/cluster/quorum.c | 14 +++++++++-
> 3 files changed, 69 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
> index 7ee6188..77cdc09 100644
> --- a/fs/ocfs2/cluster/nodemanager.c
> +++ b/fs/ocfs2/cluster/nodemanager.c
> @@ -35,6 +35,11 @@
> * cluster references throughout where nodes are looked up */
> struct o2nm_cluster *o2nm_single_cluster = NULL;
>
> +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS + 1] = {
> + "reset", /* O2NM_FENCE_RESET */
> + "panic", /* O2NM_FENCE_PANIC */
> + "unknown",
> +};

Why do you have "unknown"? There's no point to it; we default to "reset" and you never allow anything above O2NM_FENCE_METHODS to be set.

> +static ssize_t o2nm_cluster_attr_fence_method_read(
> + struct o2nm_cluster *cluster, char *page)
> +{
> + unsigned int i = O2NM_FENCE_METHODS;
> +
> + if (cluster && cluster->cl_fence_method < O2NM_FENCE_METHODS)
> + i = cluster->cl_fence_method;

cl_fence_method should always be below O2NM_FENCE_METHODS. You only need to check if(cluster).
If cl_fence_method is off, we'll crash, which is fine.

> + switch (o2nm_single_cluster->cl_fence_method) {
> + case O2NM_FENCE_PANIC:
> + panic("*** ocfs2 is very sorry to be fencing this system by "
> + "panicing ***\n");
> + break;
> + case O2NM_FENCE_RESET:
> + default:

Why not:

+ default:
+ WARN_ON(o2nm_single_cluster->cl_fence_method >=
+ O2NM_FENCE_METHODS);
+ case O2NM_FENCE_RESET:

so that netconsole will catch a wacky cl_fence_method.

> + printk("*** ocfs2 is very sorry to be fencing this system by "
> + "restarting ***\n");

And here, print KERN_ERR so no one filters it out with low klogd levels.

Joel

--
"For every complex problem there exists a solution that is brief, concise, and totally wrong." -Unknown

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127