On 3/15/07, Mustafa A. Hashmi <mahashmi@gmail.com> wrote:
> David:
>
> On 3/14/07, David Vasil <dmvasil@ornl.gov> wrote:
> > What are people doing for failover (at the lustre layer) under 1.4.X
> > series lustre? Specifically the failing of OSTs between a failed host
> > and its failover pair.
>
> The failover bit is easily controlled via lconf and grouping of nodes. The
> issue, as you list it further on:
>
> > Under 1.4.9 I have found that the --group feature to lconf does not
> > appear to work. Likewise I have had issues with "lconf --cleanup
> > --force --service <ost> <config file>" trying to
> > unload all of lustre modules on a running OSS (which leaves the OSS in
> > somewhat of a bad state).
>
> is exactly what I am facing as well. Recently on a 1.4.9 cluster while
> trying to ''fail-back'' an ost to the primary oss, the
> secondary oss which had
> taken over services refused to give them up. Unfortunately time on that
> cluster was limited for me and I am relegated to setting 1.4.9 up on a few
> new systems to carry on testing.
>
> I will update you (and all) within 2 days hopefully. If anyone else can
> pipe
> in what David and myself may be doing wrong given the lconf commands listed
> above, it would greatly help.
> Regards,
> --
Okay, so from my understanding we are trying the same things on a
production cluster with 1.4.6 (ish), and we had to recode lactive to
use config files since we no longer needed ldap. From my
understanding, failover is supposed to happen using lactive and is
kind of a manual process. (1.6 takes care of all this with magic
and the mgs.)
So I had to patch lactive and lustredb.py to use config files as well as ldap.
======= lactive patch ========
--- b/lactive 2007-02-22 13:59:49.000000000 -0800
+++ a/lactive 2007-03-15 08:47:05.987903000 -0700
@@ -31,6 +31,7 @@
import sys, getopt, types
import string, os
import ldap
+import xml.dom.minidom
from stat import S_IROTH, S_IRGRP
PYMOD_DIR = [ "/usr/lib64/lustre/python",
"/usr/lib/lustre/python" ]
@@ -52,6 +53,7 @@
(''group'', "The group of devices to update",
PARAM),
(''active'', "The active node name", PARAM),
(''pwfile'', "File containing password", PARAM),
+ (''config_file'', "Config file to use instead of
ldap", PARAM)
]
def fatal(*args):
@@ -59,44 +61,57 @@
print "! " + msg
sys.exit(1)
+class stdoutWriter:
+ def stdoutWrite(self, text):
+ print text
+
cl = Lustre.Options("lactive","", lactive_options)
config, args = cl.parse(sys.argv[1:])
if not (config.group or config.active):
- fatal("Must specify both group and active node.")
+ fatal("Must specify both group and active node.")
+
+if not config.config_file:
+ if not config.config:
+ fatal("Missing config")
+
+ if config.pwfile:
+ try:
+ pwperm = os.stat(config.pwfile)[0]
+ pwreadable = pwperm & (S_IRGRP | S_IROTH)
+ if pwreadable:
+ if pwreadable == (S_IRGRP | S_IROTH):
+ readable_by = "group and others"
+ elif pwreadable == S_IRGRP:
+ readable_by = "group"
+ else:
+ readable_by = "others"
+ print "WARNING: Password file %s is readable by %s" %
(
+ config.pwfile, readable_by)
+
+ pwfile = open(config.pwfile, "r")
+ pw = string.strip(pwfile.readline())
+ pwfile.close()
+ except Exception, e:
+ fatal("Can''t read secret from pwfile %s: %s" %
(config.pwfile, e))
+ else:
+ print "no pwfile specified, binding anonymously"
+ pw = ""
-if not config.config:
- fatal("Missing config")
+ base = "config=%s,fs=lustre" % (config.config,)
-if config.pwfile:
try:
- pwperm = os.stat(config.pwfile)[0]
- pwreadable = pwperm & (S_IRGRP | S_IROTH)
- if pwreadable:
- if pwreadable == (S_IRGRP | S_IROTH):
- readable_by = "group and others"
- elif pwreadable == S_IRGRP:
- readable_by = "group"
- else:
- readable_by = "others"
- print "WARNING: Password file %s is readable by %s" % (
- config.pwfile, readable_by)
-
- pwfile = open(config.pwfile, "r")
- pw = string.strip(pwfile.readline())
- pwfile.close()
- except Exception, e:
- fatal("Can''t read secret from pwfile %s: %s" %
(config.pwfile, e))
+ db = Lustre.LustreDB_LDAP('''', {}, base=base, pw = pw,
url config.ldapurl, update = 1)
+ except Lustre.error.LconfError, e:
+ print e
+ sys.exit(1)
else:
- print "no pwfile specified, binding anonymously"
- pw = ""
-
-base = "config=%s,fs=lustre" % (config.config,)
-try:
- db = Lustre.LustreDB_LDAP('''', {}, base=base, pw = pw, url
config.ldapurl, update = 1)
-except Lustre.error.LconfError, e:
- print e
- sys.exit(1)
+ try:
+ dom = xml.dom.minidom.parse(config.config_file)
+ db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+ except Lustre.error.LconfError, e:
+ print e
+ sys.exit(1)
active_node = db.lookup_name(config.active)
if not active_node:
@@ -118,7 +133,13 @@
config.active, new_active_uuid))
db.update_active(tgtuuid, new_active_uuid)
-
-
-
-
+if config.config_file:
+ fp = open(config.config_file+".new", "w")
+ try:
+ db.dom_node.writexml(fp)
+ except ImportError, e:
+ print e
+ print "Bah!!! there was an import error!!!"
+ sys.exit(1)
+ os.rename(config.config_file, config.config_file+"~")
+ os.rename(config.config_file+".new", config.config_file)
============ end lactive patch ========================
============ lustredb.py patch ========================
--- a/lustredb.py 2007-03-14 09:57:20.000000000 -0700
+++ b/lustredb.py 2007-03-15 08:51:52.028975000 -0700
@@ -384,7 +384,10 @@
return ret
def _update_active(self, tgt, new):
- raise Lustre.LconfError("updates not implemented for XML")
+ node = self.xmllookup_by_uuid(self.dom_node, tgt)
+ children = node.getElementsByTagName("active_ref")
+ active_ref = children[0]
+ active_ref.setAttribute("uuidref", new)
# =============================================================== # LDAP
Support
=============== end lustredb.py patch =========================