Ian Main
2008-Jun-12  15:35 UTC
[Ovirt-devel] [PATCH] Use multiple processes to check host status
This patch causes host-status to fork() up to 10 times to connect out
to hosts via libvirt.  This should help with the bottleneck we were
seeing with libvirt connect timeouts.
Signed-off-by: Ian Main <imain at redhat.com>
---
 wui/src/host-status/host-status.rb |  193 +++++++++++++++++++++---------------
 1 files changed, 114 insertions(+), 79 deletions(-)
diff --git a/wui/src/host-status/host-status.rb
b/wui/src/host-status/host-status.rb
index 41638da..eddd348 100755
--- a/wui/src/host-status/host-status.rb
+++ b/wui/src/host-status/host-status.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/ruby
-# 
+#
 # Copyright (C) 2008 Red Hat, Inc.
 # Written by Chris Lalancette <clalance at redhat.com>
 #
@@ -29,7 +29,7 @@ include Daemonize
 $logfile = '/var/log/ovirt-wui/host-status.log'
 
 do_daemon = true
-sleeptime = 5
+sleeptime = 20
 opts = OptionParser.new do |opts|
   opts.on("-h", "--help", "Print help message")
do
     puts opts
@@ -97,104 +97,139 @@ def kick_taskomatic(msg, vm)
   task.save
 end
 
-loop do
-  get_credentials
 
-  hosts = Host.find(:all)
-  hosts.each do |host|
-    
-    begin
-      conn = Libvirt::open("qemu+tcp://" + host.hostname +
"/system")
-    rescue
-      # we couldn't contact the host for whatever reason.  Since we
can't get
-      # to this host, we have to mark all vms on it as disconnected or stopped
-      # or such.
-      if host.state != "unavailable"
-        puts "Updating host state to unavailable: " + host.hostname
-        host.state = "unavailable"
-        host.save
-      end
+def check_status(host)
 
-      Vm.find(:all, :conditions => [ "host_id = ?", host.id
]).each do |vm|
-        # Since we can't reach the host on which the vms reside, we mark
these
-        # as STATE_UNREACHABLE.  If they come back up we can mark them as
-        # running again, else they'll be stopped.  At least for now the
user
-	# will know what's going on.
-        #
-        # If this causes too much trouble in the UI, this can be changed to
-        # STATE_STOPPED for now until it is resolved of another solution is
-        # brought forward.
-
-        if vm.state != Vm::STATE_UNREACHABLE:
-          kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
-        end
+  # This is in a new process, we need a new database connection.
+  database_connect
+
+  begin
+    puts "Connecting to host " + host.hostname
+    conn = Libvirt::open("qemu+tcp://" + host.hostname +
"/system")
+  rescue
+    # we couldn't contact the host for whatever reason.  Since we can't
get
+    # to this host, we have to mark all vms on it as disconnected or stopped
+    # or such.
+    if host.state != "unavailable"
+      puts "Updating host state to unavailable: " + host.hostname
+      host.state = "unavailable"
+      host.save
+    end
+
+    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each
do |vm|
+      # Since we can't reach the host on which the vms reside, we mark
these
+      # as STATE_UNREACHABLE.  If they come back up we can mark them as
+      # running again, else they'll be stopped.  At least for now the user
+      # will know what's going on.
+      #
+      # If this causes too much trouble in the UI, this can be changed to
+      # STATE_STOPPED for now until it is resolved of another solution is
+      # brought forward.
+
+      if vm.state != Vm::STATE_UNREACHABLE:
+        kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
       end
 
+    end
+
+    return
+  end
+
+  if host.state != "available"
+    puts "Updating host state to available: " + host.hostname
+    host.state = "available"
+    host.save
+  end
+
+  begin
+    vm_ids = conn.list_domains
+  rescue
+    puts "Failed to request domain list on host " + host.hostname
+    conn.close
+    next
+  end
+
+  # Here we're going through every vm listed through libvirt.  This
+  # really only lets us find ones that are started that shouldn't be.
+  vm_ids.each do |vm_id|
+    puts "VM ID: %d" % [vm_id]
+    begin
+      dom = conn.lookup_domain_by_id(vm_id)
+    rescue
+      puts "Failed to find domain " + vm.description
       next
     end
 
-    if host.state != "available"
-      puts "Updating host state to available: " + host.hostname
-      host.state = "available"
-      host.save
+    vm_uuid = dom.uuid
+    info = dom.info
+
+    puts "VM UUID: %s" % [vm_uuid]
+    info = dom.info
+
+    vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
+    if vm == nil
+      puts "VM Not found in database, must be created by user.  giving
up."
+      next
     end
 
+    check_state(vm, info)
+  end
+
+  # Now we get a list of all vms that should be on this system and see if
+  # they are all running.
+  Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do
|vm|
+
     begin
-      vm_ids = conn.list_domains
+      dom = conn.lookup_domain_by_uuid(vm.uuid)
     rescue
-      puts "Failed to request domain list on host " + host.hostname
-      conn.close
+      # OK.  We couldn't find the UUID that we thought was there.  The only
+      # explanation is that the domain is dead.
+      puts "Failed to find domain " + vm.description
+      kick_taskomatic(Vm::STATE_STOPPED, vm)
       next
     end
+    info = dom.info
+    check_state(vm, info)
 
-    # Here we're going through every vm listed through libvirt.  This
-    # really only lets us find ones that are started that shouldn't be.
-    vm_ids.each do |vm_id|
-      puts "VM ID: %d" % [vm_id]
-      begin
-        dom = conn.lookup_domain_by_id(vm_id)
-      rescue
-        puts "Failed to find domain " + vm.description
-        next
-      end
-      
-      vm_uuid = dom.uuid
-      info = dom.info
-
-      puts "VM UUID: %s" % [vm_uuid]
-      info = dom.info
-      puts info.to_s
- 
-      vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
-      if vm == nil
-        puts "VM Not found in database, must be created by user.  giving
up."
-        next
-      end
+    conn.close
 
-      check_state(vm, info)
-    end
+  end
+end
 
-    # Now we get a list of all vms that should be on this system and see if
-    # they are all running.
-    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each
do |vm|
-    
-      begin
-        dom = conn.lookup_domain_by_uuid(vm.uuid)
-      rescue
-        # OK.  We couldn't find the UUID that we thought was there.  The
only
-        # explanation is that the domain is dead.
-        puts "Failed to find domain " + vm.description
-        kick_taskomatic(Vm::STATE_STOPPED, vm)
-        next
-      end
-      info = dom.info
-      check_state(vm, info)
+get_credentials
 
-      conn.close
+loop do
+
+  # fork() seems to really mess with our db connection.  Need to have this
+  # in the main connection as well.  I verified it's not leaking
connections/fds.
+  database_connect
+  hosts = Host.find(:all)
+
+  p_count = 0
+  hosts.each do |host|
+
+    p_count += 1
 
+    # Only allow up to 10 processes running at a time.  If we go above 10
+    # Then we wait for one to exit before continuing.
+    if p_count > 10
+      Process.wait
+      p_count -= 1
     end
+
+    fork do
+      check_status(host)
+      exit 0
+    end
+
   end
 
+  while p_count > 0
+    Process.wait
+    p_count -= 1
+  end
+
+
   STDOUT.flush
   sleep sleeptime
 end
-- 
1.5.5.1
Ian Main
2008-Jun-13  21:38 UTC
[Ovirt-devel] [PATCH] Use multiple processes to check host status
This patch causes host-status to fork() up to node_count/5 times to
connect out to hosts via libvirt.  This guarantees that it takes at
most 5 timeouts in a row to verify all nodes.  This should help with the
bottleneck we were seeing with libvirt connect timeouts.  Testing with 105
nodes, almost all of which were down, it took 27s to query all of them.
Signed-off-by: Ian Main <imain at redhat.com>
---
 wui/src/host-status/host-status.rb |  194 +++++++++++++++++++++---------------
 1 files changed, 115 insertions(+), 79 deletions(-)
diff --git a/wui/src/host-status/host-status.rb
b/wui/src/host-status/host-status.rb
index 41638da..fcfd586 100755
--- a/wui/src/host-status/host-status.rb
+++ b/wui/src/host-status/host-status.rb
@@ -1,5 +1,5 @@
 #!/usr/bin/ruby
-# 
+#
 # Copyright (C) 2008 Red Hat, Inc.
 # Written by Chris Lalancette <clalance at redhat.com>
 #
@@ -29,7 +29,7 @@ include Daemonize
 $logfile = '/var/log/ovirt-wui/host-status.log'
 
 do_daemon = true
-sleeptime = 5
+sleeptime = 20
 opts = OptionParser.new do |opts|
   opts.on("-h", "--help", "Print help message")
do
     puts opts
@@ -97,104 +97,140 @@ def kick_taskomatic(msg, vm)
   task.save
 end
 
-loop do
-  get_credentials
 
-  hosts = Host.find(:all)
-  hosts.each do |host|
-    
-    begin
-      conn = Libvirt::open("qemu+tcp://" + host.hostname +
"/system")
-    rescue
-      # we couldn't contact the host for whatever reason.  Since we
can't get
-      # to this host, we have to mark all vms on it as disconnected or stopped
-      # or such.
-      if host.state != "unavailable"
-        puts "Updating host state to unavailable: " + host.hostname
-        host.state = "unavailable"
-        host.save
-      end
+def check_status(host)
 
-      Vm.find(:all, :conditions => [ "host_id = ?", host.id
]).each do |vm|
-        # Since we can't reach the host on which the vms reside, we mark
these
-        # as STATE_UNREACHABLE.  If they come back up we can mark them as
-        # running again, else they'll be stopped.  At least for now the
user
-	# will know what's going on.
-        #
-        # If this causes too much trouble in the UI, this can be changed to
-        # STATE_STOPPED for now until it is resolved of another solution is
-        # brought forward.
-
-        if vm.state != Vm::STATE_UNREACHABLE:
-          kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
-        end
+  # This is in a new process, we need a new database connection.
+  database_connect
+
+  begin
+    puts "Connecting to host " + host.hostname
+    conn = Libvirt::open("qemu+tcp://" + host.hostname +
"/system")
+  rescue
+    # we couldn't contact the host for whatever reason.  Since we can't
get
+    # to this host, we have to mark all vms on it as disconnected or stopped
+    # or such.
+    if host.state != "unavailable"
+      puts "Updating host state to unavailable: " + host.hostname
+      host.state = "unavailable"
+      host.save
+    end
+
+    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each
do |vm|
+      # Since we can't reach the host on which the vms reside, we mark
these
+      # as STATE_UNREACHABLE.  If they come back up we can mark them as
+      # running again, else they'll be stopped.  At least for now the user
+      # will know what's going on.
+      #
+      # If this causes too much trouble in the UI, this can be changed to
+      # STATE_STOPPED for now until it is resolved of another solution is
+      # brought forward.
+
+      if vm.state != Vm::STATE_UNREACHABLE:
+        kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
       end
 
+    end
+
+    return
+  end
+
+  if host.state != "available"
+    puts "Updating host state to available: " + host.hostname
+    host.state = "available"
+    host.save
+  end
+
+  begin
+    vm_ids = conn.list_domains
+  rescue
+    puts "Failed to request domain list on host " + host.hostname
+    conn.close
+    next
+  end
+
+  # Here we're going through every vm listed through libvirt.  This
+  # really only lets us find ones that are started that shouldn't be.
+  vm_ids.each do |vm_id|
+    puts "VM ID: %d" % [vm_id]
+    begin
+      dom = conn.lookup_domain_by_id(vm_id)
+    rescue
+      puts "Failed to find domain " + vm.description
       next
     end
 
-    if host.state != "available"
-      puts "Updating host state to available: " + host.hostname
-      host.state = "available"
-      host.save
+    vm_uuid = dom.uuid
+    info = dom.info
+
+    puts "VM UUID: %s" % [vm_uuid]
+    info = dom.info
+
+    vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
+    if vm == nil
+      puts "VM Not found in database, must be created by user.  giving
up."
+      next
     end
 
+    check_state(vm, info)
+  end
+
+  # Now we get a list of all vms that should be on this system and see if
+  # they are all running.
+  Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do
|vm|
+
     begin
-      vm_ids = conn.list_domains
+      dom = conn.lookup_domain_by_uuid(vm.uuid)
     rescue
-      puts "Failed to request domain list on host " + host.hostname
-      conn.close
+      # OK.  We couldn't find the UUID that we thought was there.  The only
+      # explanation is that the domain is dead.
+      puts "Failed to find domain " + vm.description
+      kick_taskomatic(Vm::STATE_STOPPED, vm)
       next
     end
+    info = dom.info
+    check_state(vm, info)
 
-    # Here we're going through every vm listed through libvirt.  This
-    # really only lets us find ones that are started that shouldn't be.
-    vm_ids.each do |vm_id|
-      puts "VM ID: %d" % [vm_id]
-      begin
-        dom = conn.lookup_domain_by_id(vm_id)
-      rescue
-        puts "Failed to find domain " + vm.description
-        next
-      end
-      
-      vm_uuid = dom.uuid
-      info = dom.info
-
-      puts "VM UUID: %s" % [vm_uuid]
-      info = dom.info
-      puts info.to_s
- 
-      vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
-      if vm == nil
-        puts "VM Not found in database, must be created by user.  giving
up."
-        next
-      end
+    conn.close
 
-      check_state(vm, info)
-    end
+  end
+end
 
-    # Now we get a list of all vms that should be on this system and see if
-    # they are all running.
-    Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each
do |vm|
-    
-      begin
-        dom = conn.lookup_domain_by_uuid(vm.uuid)
-      rescue
-        # OK.  We couldn't find the UUID that we thought was there.  The
only
-        # explanation is that the domain is dead.
-        puts "Failed to find domain " + vm.description
-        kick_taskomatic(Vm::STATE_STOPPED, vm)
-        next
-      end
-      info = dom.info
-      check_state(vm, info)
+get_credentials
 
-      conn.close
+loop do
+
+  # fork() seems to really mess with our db connection.  Need to have this
+  # in the main connection as well.  I verified it's not leaking
connections/fds.
+  database_connect
+  hosts = Host.find(:all)
+
+  p_count = 0
+  hosts.each do |host|
+
+    p_count += 1
 
+    # Only allow up to n_hosts / 5 processes running at a time.  If we go above
this
+    # Then we wait for one to exit before continuing.  This guarantees it will
take
+    # at most 5 timeouts to check all hosts.
+    if p_count > hosts.length / 5
+      Process.wait
+      p_count -= 1
     end
+
+    fork do
+      check_status(host)
+      exit 0
+    end
+
   end
 
+  while p_count > 0
+    Process.wait
+    p_count -= 1
+  end
+
+
   STDOUT.flush
   sleep sleeptime
 end
-- 
1.5.5.1