Ian Main
2008-Jun-12 15:35 UTC
[Ovirt-devel] [PATCH] Use multiple processes to check host status
This patch causes host-status to fork() up to 10 times to connect out to hosts via libvirt. This should help with the bottleneck we were seeing with libvirt connect timeouts. Signed-off-by: Ian Main <imain at redhat.com> --- wui/src/host-status/host-status.rb | 193 +++++++++++++++++++++--------------- 1 files changed, 114 insertions(+), 79 deletions(-) diff --git a/wui/src/host-status/host-status.rb b/wui/src/host-status/host-status.rb index 41638da..eddd348 100755 --- a/wui/src/host-status/host-status.rb +++ b/wui/src/host-status/host-status.rb @@ -1,5 +1,5 @@ #!/usr/bin/ruby -# +# # Copyright (C) 2008 Red Hat, Inc. # Written by Chris Lalancette <clalance at redhat.com> # @@ -29,7 +29,7 @@ include Daemonize $logfile = '/var/log/ovirt-wui/host-status.log' do_daemon = true -sleeptime = 5 +sleeptime = 20 opts = OptionParser.new do |opts| opts.on("-h", "--help", "Print help message") do puts opts @@ -97,104 +97,139 @@ def kick_taskomatic(msg, vm) task.save end -loop do - get_credentials - hosts = Host.find(:all) - hosts.each do |host| - - begin - conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system") - rescue - # we couldn't contact the host for whatever reason. Since we can't get - # to this host, we have to mark all vms on it as disconnected or stopped - # or such. - if host.state != "unavailable" - puts "Updating host state to unavailable: " + host.hostname - host.state = "unavailable" - host.save - end +def check_status(host) - Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| - # Since we can't reach the host on which the vms reside, we mark these - # as STATE_UNREACHABLE. If they come back up we can mark them as - # running again, else they'll be stopped. At least for now the user - # will know what's going on. - # - # If this causes too much trouble in the UI, this can be changed to - # STATE_STOPPED for now until it is resolved of another solution is - # brought forward. 
- - if vm.state != Vm::STATE_UNREACHABLE: - kick_taskomatic(Vm::STATE_UNREACHABLE, vm) - end + # This is in a new process, we need a new database connection. + database_connect + + begin + puts "Connecting to host " + host.hostname + conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system") + rescue + # we couldn't contact the host for whatever reason. Since we can't get + # to this host, we have to mark all vms on it as disconnected or stopped + # or such. + if host.state != "unavailable" + puts "Updating host state to unavailable: " + host.hostname + host.state = "unavailable" + host.save + end + + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| + # Since we can't reach the host on which the vms reside, we mark these + # as STATE_UNREACHABLE. If they come back up we can mark them as + # running again, else they'll be stopped. At least for now the user + # will know what's going on. + # + # If this causes too much trouble in the UI, this can be changed to + # STATE_STOPPED for now until it is resolved of another solution is + # brought forward. + + if vm.state != Vm::STATE_UNREACHABLE: + kick_taskomatic(Vm::STATE_UNREACHABLE, vm) end + end + + return + end + + if host.state != "available" + puts "Updating host state to available: " + host.hostname + host.state = "available" + host.save + end + + begin + vm_ids = conn.list_domains + rescue + puts "Failed to request domain list on host " + host.hostname + conn.close + next + end + + # Here we're going through every vm listed through libvirt. This + # really only lets us find ones that are started that shouldn't be. 
+ vm_ids.each do |vm_id| + puts "VM ID: %d" % [vm_id] + begin + dom = conn.lookup_domain_by_id(vm_id) + rescue + puts "Failed to find domain " + vm.description next end - if host.state != "available" - puts "Updating host state to available: " + host.hostname - host.state = "available" - host.save + vm_uuid = dom.uuid + info = dom.info + + puts "VM UUID: %s" % [vm_uuid] + info = dom.info + + vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ]) + if vm == nil + puts "VM Not found in database, must be created by user. giving up." + next end + check_state(vm, info) + end + + # Now we get a list of all vms that should be on this system and see if + # they are all running. + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| + begin - vm_ids = conn.list_domains + dom = conn.lookup_domain_by_uuid(vm.uuid) rescue - puts "Failed to request domain list on host " + host.hostname - conn.close + # OK. We couldn't find the UUID that we thought was there. The only + # explanation is that the domain is dead. + puts "Failed to find domain " + vm.description + kick_taskomatic(Vm::STATE_STOPPED, vm) next end + info = dom.info + check_state(vm, info) - # Here we're going through every vm listed through libvirt. This - # really only lets us find ones that are started that shouldn't be. - vm_ids.each do |vm_id| - puts "VM ID: %d" % [vm_id] - begin - dom = conn.lookup_domain_by_id(vm_id) - rescue - puts "Failed to find domain " + vm.description - next - end - - vm_uuid = dom.uuid - info = dom.info - - puts "VM UUID: %s" % [vm_uuid] - info = dom.info - puts info.to_s - - vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ]) - if vm == nil - puts "VM Not found in database, must be created by user. giving up." - next - end + conn.close - check_state(vm, info) - end + end +end - # Now we get a list of all vms that should be on this system and see if - # they are all running. 
- Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| - - begin - dom = conn.lookup_domain_by_uuid(vm.uuid) - rescue - # OK. We couldn't find the UUID that we thought was there. The only - # explanation is that the domain is dead. - puts "Failed to find domain " + vm.description - kick_taskomatic(Vm::STATE_STOPPED, vm) - next - end - info = dom.info - check_state(vm, info) +get_credentials - conn.close +loop do + + # fork() seems to really mess with our db connection. Need to have this + # in the main connection as well. I verified it's not leaking connections/fds. + database_connect + hosts = Host.find(:all) + + p_count = 0 + hosts.each do |host| + + p_count += 1 + # Only allow up to 10 processes running at a time. If we go above 10 + # Then we wait for one to exit before continuing. + if p_count > 10 + Process.wait + p_count -= 1 end + + fork do + check_status(host) + exit 0 + end + end + while p_count > 0 + Process.wait + p_count -= 1 + end + + STDOUT.flush sleep sleeptime end -- 1.5.5.1
Ian Main
2008-Jun-13 21:38 UTC
[Ovirt-devel] [PATCH] Use multiple processes to check host status
This patch causes host-status to fork() up to node_count/5 times to connect out to hosts via libvirt. This guarantees that it takes at most 5 timeouts in a row to verify all nodes. This should help with the bottleneck we were seeing with libvirt connect timeouts. Testing with 105 nodes, almost all of which were down, it took 27s to query all of them. Signed-off-by: Ian Main <imain at redhat.com> --- wui/src/host-status/host-status.rb | 194 +++++++++++++++++++++--------------- 1 files changed, 115 insertions(+), 79 deletions(-) diff --git a/wui/src/host-status/host-status.rb b/wui/src/host-status/host-status.rb index 41638da..fcfd586 100755 --- a/wui/src/host-status/host-status.rb +++ b/wui/src/host-status/host-status.rb @@ -1,5 +1,5 @@ #!/usr/bin/ruby -# +# # Copyright (C) 2008 Red Hat, Inc. # Written by Chris Lalancette <clalance at redhat.com> # @@ -29,7 +29,7 @@ include Daemonize $logfile = '/var/log/ovirt-wui/host-status.log' do_daemon = true -sleeptime = 5 +sleeptime = 20 opts = OptionParser.new do |opts| opts.on("-h", "--help", "Print help message") do puts opts @@ -97,104 +97,140 @@ def kick_taskomatic(msg, vm) task.save end -loop do - get_credentials - hosts = Host.find(:all) - hosts.each do |host| - - begin - conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system") - rescue - # we couldn't contact the host for whatever reason. Since we can't get - # to this host, we have to mark all vms on it as disconnected or stopped - # or such. - if host.state != "unavailable" - puts "Updating host state to unavailable: " + host.hostname - host.state = "unavailable" - host.save - end +def check_status(host) - Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| - # Since we can't reach the host on which the vms reside, we mark these - # as STATE_UNREACHABLE. If they come back up we can mark them as - # running again, else they'll be stopped. At least for now the user - # will know what's going on. 
- # - # If this causes too much trouble in the UI, this can be changed to - # STATE_STOPPED for now until it is resolved of another solution is - # brought forward. - - if vm.state != Vm::STATE_UNREACHABLE: - kick_taskomatic(Vm::STATE_UNREACHABLE, vm) - end + # This is in a new process, we need a new database connection. + database_connect + + begin + puts "Connecting to host " + host.hostname + conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system") + rescue + # we couldn't contact the host for whatever reason. Since we can't get + # to this host, we have to mark all vms on it as disconnected or stopped + # or such. + if host.state != "unavailable" + puts "Updating host state to unavailable: " + host.hostname + host.state = "unavailable" + host.save + end + + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| + # Since we can't reach the host on which the vms reside, we mark these + # as STATE_UNREACHABLE. If they come back up we can mark them as + # running again, else they'll be stopped. At least for now the user + # will know what's going on. + # + # If this causes too much trouble in the UI, this can be changed to + # STATE_STOPPED for now until it is resolved of another solution is + # brought forward. + + if vm.state != Vm::STATE_UNREACHABLE: + kick_taskomatic(Vm::STATE_UNREACHABLE, vm) end + end + + return + end + + if host.state != "available" + puts "Updating host state to available: " + host.hostname + host.state = "available" + host.save + end + + begin + vm_ids = conn.list_domains + rescue + puts "Failed to request domain list on host " + host.hostname + conn.close + next + end + + # Here we're going through every vm listed through libvirt. This + # really only lets us find ones that are started that shouldn't be. 
+ vm_ids.each do |vm_id| + puts "VM ID: %d" % [vm_id] + begin + dom = conn.lookup_domain_by_id(vm_id) + rescue + puts "Failed to find domain " + vm.description next end - if host.state != "available" - puts "Updating host state to available: " + host.hostname - host.state = "available" - host.save + vm_uuid = dom.uuid + info = dom.info + + puts "VM UUID: %s" % [vm_uuid] + info = dom.info + + vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ]) + if vm == nil + puts "VM Not found in database, must be created by user. giving up." + next end + check_state(vm, info) + end + + # Now we get a list of all vms that should be on this system and see if + # they are all running. + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| + begin - vm_ids = conn.list_domains + dom = conn.lookup_domain_by_uuid(vm.uuid) rescue - puts "Failed to request domain list on host " + host.hostname - conn.close + # OK. We couldn't find the UUID that we thought was there. The only + # explanation is that the domain is dead. + puts "Failed to find domain " + vm.description + kick_taskomatic(Vm::STATE_STOPPED, vm) next end + info = dom.info + check_state(vm, info) - # Here we're going through every vm listed through libvirt. This - # really only lets us find ones that are started that shouldn't be. - vm_ids.each do |vm_id| - puts "VM ID: %d" % [vm_id] - begin - dom = conn.lookup_domain_by_id(vm_id) - rescue - puts "Failed to find domain " + vm.description - next - end - - vm_uuid = dom.uuid - info = dom.info - - puts "VM UUID: %s" % [vm_uuid] - info = dom.info - puts info.to_s - - vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ]) - if vm == nil - puts "VM Not found in database, must be created by user. giving up." - next - end + conn.close - check_state(vm, info) - end + end +end - # Now we get a list of all vms that should be on this system and see if - # they are all running. 
- Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm| - - begin - dom = conn.lookup_domain_by_uuid(vm.uuid) - rescue - # OK. We couldn't find the UUID that we thought was there. The only - # explanation is that the domain is dead. - puts "Failed to find domain " + vm.description - kick_taskomatic(Vm::STATE_STOPPED, vm) - next - end - info = dom.info - check_state(vm, info) +get_credentials - conn.close +loop do + + # fork() seems to really mess with our db connection. Need to have this + # in the main connection as well. I verified it's not leaking connections/fds. + database_connect + hosts = Host.find(:all) + + p_count = 0 + hosts.each do |host| + + p_count += 1 + # Only allow up to n_hosts / 5 processes running at a time. If we go above this + # Then we wait for one to exit before continuing. This guarantees it will take + # at most 5 timeouts to check all hosts. + if p_count > hosts.length / 5 + Process.wait + p_count -= 1 end + + fork do + check_status(host) + exit 0 + end + end + while p_count > 0 + Process.wait + p_count -= 1 + end + + STDOUT.flush sleep sleeptime end -- 1.5.5.1