[Ovirt-devel] [PATCH] Use multiple processes to check host status
Hugh O. Brock
hbrock at redhat.com
Mon Jun 16 19:35:00 UTC 2008
On Fri, Jun 13, 2008 at 02:38:16PM -0700, Ian Main wrote:
> This patch causes host-status to fork() up to node_count/5 times to
> connect out to hosts via libvirt. This guarantees that it takes at
> most 5 timeouts in a row to verify all nodes. This should help with the
> bottleneck we were seeing with libvirt connect timeouts. Testing with 105
> nodes, almost all of which were down, it took 27s to query all of them.
>
> Signed-off-by: Ian Main <imain at redhat.com>
> ---
> wui/src/host-status/host-status.rb | 194 +++++++++++++++++++++---------------
> 1 files changed, 115 insertions(+), 79 deletions(-)
>
> diff --git a/wui/src/host-status/host-status.rb b/wui/src/host-status/host-status.rb
> index 41638da..fcfd586 100755
> --- a/wui/src/host-status/host-status.rb
> +++ b/wui/src/host-status/host-status.rb
> @@ -1,5 +1,5 @@
> #!/usr/bin/ruby
> -#
> +#
> # Copyright (C) 2008 Red Hat, Inc.
> # Written by Chris Lalancette <clalance at redhat.com>
> #
> @@ -29,7 +29,7 @@ include Daemonize
> $logfile = '/var/log/ovirt-wui/host-status.log'
>
> do_daemon = true
> -sleeptime = 5
> +sleeptime = 20
> opts = OptionParser.new do |opts|
> opts.on("-h", "--help", "Print help message") do
> puts opts
> @@ -97,104 +97,140 @@ def kick_taskomatic(msg, vm)
> task.save
> end
>
> -loop do
> - get_credentials
>
> - hosts = Host.find(:all)
> - hosts.each do |host|
> -
> - begin
> - conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
> - rescue
> - # we couldn't contact the host for whatever reason. Since we can't get
> - # to this host, we have to mark all vms on it as disconnected or stopped
> - # or such.
> - if host.state != "unavailable"
> - puts "Updating host state to unavailable: " + host.hostname
> - host.state = "unavailable"
> - host.save
> - end
> +def check_status(host)
>
> - Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
> - # Since we can't reach the host on which the vms reside, we mark these
> - # as STATE_UNREACHABLE. If they come back up we can mark them as
> - # running again, else they'll be stopped. At least for now the user
> - # will know what's going on.
> - #
> - # If this causes too much trouble in the UI, this can be changed to
> - # STATE_STOPPED for now until it is resolved of another solution is
> - # brought forward.
> -
> - if vm.state != Vm::STATE_UNREACHABLE:
> - kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
> - end
> + # This is in a new process, we need a new database connection.
> + database_connect
> +
> + begin
> + puts "Connecting to host " + host.hostname
> + conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
> + rescue
> + # we couldn't contact the host for whatever reason. Since we can't get
> + # to this host, we have to mark all vms on it as disconnected or stopped
> + # or such.
> + if host.state != "unavailable"
> + puts "Updating host state to unavailable: " + host.hostname
> + host.state = "unavailable"
> + host.save
> + end
> +
> + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
> + # Since we can't reach the host on which the vms reside, we mark these
> + # as STATE_UNREACHABLE. If they come back up we can mark them as
> + # running again, else they'll be stopped. At least for now the user
> + # will know what's going on.
> + #
> + # If this causes too much trouble in the UI, this can be changed to
> + # STATE_STOPPED for now until it is resolved of another solution is
> + # brought forward.
> +
> + if vm.state != Vm::STATE_UNREACHABLE:
> + kick_taskomatic(Vm::STATE_UNREACHABLE, vm)
> end
>
> + end
> +
> + return
> + end
> +
> + if host.state != "available"
> + puts "Updating host state to available: " + host.hostname
> + host.state = "available"
> + host.save
> + end
> +
> + begin
> + vm_ids = conn.list_domains
> + rescue
> + puts "Failed to request domain list on host " + host.hostname
> + conn.close
> + next
> + end
> +
> + # Here we're going through every vm listed through libvirt. This
> + # really only lets us find ones that are started that shouldn't be.
> + vm_ids.each do |vm_id|
> + puts "VM ID: %d" % [vm_id]
> + begin
> + dom = conn.lookup_domain_by_id(vm_id)
> + rescue
> + puts "Failed to find domain " + vm.description
> next
> end
>
> - if host.state != "available"
> - puts "Updating host state to available: " + host.hostname
> - host.state = "available"
> - host.save
> + vm_uuid = dom.uuid
> + info = dom.info
> +
> + puts "VM UUID: %s" % [vm_uuid]
> + info = dom.info
> +
> + vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
> + if vm == nil
> + puts "VM Not found in database, must be created by user. giving up."
> + next
> end
>
> + check_state(vm, info)
> + end
> +
> + # Now we get a list of all vms that should be on this system and see if
> + # they are all running.
> + Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
> +
> begin
> - vm_ids = conn.list_domains
> + dom = conn.lookup_domain_by_uuid(vm.uuid)
> rescue
> - puts "Failed to request domain list on host " + host.hostname
> - conn.close
> + # OK. We couldn't find the UUID that we thought was there. The only
> + # explanation is that the domain is dead.
> + puts "Failed to find domain " + vm.description
> + kick_taskomatic(Vm::STATE_STOPPED, vm)
> next
> end
> + info = dom.info
> + check_state(vm, info)
>
> - # Here we're going through every vm listed through libvirt. This
> - # really only lets us find ones that are started that shouldn't be.
> - vm_ids.each do |vm_id|
> - puts "VM ID: %d" % [vm_id]
> - begin
> - dom = conn.lookup_domain_by_id(vm_id)
> - rescue
> - puts "Failed to find domain " + vm.description
> - next
> - end
> -
> - vm_uuid = dom.uuid
> - info = dom.info
> -
> - puts "VM UUID: %s" % [vm_uuid]
> - info = dom.info
> - puts info.to_s
> -
> - vm = Vm.find(:first, :conditions => [ "uuid = ?", vm_uuid ])
> - if vm == nil
> - puts "VM Not found in database, must be created by user. giving up."
> - next
> - end
> + conn.close
>
> - check_state(vm, info)
> - end
> + end
> +end
>
> - # Now we get a list of all vms that should be on this system and see if
> - # they are all running.
> - Vm.find(:all, :conditions => [ "host_id = ?", host.id ]).each do |vm|
> -
> - begin
> - dom = conn.lookup_domain_by_uuid(vm.uuid)
> - rescue
> - # OK. We couldn't find the UUID that we thought was there. The only
> - # explanation is that the domain is dead.
> - puts "Failed to find domain " + vm.description
> - kick_taskomatic(Vm::STATE_STOPPED, vm)
> - next
> - end
> - info = dom.info
> - check_state(vm, info)
> +get_credentials
>
> - conn.close
> +loop do
> +
> + # fork() seems to really mess with our db connection. Need to have this
> + # in the main connection as well. I verified it's not leaking connections/fds.
> + database_connect
> + hosts = Host.find(:all)
> +
> + p_count = 0
> + hosts.each do |host|
> +
> + p_count += 1
>
> + # Only allow up to n_hosts / 5 processes running at a time. If we go above this
> + # Then we wait for one to exit before continuing. This guarantees it will take
> + # at most 5 timeouts to check all hosts.
> + if p_count > hosts.length / 5
> + Process.wait
> + p_count -= 1
> end
> +
> + fork do
> + check_status(host)
> + exit 0
> + end
> +
> end
>
> + while p_count > 0
> + Process.wait
> + p_count -= 1
> + end
> +
> +
> STDOUT.flush
> sleep sleeptime
> end
> --
> 1.5.5.1
I wondered about all the puts calls, but Ian tells me they are there to
help with debugging when running non-daemonized, which is fine.
I understand that we need to replace all of this with push-type logic
eventually, but this will do for now.
ACK
--Hugh
More information about the ovirt-devel
mailing list