[Ovirt-devel] [PATCH] Tested, working implementation of migration
Ian Main
imain at redhat.com
Fri Aug 15 05:08:20 UTC 2008
From: Chris Lalancette <clalance at redhat.com>
I updated this patch to fix a few bugs that I found and to get rid of
whitespace errors, so I'm reposting. - Ian
An actually working, tested cut of the migration code for taskomatic. It
supports 3 modes:
a) ClearHost - this clears all of the VMs off of a particular host, the idea
being that this host will probably be taken down for maintenance.
b) Migrate VM to a particular host - this migrates a particular VM from where
it currently resides to a destination specified by the admin. The idea here
is that the admin might want to override the automatic placement decision of
taskomatic.
c) Migrate VM anywhere else - this migrates a particular VM from where it
currently resides to anywhere else. The idea here is for a step-by-step
clearing of a host (similar to ClearHost), or to reduce load on a particular
host by moving a VM somewhere else.
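Roughly, the three modes map onto the helpers the patch introduces below
(a sketch only; the calls are illustrative and the host id is made up):

    clear_vms_host(task)    # a) task.host_id names the host to clear
    migrate(vm, "3")        # b) task.args carries the destination host id
    migrate(vm, "")         # c) empty args: findHostSLA picks another host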
Signed-off-by: Chris Lalancette <clalance at redhat.com>
---
wui/src/task-omatic/task_host.rb | 33 +++++++
wui/src/task-omatic/task_vm.rb | 190 ++++++++++++++++++-------------------
wui/src/task-omatic/taskomatic.rb | 13 ++-
wui/src/task-omatic/utils.rb | 117 ++++++++++++++++++++++-
4 files changed, 246 insertions(+), 107 deletions(-)
create mode 100644 wui/src/task-omatic/task_host.rb
diff --git a/wui/src/task-omatic/task_host.rb b/wui/src/task-omatic/task_host.rb
new file mode 100644
index 0000000..5f42da3
--- /dev/null
+++ b/wui/src/task-omatic/task_host.rb
@@ -0,0 +1,33 @@
+# Copyright (C) 2008 Red Hat, Inc.
+# Written by Chris Lalancette <clalance at redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301, USA. A copy of the GNU General Public License is
+# also available at http://www.gnu.org/copyleft/gpl.html.
+
+require 'utils'
+
+# FIXME: a little ugly to be including all of task_vm here, but
+# utils really isn't the right place for the migrate() method
+require 'task_vm'
+
+def clear_vms_host(task)
+ puts "clear_vms_host"
+
+ src_host = findHost(task.host_id)
+
+ src_host.vms.each do |vm|
+ migrate(vm)
+ end
+end
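One caveat with the loop above: if a single VM has nowhere to go, the
exception raised by migrate() aborts the whole clear and the remaining VMs
stay put. A more forgiving variant might look like this (a sketch, not part
of the patch):

    src_host.vms.each do |vm|
      begin
        migrate(vm)
      rescue => ex
        puts "Could not migrate " + vm.description + ": " + ex.to_s
      end
    end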
diff --git a/wui/src/task-omatic/task_vm.rb b/wui/src/task-omatic/task_vm.rb
index d7f0869..ed4d366 100644
--- a/wui/src/task-omatic/task_vm.rb
+++ b/wui/src/task-omatic/task_vm.rb
@@ -128,17 +128,6 @@ def findVM(task, fail_on_nil_host_id = true)
return vm
end
-def findHost(task, host_id)
- host = Host.find(:first, :conditions => [ "id = ?", host_id])
-
- if host == nil
- # Hm, we didn't find the host_id. Seems odd. Return a failure
- raise "Could not find host_id " + host_id
- end
-
- return host
-end
-
def setVmShutdown(vm)
vm.host_id = nil
vm.memory_used = nil
@@ -189,7 +178,7 @@ def shutdown_vm(task)
begin
# OK, now that we found the VM, go looking in the hosts table
- host = findHost(task, vm.host_id)
+ host = findHost(vm.host_id)
conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
dom = conn.lookup_domain_by_uuid(vm.uuid)
@@ -200,10 +189,21 @@ def shutdown_vm(task)
# of problems. Needs more thought
#dom.shutdown
dom.destroy
- dom.undefine
+
+ begin
+ dom.undefine
+ rescue
+ # undefine can fail, for instance, if we live migrated from A -> B, and
+ # then we are shutting down the VM on B (because it only has "transient"
+ # XML). Therefore, just ignore undefine errors so we can do the rest
+ # FIXME: we really should have a marker in the database somehow so that
+ # we can tell if this domain was migrated; that way, we can tell the
+ # difference between a real undefine failure and one because of migration
+ end
+
+ teardown_storage_pools(conn)
+
conn.close
- # FIXME: hm. We probably want to undefine the storage pool that this host
- # was using if and only if it's not in use by another VM.
rescue => ex
setVmState(vm, vm_orig_state)
raise ex
@@ -229,7 +229,6 @@ def start_vm(task)
end
# FIXME: Validate that the VM is still within quota
- #vm.validate
vm_orig_state = vm.state
setVmState(vm, Vm::STATE_STARTING)
@@ -246,93 +245,17 @@ def start_vm(task)
# OK, now that we found the VM, go looking in the hardware_pool
# hosts to see if there is a host that will fit these constraints
- host = nil
-
- vm.vm_resource_pool.get_hardware_pool.hosts.each do |host|
- if host.num_cpus >= vm.num_vcpus_allocated \
- and host.memory >= vm.memory_allocated \
- and not host.is_disabled
- host = curr
- break
- end
- end
-
- if host == nil
- # we couldn't find a host that matches this description; report ERROR
- raise "No host matching VM parameters could be found"
- end
+ host = findHostSLA(vm)
conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
- # here, build up a list of already defined pools. We'll use it
- # later to see if we need to define new pools for the storage or just
- # keep using existing ones
-
- defined_pools = []
- all_storage_pools(conn).each do |remote_pool_name|
- tmppool = conn.lookup_storage_pool_by_name(remote_pool_name)
- defined_pools << tmppool
- end
-
- storagedevs = []
- vm.storage_volumes.each do |volume|
- # here, we need to iterate through each volume and possibly attach it
- # to the host we are going to be using
- storage_pool = volume.storage_pool
-
- if storage_pool == nil
- # Hum. Specified by the VM description, but not in the storage pool?
- # continue on and hope for the best
- next
- end
-
- if storage_pool[:type] == "IscsiStoragePool"
- thisstorage = Iscsi.new(storage_pool.ip_addr, storage_pool[:target])
- elsif storage_pool[:type] == "NfsStoragePool"
- thisstorage = NFS.new(storage_pool.ip_addr, storage_pool.export_path)
- else
- # Hm, a storage type we don't understand; skip it
- next
- end
-
- thepool = nil
- defined_pools.each do |pool|
- doc = Document.new(pool.xml_desc(0))
- root = doc.root
-
- if thisstorage.xmlequal?(doc.root)
- thepool = pool
- break
- end
- end
-
- if thepool == nil
- thepool = conn.define_storage_pool_xml(thisstorage.getxml, 0)
- thepool.build(0)
- thepool.create(0)
- elsif thepool.info.state == Libvirt::StoragePool::INACTIVE
- # only try to start the pool if it is currently inactive; in all other
- # states, assume it is already running
- thepool.create(0)
- end
-
- storagedevs << thepool.lookup_volume_by_name(volume.read_attribute(thisstorage.db_column)).path
- end
-
- conn.close
-
- if storagedevs.length > 4
- raise "Too many storage volumes; maximum is 4"
- end
-
- # OK, we found a host that will work; now let's build up the XML
+ storagedevs = connect_storage_pools(conn, vm)
# FIXME: get rid of the hardcoded bridge
xml = create_vm_xml(vm.description, vm.uuid, vm.memory_allocated,
vm.memory_used, vm.num_vcpus_allocated, vm.boot_device,
vm.vnic_mac_addr, "ovirtbr0", storagedevs)
- conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
dom = conn.define_domain_xml(xml.to_s)
dom.create
@@ -373,7 +296,7 @@ def save_vm(task)
begin
# OK, now that we found the VM, go looking in the hosts table
- host = findHost(task, vm.host_id)
+ host = findHost(vm.host_id)
conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
dom = conn.lookup_domain_by_uuid(vm.uuid)
@@ -416,7 +339,7 @@ def restore_vm(task)
begin
# OK, now that we found the VM, go looking in the hosts table
- host = findHost(task, vm.host_id)
+ host = findHost(vm.host_id)
# FIXME: we should probably go out to the host and check what it thinks
# the state is
@@ -458,7 +381,7 @@ def suspend_vm(task)
begin
# OK, now that we found the VM, go looking in the hosts table
- host = findHost(task, vm.host_id)
+ host = findHost(vm.host_id)
conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
dom = conn.lookup_domain_by_uuid(vm.uuid)
@@ -498,7 +421,7 @@ def resume_vm(task)
begin
# OK, now that we found the VM, go looking in the hosts table
- host = findHost(task, vm.host_id)
+ host = findHost(vm.host_id)
conn = Libvirt::open("qemu+tcp://" + host.hostname + "/system")
dom = conn.lookup_domain_by_uuid(vm.uuid)
@@ -534,3 +457,74 @@ def update_state_vm(task)
puts "Updated state to " + task.args
end
end
+
+def migrate(vm, dest = nil)
+ if vm.state == Vm::STATE_STOPPED
+ raise "Cannot migrate stopped domain"
+ elsif vm.state == Vm::STATE_SUSPENDED
+ raise "Cannot migrate suspended domain"
+ elsif vm.state == Vm::STATE_SAVED
+ raise "Cannot migrate saved domain"
+ end
+
+ vm_orig_state = vm.state
+ setVmState(vm, Vm::STATE_MIGRATING)
+
+ begin
+ src_host = findHost(vm.host_id)
+ if dest and not dest.empty?
+ if dest.to_i == vm.host_id
+ raise "Cannot migrate from host " + src_host.hostname + " to itself!"
+ end
+ dst_host = findHost(dest.to_i)
+ else
+ dst_host = findHostSLA(vm)
+ end
+
+ src_conn = Libvirt::open("qemu+tcp://" + src_host.hostname + "/system")
+ dst_conn = Libvirt::open("qemu+tcp://" + dst_host.hostname + "/system")
+
+ connect_storage_pools(dst_conn, vm)
+
+ dom = src_conn.lookup_domain_by_uuid(vm.uuid)
+ dom.migrate(dst_conn, Libvirt::Domain::MIGRATE_LIVE)
+
+ # if we didn't raise an exception, then the migration was successful. We
+ # still have a pointer to the now-shutdown domain on the source side, so
+ # undefine it
+ begin
+ dom.undefine
+ rescue
+ # undefine can fail, for instance, if we live migrated from A -> B, and
+ # then we are shutting down the VM on B (because it only has "transient"
+ # XML). Therefore, just ignore undefine errors so we can do the rest
+ # FIXME: we really should have a marker in the database somehow so that
+ # we can tell if this domain was migrated; that way, we can tell the
+ # difference between a real undefine failure and one because of migration
+ end
+
+ teardown_storage_pools(src_conn)
+ dst_conn.close
+ src_conn.close
+ rescue => ex
+ # FIXME: ug. We may have open connections that we need to close; not
+ # sure how to handle that
+ setVmState(vm, vm_orig_state)
+ raise ex
+ end
+
+ setVmState(vm, Vm::STATE_RUNNING)
+ vm.host_id = dst_host.id
+ vm.save
+end
+
+def migrate_vm(task)
+ puts "migrate_vm"
+
+ # here, we are given an id for a VM to migrate; we have to lookup which
+ # physical host it is running on
+
+ vm = findVM(task)
+
+ migrate(vm, task.args)
+end
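As for the connection-cleanup FIXME in migrate() above, one possible shape
is an ensure block, so both libvirt connections get closed no matter where
the failure happens (a sketch against the same ruby-libvirt calls the patch
already uses, not a replacement hunk):

    src_conn = dst_conn = nil
    begin
      src_conn = Libvirt::open("qemu+tcp://" + src_host.hostname + "/system")
      dst_conn = Libvirt::open("qemu+tcp://" + dst_host.hostname + "/system")
      # ... connect_storage_pools, dom.migrate, dom.undefine, teardown ...
    rescue => ex
      setVmState(vm, vm_orig_state)
      raise ex
    ensure
      # closing an already-dead connection can itself raise; ignore that
      begin; dst_conn.close if dst_conn; rescue; end
      begin; src_conn.close if src_conn; rescue; end
    end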
diff --git a/wui/src/task-omatic/taskomatic.rb b/wui/src/task-omatic/taskomatic.rb
index bb70247..7d6e950 100755
--- a/wui/src/task-omatic/taskomatic.rb
+++ b/wui/src/task-omatic/taskomatic.rb
@@ -63,9 +63,9 @@ end
require 'task_vm'
require 'task_storage'
+require 'task_host'
loop do
- first = true
tasks = Array.new
begin
tasks = Task.find(:all, :conditions => [ "state = ?", Task::STATE_QUEUED ])
@@ -86,11 +86,10 @@ loop do
end
end
tasks.each do |task|
- if first
- # make sure we get our credentials up-front
- get_credentials
- first = false
- end
+ # make sure we get our credentials up-front
+ get_credentials
+
+ task.time_started = Time.now
state = Task::STATE_FINISHED
begin
@@ -103,7 +102,9 @@ loop do
when VmTask::ACTION_SAVE_VM then save_vm(task)
when VmTask::ACTION_RESTORE_VM then restore_vm(task)
when VmTask::ACTION_UPDATE_STATE_VM then update_state_vm(task)
+ when VmTask::ACTION_MIGRATE_VM then migrate_vm(task)
when StorageTask::ACTION_REFRESH_POOL then refresh_pool(task)
+ when HostTask::ACTION_CLEAR_VMS then clear_vms_host(task)
else
puts "unknown task " + task.action
state = Task::STATE_FAILED
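For reference, the dispatcher expects the WUI to queue rows that the loop
above reads via task.action, task.args, and task.state. A hypothetical
enqueue for the new migrate action (vm_id is an assumed column; the real
schema may differ):

    task = VmTask.new
    task.action = VmTask::ACTION_MIGRATE_VM
    task.vm_id = vm.id             # assumed; findVM(task) resolves the VM
    task.args = "3"                # destination host id, "" for anywhere
    task.state = Task::STATE_QUEUED
    task.save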
diff --git a/wui/src/task-omatic/utils.rb b/wui/src/task-omatic/utils.rb
index e6401dc..9e60122 100644
--- a/wui/src/task-omatic/utils.rb
+++ b/wui/src/task-omatic/utils.rb
@@ -1,7 +1,39 @@
require 'rexml/document'
include REXML
-require 'models/task'
+def findHostSLA(vm)
+ host = nil
+
+ vm.vm_resource_pool.get_hardware_pool.hosts.each do |curr|
+ # FIXME: we probably need to add some notion of "load" to this check
+ if curr.num_cpus >= vm.num_vcpus_allocated \
+ and curr.memory >= vm.memory_allocated \
+ and not curr.is_disabled.nil? and curr.is_disabled == 0 \
+ and curr.state == Host::STATE_AVAILABLE \
+ and vm.host_id != curr.id
+ host = curr
+ break
+ end
+ end
+
+ if host == nil
+ # we couldn't find a host that matches these criteria
+ raise "No host matching VM parameters could be found"
+ end
+
+ return host
+end
+
+def findHost(host_id)
+ host = Host.find(:first, :conditions => [ "id = ?", host_id])
+
+ if host == nil
+ # Hm, we didn't find the host_id. Seems odd. Return a failure
+ raise "Could not find host_id " + host_id.to_s
+ end
+
+ return host
+end
def String.random_alphanumeric(size=16)
s = ""
@@ -10,12 +42,91 @@ def String.random_alphanumeric(size=16)
end
def all_storage_pools(conn)
- all_pools = []
- all_pools.concat(conn.list_defined_storage_pools)
+ all_pools = conn.list_defined_storage_pools
all_pools.concat(conn.list_storage_pools)
return all_pools
end
+def teardown_storage_pools(conn)
+ # FIXME: this needs to get a *lot* smarter. In particular, we want to make
+ # sure we can tear down unused pools even when there are other guests running
+ if conn.list_domains.empty?
+ # OK, there are no running guests on this host anymore. We can tear down
+ # any storage pools that are there without fear
+ all_storage_pools(conn).each do |remote_pool_name|
+ begin
+ pool = conn.lookup_storage_pool_by_name(remote_pool_name)
+ pool.destroy
+ pool.undefine
+ rescue
+ # do nothing if any of this failed; the worst that happens is that
+ # we leave a pool configured
+ puts "Could not teardown pool " + remote_pool_name + "; skipping"
+ end
+ end
+ end
+end
+
+def connect_storage_pools(conn, vm)
+ # here, build up a list of already defined pools. We'll use it
+ # later to see if we need to define new pools for the storage or just
+ # keep using existing ones
+
+ defined_pools = []
+ all_storage_pools(conn).each do |remote_pool_name|
+ defined_pools << conn.lookup_storage_pool_by_name(remote_pool_name)
+ end
+
+ storagedevs = []
+ vm.storage_volumes.each do |volume|
+ # here, we need to iterate through each volume and possibly attach it
+ # to the host we are going to be using
+ storage_pool = volume.storage_pool
+
+ if storage_pool == nil
+ # Hum. Specified by the VM description, but not in the storage pool?
+ # continue on and hope for the best
+ # FIXME: probably want a print to the logs here
+ next
+ end
+
+ if storage_pool[:type] == "IscsiStoragePool"
+ thisstorage = Iscsi.new(storage_pool.ip_addr, storage_pool[:target])
+ elsif storage_pool[:type] == "NfsStoragePool"
+ thisstorage = NFS.new(storage_pool.ip_addr, storage_pool.export_path)
+ else
+ # Hm, a storage type we don't understand; skip it
+ puts "Storage type " + storage_pool[:type] + " is not understood; skipping"
+ next
+ end
+
+ thepool = nil
+ defined_pools.each do |pool|
+ doc = Document.new(pool.xml_desc)
+ root = doc.root
+
+ if thisstorage.xmlequal?(doc.root)
+ thepool = pool
+ break
+ end
+ end
+
+ if thepool == nil
+ thepool = conn.define_storage_pool_xml(thisstorage.getxml)
+ thepool.build
+ thepool.create
+ elsif thepool.info.state == Libvirt::StoragePool::INACTIVE
+ # only try to start the pool if it is currently inactive; in all other
+ # states, assume it is already running
+ thepool.create
+ end
+
+ storagedevs << thepool.lookup_volume_by_name(volume.read_attribute(thisstorage.db_column)).path
+ end
+
+ return storagedevs
+end
+
class StorageType
attr_reader :db_column
--
1.5.5.1
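Finally, the "marker in the database" FIXME that appears twice above could
be as small as one boolean column on the vms table; a sketch (the migrated
column is hypothetical, not part of this patch): set it once a live
migration succeeds, and consult it when undefine fails:

    vm.migrated = true   # hypothetical column, set after dom.migrate succeeds
    vm.save
    # ...later, in shutdown_vm...
    begin
      dom.undefine
    rescue => ex
      # only transient (migrated) domains are expected to fail undefine
      raise ex unless vm.migrated
    end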