[Ovirt-devel] oVirt Nodes become unavailable

Василец Дмитрий pronix.service at gmail.com
Fri Jan 30 20:07:24 UTC 2009


This happens because the data in the database is out of date.

Apply this patch and restart db-omatic:

diff --git a/src/db-omatic/db_omatic.rb b/src/db-omatic/db_omatic.rb
index 4afffb1..c499610 100755
--- a/src/db-omatic/db_omatic.rb
+++ b/src/db-omatic/db_omatic.rb
@@ -74,6 +74,64 @@ class DbOmatic < Qpid::Qmf::Console
         domain[:synced] = true
     end

+    #find hostname from values['node'] where values['class_type'] ==
'domain'
+    def get_host_id(abank,bbank)
+     begin
+       @cached_objects.keys.each do |objkey|
+         if @cached_objects[objkey][:agent_bank].to_s == abank and
@cached_objects[objkey][:broker_bank].to_s == bbank and
@cached_objects[objkey][:class_type].to_s == 'node'
+               return Host.find(:first, :conditions => ['hostname = ?', @cached_objects[objkey]["hostname"].to_s]).id
+                   break
+         end
+       end
+      rescue => ex
+       log("error in get_host_id")
+       log(ex)
+     end
+    end
+
+    def set_host(values,digit)
+      begin
+       vm = Vm.find(:first, :conditions => ['description =
?',values["name"].to_s])
+       if vm and digit
+       vm.host_id = digit
+       vm.save!
+       else
+               log("this vm not exist #{values["name"]}")
+       end
+      rescue => ex
+       puts "error when set_host for #{values["name"]}"
+       puts ex
+      end
+    end
+
+    def start_crashed_vm(vm)
+       task = VmTask.new( :user => 'db-omatic', :task_target => vm, :action
=> 'start_vm', :state => 'queued')
+       task.save!
+       log("set task for start crashed vm #{vm.id}")
+    end
+
+    def set_domain_stopped(domain)
+      begin
+        vm = Vm.find(:first, :conditions => ['uuid = ?', domain['uuid']])
+        if vm != nil
+         curstate = vm.state
+          vm.state = Vm::STATE_STOPPED
+          vm.host_id = nil
+          vm.save
+         domain['state'] = 'crashed'  # and now i will use ipmi for reboot unavailable host - simple fencing
+        # if curstate == Vm::STATE_RUNNING and vm.ha # vm.ha true or false
+        #    start_crashed_vm(vm)
+        # end
+        else
+          log('vm == nil ')
+        end
+        log("domain  #{domain['id']} already stopped")
+      rescue => ex
+        log("can\'t set domain #{domain['id']} stopped")
+       log(ex)
+      end
+    end
+
     def update_host_state(host_info, state)
         db_host = Host.find(:first, :conditions => [ "hostname = ?",
host_info['hostname'] ])
         if db_host
@@ -131,6 +189,7 @@ class DbOmatic < Qpid::Qmf::Console

             domain_state_change = false

+        change_node = false
             obj.properties.each do |key, newval|
                 if values[key.to_s] != newval
                     values[key.to_s] = newval
@@ -138,12 +197,30 @@ class DbOmatic < Qpid::Qmf::Console
                     if type == "domain" and key.to_s == "state"
                         domain_state_change = true
                     end
+                   if type == "domain" and key.to_s == "node"
+                       change_node = true
+                   end
+
+
                 end
             end

             if domain_state_change
                 update_domain_state(values)
             end
+       if change_node
+       values.each do |key,val|
+         if key == 'state' and val == 'running'
+               abank = values['node'].to_s.split('-')[3]
+               bbank = values['node'].to_s.split('-')[4]
+               @@host_id = get_host_id(abank,bbank)
+               set_host(values,@@host_id)
+               log("update node data for #{values['name']}")
+               break
+         end
+       end
+       end
+

             if new_object
                 if type == "node"
@@ -187,11 +264,6 @@ class DbOmatic < Qpid::Qmf::Console
         end
     end

-
-    def del_agent(agent)
-        agent_disconnected(agent)
-    end
-
     # This method marks objects associated with the given agent as timed
out/invalid.  Called either
     # when the agent heartbeats out, or we get a del_agent callback.
     def agent_disconnected(agent)
@@ -205,8 +277,10 @@ class DbOmatic < Qpid::Qmf::Console
                     if values[:class_type] == 'node'
                         update_host_state(values, Host::STATE_UNAVAILABLE)
                     elsif values[:class_type] == 'domain'
-                        update_domain_state(values, Vm::STATE_UNREACHABLE)
-                    end
+                       set_domain_stopped(values)
+                       values[:timed_out] = true
+                   @cached_objects.delete(objkey)
+                   end
                 end
             values[:timed_out] = true
             end
@@ -248,6 +322,7 @@ class DbOmatic < Qpid::Qmf::Console
         db_vm = Vm.find(:all)
         db_vm.each do |vm|
             log "Marking vm #{vm.description} as stopped."
+        vm.host_id = nil
             vm.state = Vm::STATE_STOPPED
             vm.save
         end

2009/1/30 Hugh O. Brock <hbrock at redhat.com>

> On Fri, Jan 30, 2009 at 01:24:36PM -0600, Carb, Brian A wrote:
> > date --utc on all nodes and the appliance shows that they are within a
> few seconds of each other.
> >
> > brian carb
> > unisys corporation - malvern, pa
> > brian.carb at unisys.com
> >
> > -----Original Message-----
> > From: Perry Myers [mailto:pmyers at redhat.com]
> > Sent: Friday, January 30, 2009 2:19 PM
> > To: Carb, Brian A
> > Cc: ovirt-devel at redhat.com; Ian Main
> > Subject: Re: [Ovirt-devel] oVirt Nodes become unavailable
> >
> > Carb, Brian A wrote:
> > > running oVirt 0.96 on fedora10...
> > >
> > > I noticed that after some time has elapsed (machines sitting idle
> overnight), the oVirt dashboard shows 2 of my 4 nodes as
> "unavailable(enabled)". I can access them via their consoles though, so the
> machines are up. Restarting the browser session does not change this. If I
> shutdown and then restart the oVirt server appliance, all 4 show as
> unavailable. Do I have to do something to make the nodes available (short of
> restarting them)?
> > >
> > > Any ideas? Thanks.
> >
> > Really quick thing.... check timestamps on each Node and the appliance
> (date --utc)
> >
> > If they're not within a few seconds of each other then kerberos gets
> unhappy...  If that is the case then our NTP setup is not working which
> needs to be fixed.
> >
> > If they are in sync then Ian should be able to help
> >
> > Perry
>
> Hmm... sounds like it could be a qpid or host-browser problem. Ian?
>
> --Hugh
>
> _______________________________________________
> Ovirt-devel mailing list
> Ovirt-devel at redhat.com
> https://www.redhat.com/mailman/listinfo/ovirt-devel
>



-- 
С уважением, Дмитрий.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://listman.redhat.com/archives/ovirt-devel/attachments/20090130/db45980c/attachment.htm>


More information about the ovirt-devel mailing list