pronix.service at gmail.com
2009-Feb-03 16:45 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
From: root <root at voip.adenin.ru> --- src/app/controllers/vm_controller.rb | 13 +++- src/app/views/vm/show.rhtml | 16 ++++ src/db-omatic/db_omatic.rb | 126 ++++++++++++++++++++++++++++++--- src/db/migrate/006_create_vms.rb | 1 + 4 files changed, 143 insertions(+), 13 deletions(-) diff --git a/src/app/controllers/vm_controller.rb b/src/app/controllers/vm_controller.rb index 56501fd..0f43680 100644 --- a/src/app/controllers/vm_controller.rb +++ b/src/app/controllers/vm_controller.rb @@ -20,11 +20,22 @@ require 'socket' class VmController < ApplicationController # GETs should be safe (see http://www.w3.org/2001/tag/doc/whenToUseGet.html) - verify :method => :post, :only => [ :destroy, :create, :update ], + verify :method => :post, :only => [ :destroy, :create, :update , :change_ha_vm], :redirect_to => { :controller => 'dashboard' } before_filter :pre_vm_action, :only => [:vm_action, :cancel_queued_tasks, :console] + def change_ha_vm + vm = Vm.find_by_id(params[:id]) + if vm.ha + then vm.ha = false + else vm.ha = true + end + vm.save! + alert = "vm was change ha successfully " + render :json => { :object => "vm", :success => true, :alert => alert } + end + def index roles = "('" + Permission::roles_for_privilege(Permission::PRIV_VIEW).join("', '") + diff --git a/src/app/views/vm/show.rhtml b/src/app/views/vm/show.rhtml index f361131..b0479a6 100644 --- a/src/app/views/vm/show.rhtml +++ b/src/app/views/vm/show.rhtml @@ -29,6 +29,9 @@ </a> <% end -%> <% end %> + <a href="#" onClick="change_ha_vm()" rel="facebox[.bolder]"> + <%= image_tag "icon_x.png" %> <% if @vm.ha %> disable_ha (cur. enabled)<% else %> enable_ha (cur. disabled) <% end %> + </a> <a href="#confirm_cancel" rel="facebox[.bolder]"> <%= image_tag "icon_x.png" %> Cancel queued tasks </a> @@ -40,6 +43,19 @@ <%= confirmation_dialog("confirm_cancel", "Are you sure?", "cancel_queued_tasks()") %> <%= confirmation_dialog("confirm_delete", "Are you sure?", "delete_vm()") %> <script type="text/javascript"> + function change_ha_vm() + { + $(document).trigger('close.facebox'); + $.post('<%= url_for :controller => "vm", :action => "change_ha_vm", :id => @vm %>', + {x: 1}, + function(data,status){ + $("#vms_grid").flexReload(); + if (data.alert) { + $.jGrowl(data.alert); + } + empty_summary('vms_selection', 'Virtual Machine'); + }, 'json'); + } function cancel_queued_tasks() { $(document).trigger('close.facebox'); diff --git a/src/db-omatic/db_omatic.rb b/src/db-omatic/db_omatic.rb index 4afffb1..94d93fc 100755 --- a/src/db-omatic/db_omatic.rb +++ b/src/db-omatic/db_omatic.rb @@ -52,6 +52,8 @@ class DbOmatic < Qpid::Qmf::Console state = Vm::STATE_PENDING when "running" state = Vm::STATE_RUNNING + # must find host for current vm and set host_id + # domain.broker when "blocked" state = Vm::STATE_SUSPENDED #? when "paused" @@ -74,6 +76,91 @@ class DbOmatic < Qpid::Qmf::Console domain[:synced] = true end + #find hostname from values['node'] where values['class_type'] == 'domain' + def get_host_id(abank,bbank) #???????????????????? ???????????????? ???? ???? ?????? ???????? - ???????????????? ?????????????????????????? ???????? ???????????????????? + begin + @cached_objects.keys.each do |objkey| + if @cached_objects[objkey][:agent_bank].to_s == abank and @cached_objects[objkey][:broker_bank].to_s == bbank and @cached_objects[objkey][:class_type].to_s == 'node' + return Host.find(:first, :conditions => ['hostname = ?', at cached_objects[objkey]["hostname"].to_s]).id + break + end + end + rescue => ex + log("error in get_host_id") + log(ex) + end + end + + def set_host(values,digit) + begin + vm = Vm.find(:first, :conditions => ['description = ?',values["name"].to_s]) + if vm and digit + vm.host_id = digit + vm.save! + else + log("this vm not exist #{values["name"]}") + end + rescue => ex + puts "error when set_host for #{values["name"]}" + puts ex + end + end + + def start_crashed_vm(vm) + task = VmTask.new( :user => 'db-omatic', :task_target => vm, :action => 'start_vm', :state => 'queued') + task.save! + log("set task for start crashed vm #{vm.id}") + end + + def fencing_by_ssh_reboot(hname) + begin + # Fix need - very insecure, and need manual configure keys, may be kerberos ticket is better ? + Process.fork do + `ssh #{hname} -i /root/.ssh/id_rsa -o StrictHostKeyChecking=no "/sbin/init 6" & ` + end + # also can use ipmi - `ipmitool -I lan -H #{"ipmi."+hname} -U admin -P password chassis power reset ` + log("success fencing#{hname}") + # execute in shell, must be identify key for login as root to storage server + # after restart crashed host need manualy remove this rule from iptables + + return true + rescue => ex + log("error in fencing #{hname}") + log(ex) + end + end + + def set_domain_stopped(domain) + begin + vm = Vm.find(:first, :conditions => ['uuid = ?', domain['uuid']]) + if vm != nil + + if vm.state == Vm::STATE_RUNNING and vm.ha + fencing_and_restart = true + fencing_by_ssh_reboot(vm.host.hostname) + end + puts fencing_and_restart + vm.state = Vm::STATE_STOPPED + vm.host_id = nil + vm.save! + domain['state'] = 'crashed' + if fencing_and_restart + log("must restart vm ") + start_crashed_vm(vm) + else + log("not running and not restart") + end + else + log('vm == nil ') + end + log("domain #{domain['id']} already stopped") + rescue => ex + log("can\'t set domain #{domain['id']} stopped") + log(ex) + end + end + + def update_host_state(host_info, state) db_host = Host.find(:first, :conditions => [ "hostname = ?", host_info['hostname'] ]) if db_host @@ -116,7 +203,6 @@ class DbOmatic < Qpid::Qmf::Console if values == nil values = {} @cached_objects[obj.object_id.to_s] = values - # Save the agent and broker bank so that we can tell what objects # are expired when the heartbeat for them stops. values[:broker_bank] = obj.object_id.broker_bank @@ -130,20 +216,38 @@ class DbOmatic < Qpid::Qmf::Console end domain_state_change = false - + change_node = false obj.properties.each do |key, newval| if values[key.to_s] != newval values[key.to_s] = newval #log "new value for property #{key} : #{newval}" if type == "domain" and key.to_s == "state" - domain_state_change = true + domain_state_change = true end + if type == "domain" and key.to_s == "node" + log("DEBUG change node = true") + change_node = true + end + end end if domain_state_change + log("DEBUG update_domain_state") update_domain_state(values) end + if change_node + values.each do |key,val| + if key == 'state' and val == 'running' + abank = values['node'].to_s.split('-')[3] + bbank = values['node'].to_s.split('-')[4] + @@host_id = get_host_id(abank,bbank) + set_host(values,@@host_id) + log("update node data for #{values['name']}") + break + end + end + end if new_object if type == "node" @@ -187,11 +291,6 @@ class DbOmatic < Qpid::Qmf::Console end end - - def del_agent(agent) - agent_disconnected(agent) - end - # This method marks objects associated with the given agent as timed out/invalid. Called either # when the agent heartbeats out, or we get a del_agent callback. def agent_disconnected(agent) @@ -204,11 +303,14 @@ class DbOmatic < Qpid::Qmf::Console if values[:timed_out] == false if values[:class_type] == 'node' update_host_state(values, Host::STATE_UNAVAILABLE) + values[:timed_out] = true elsif values[:class_type] == 'domain' - update_domain_state(values, Vm::STATE_UNREACHABLE) - end + log("set_domain_stopped") + set_domain_stopped(values) + values[:timed_out] = true + @cached_objects.delete(objkey) + end end - values[:timed_out] = true end end end @@ -216,7 +318,6 @@ class DbOmatic < Qpid::Qmf::Console # The opposite of above, this is called when an agent is alive and well and makes sure # all of the objects associated with it are marked as valid. def agent_connected(agent) - @cached_objects.keys.each do |objkey| if @cached_objects[objkey][:broker_bank] == agent.broker.broker_bank and @cached_objects[objkey][:agent_bank] == agent.agent_bank @@ -248,6 +349,7 @@ class DbOmatic < Qpid::Qmf::Console db_vm = Vm.find(:all) db_vm.each do |vm| log "Marking vm #{vm.description} as stopped." + vm.host_id = nil vm.state = Vm::STATE_STOPPED vm.save end diff --git a/src/db/migrate/006_create_vms.rb b/src/db/migrate/006_create_vms.rb index 74794b6..a5460f0 100644 --- a/src/db/migrate/006_create_vms.rb +++ b/src/db/migrate/006_create_vms.rb @@ -34,6 +34,7 @@ class CreateVms < ActiveRecord::Migration t.string :boot_device, :null => false t.integer :vnc_port t.integer :lock_version, :default => 0 + t.boolean :ha , :default => true end execute "alter table vms add constraint fk_vms_hosts foreign key (host_id) references hosts(id)" -- 1.5.5.6
Ian Main
2009-Feb-04 19:46 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
On Tue, 3 Feb 2009 19:45:01 +0300 pronix.service at gmail.com wrote:> From: root <root at voip.adenin.ru>[snip] Thanks for reposting! I just wanted you to know that I'm madly working on LVM support right now and will review your patch as soon as I can. Thanks again, Ian
Ian Main
2009-Feb-17 22:19 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
On Tue, 3 Feb 2009 19:45:01 +0300 pronix.service at gmail.com wrote: OK, sorry it took me so long to finally review this properly! It applied! WOOT! I get trailing whitespace warnings but that's ok :) A few small things.. I notice you are using tabs to indent. I think you have your editor set to 4 space tabs? In ruby we always use spaces (and most projects I know use spaces now). Usually it's 2 spaces but in this one it's 4 :) In my editor which is set to 8 space tabs it looks to be indented wrong FYI. I think the whole fencing with ssh might be a bit beyond what we are currently trying to do and I'm not sure it's the right way to do this in a larger scale. I do believe there are plans to use a more comprehensive project for that in near future. Also a better way I would think would be to use libvirt calls via qpid to ensure VMs are destroyed? I'm not sure I understand some of the logic in here either. Would you be willing to explain some of the dbomatic changes? I also can't understand the comments in Russian :). I think for consistency it would be good to have all comments in english.. most of our developers read/speak english. I notice also that you removed del_agent callback. While I think there is a bug there which causes the node to be unavailable if you restart libvirt-qpid, I don't know if this is the right way to fix it. I'm going to look at that shortly. Thanks again for the patch! Ian> From: root <root at voip.adenin.ru> > > --- > src/app/controllers/vm_controller.rb | 13 +++- > src/app/views/vm/show.rhtml | 16 ++++ > src/db-omatic/db_omatic.rb | 126 ++++++++++++++++++++++++++++++--- > src/db/migrate/006_create_vms.rb | 1 + > 4 files changed, 143 insertions(+), 13 deletions(-) > > diff --git a/src/app/controllers/vm_controller.rb b/src/app/controllers/vm_controller.rb > index 56501fd..0f43680 100644 > --- a/src/app/controllers/vm_controller.rb > +++ b/src/app/controllers/vm_controller.rb > @@ -20,11 +20,22 @@ require 'socket' > > class VmController < ApplicationController > # GETs should be safe (see http://www.w3.org/2001/tag/doc/whenToUseGet.html) > - verify :method => :post, :only => [ :destroy, :create, :update ], > + verify :method => :post, :only => [ :destroy, :create, :update , :change_ha_vm], > :redirect_to => { :controller => 'dashboard' } > > before_filter :pre_vm_action, :only => [:vm_action, :cancel_queued_tasks, :console] > > + def change_ha_vm > + vm = Vm.find_by_id(params[:id]) > + if vm.ha > + then vm.ha = false > + else vm.ha = true > + end > + vm.save! > + alert = "vm was change ha successfully " > + render :json => { :object => "vm", :success => true, :alert => alert } > + end > + > def index > roles = "('" + > Permission::roles_for_privilege(Permission::PRIV_VIEW).join("', '") + > diff --git a/src/app/views/vm/show.rhtml b/src/app/views/vm/show.rhtml > index f361131..b0479a6 100644 > --- a/src/app/views/vm/show.rhtml > +++ b/src/app/views/vm/show.rhtml > @@ -29,6 +29,9 @@ > </a> > <% end -%> > <% end %> > + <a href="#" onClick="change_ha_vm()" rel="facebox[.bolder]"> > + <%= image_tag "icon_x.png" %> <% if @vm.ha %> disable_ha (cur. enabled)<% else %> enable_ha (cur. disabled) <% end %> > + </a> > <a href="#confirm_cancel" rel="facebox[.bolder]"> > <%= image_tag "icon_x.png" %> Cancel queued tasks > </a> > @@ -40,6 +43,19 @@ > <%= confirmation_dialog("confirm_cancel", "Are you sure?", "cancel_queued_tasks()") %> > <%= confirmation_dialog("confirm_delete", "Are you sure?", "delete_vm()") %> > <script type="text/javascript"> > + function change_ha_vm() > + { > + $(document).trigger('close.facebox'); > + $.post('<%= url_for :controller => "vm", :action => "change_ha_vm", :id => @vm %>', > + {x: 1}, > + function(data,status){ > + $("#vms_grid").flexReload(); > + if (data.alert) { > + $.jGrowl(data.alert); > + } > + empty_summary('vms_selection', 'Virtual Machine'); > + }, 'json'); > + } > function cancel_queued_tasks() > { > $(document).trigger('close.facebox'); > diff --git a/src/db-omatic/db_omatic.rb b/src/db-omatic/db_omatic.rb > index 4afffb1..94d93fc 100755 > --- a/src/db-omatic/db_omatic.rb > +++ b/src/db-omatic/db_omatic.rb > @@ -52,6 +52,8 @@ class DbOmatic < Qpid::Qmf::Console > state = Vm::STATE_PENDING > when "running" > state = Vm::STATE_RUNNING > + # must find host for current vm and set host_id > + # domain.broker > when "blocked" > state = Vm::STATE_SUSPENDED #? > when "paused" > @@ -74,6 +76,91 @@ class DbOmatic < Qpid::Qmf::Console > domain[:synced] = true > end > > + #find hostname from values['node'] where values['class_type'] == 'domain' > + def get_host_id(abank,bbank) #???????????????????? ???????????????? ???? ???? ?????? ???????? - ???????????????? ?????????????????????????? ???????? ???????????????????? > + begin > + @cached_objects.keys.each do |objkey| > + if @cached_objects[objkey][:agent_bank].to_s == abank and @cached_objects[objkey][:broker_bank].to_s == bbank and @cached_objects[objkey][:class_type].to_s == 'node' > + return Host.find(:first, :conditions => ['hostname = ?', at cached_objects[objkey]["hostname"].to_s]).id > + break > + end > + end > + rescue => ex > + log("error in get_host_id") > + log(ex) > + end > + end > + > + def set_host(values,digit) > + begin > + vm = Vm.find(:first, :conditions => ['description = ?',values["name"].to_s]) > + if vm and digit > + vm.host_id = digit > + vm.save! > + else > + log("this vm not exist #{values["name"]}") > + end > + rescue => ex > + puts "error when set_host for #{values["name"]}" > + puts ex > + end > + end > + > + def start_crashed_vm(vm) > + task = VmTask.new( :user => 'db-omatic', :task_target => vm, :action => 'start_vm', :state => 'queued') > + task.save! > + log("set task for start crashed vm #{vm.id}") > + end > + > + def fencing_by_ssh_reboot(hname) > + begin > + # Fix need - very insecure, and need manual configure keys, may be kerberos ticket is better ? > + Process.fork do > + `ssh #{hname} -i /root/.ssh/id_rsa -o StrictHostKeyChecking=no "/sbin/init 6" & ` > + end > + # also can use ipmi - `ipmitool -I lan -H #{"ipmi."+hname} -U admin -P password chassis power reset ` > + log("success fencing#{hname}") > + # execute in shell, must be identify key for login as root to storage server > + # after restart crashed host need manualy remove this rule from iptables > + > + return true > + rescue => ex > + log("error in fencing #{hname}") > + log(ex) > + end > + end > + > + def set_domain_stopped(domain) > + begin > + vm = Vm.find(:first, :conditions => ['uuid = ?', domain['uuid']]) > + if vm != nil > + > + if vm.state == Vm::STATE_RUNNING and vm.ha > + fencing_and_restart = true > + fencing_by_ssh_reboot(vm.host.hostname) > + end > + puts fencing_and_restart > + vm.state = Vm::STATE_STOPPED > + vm.host_id = nil > + vm.save! > + domain['state'] = 'crashed' > + if fencing_and_restart > + log("must restart vm ") > + start_crashed_vm(vm) > + else > + log("not running and not restart") > + end > + else > + log('vm == nil ') > + end > + log("domain #{domain['id']} already stopped") > + rescue => ex > + log("can\'t set domain #{domain['id']} stopped") > + log(ex) > + end > + end > + > + > def update_host_state(host_info, state) > db_host = Host.find(:first, :conditions => [ "hostname = ?", host_info['hostname'] ]) > if db_host > @@ -116,7 +203,6 @@ class DbOmatic < Qpid::Qmf::Console > if values == nil > values = {} > @cached_objects[obj.object_id.to_s] = values > - > # Save the agent and broker bank so that we can tell what objects > # are expired when the heartbeat for them stops. > values[:broker_bank] = obj.object_id.broker_bank > @@ -130,20 +216,38 @@ class DbOmatic < Qpid::Qmf::Console > end > > domain_state_change = false > - > + change_node = false > obj.properties.each do |key, newval| > if values[key.to_s] != newval > values[key.to_s] = newval > #log "new value for property #{key} : #{newval}" > if type == "domain" and key.to_s == "state" > - domain_state_change = true > + domain_state_change = true > end > + if type == "domain" and key.to_s == "node" > + log("DEBUG change node = true") > + change_node = true > + end > + > end > end > > if domain_state_change > + log("DEBUG update_domain_state") > update_domain_state(values) > end > + if change_node > + values.each do |key,val| > + if key == 'state' and val == 'running' > + abank = values['node'].to_s.split('-')[3] > + bbank = values['node'].to_s.split('-')[4] > + @@host_id = get_host_id(abank,bbank) > + set_host(values,@@host_id) > + log("update node data for #{values['name']}") > + break > + end > + end > + end > > if new_object > if type == "node" > @@ -187,11 +291,6 @@ class DbOmatic < Qpid::Qmf::Console > end > end > > - > - def del_agent(agent) > - agent_disconnected(agent) > - end > - > # This method marks objects associated with the given agent as timed out/invalid. Called either > # when the agent heartbeats out, or we get a del_agent callback. > def agent_disconnected(agent) > @@ -204,11 +303,14 @@ class DbOmatic < Qpid::Qmf::Console > if values[:timed_out] == false > if values[:class_type] == 'node' > update_host_state(values, Host::STATE_UNAVAILABLE) > + values[:timed_out] = true > elsif values[:class_type] == 'domain' > - update_domain_state(values, Vm::STATE_UNREACHABLE) > - end > + log("set_domain_stopped") > + set_domain_stopped(values) > + values[:timed_out] = true > + @cached_objects.delete(objkey) > + end > end > - values[:timed_out] = true > end > end > end > @@ -216,7 +318,6 @@ class DbOmatic < Qpid::Qmf::Console > # The opposite of above, this is called when an agent is alive and well and makes sure > # all of the objects associated with it are marked as valid. > def agent_connected(agent) > - > @cached_objects.keys.each do |objkey| > if @cached_objects[objkey][:broker_bank] == agent.broker.broker_bank and > @cached_objects[objkey][:agent_bank] == agent.agent_bank > @@ -248,6 +349,7 @@ class DbOmatic < Qpid::Qmf::Console > db_vm = Vm.find(:all) > db_vm.each do |vm| > log "Marking vm #{vm.description} as stopped." > + vm.host_id = nil > vm.state = Vm::STATE_STOPPED > vm.save > end > diff --git a/src/db/migrate/006_create_vms.rb b/src/db/migrate/006_create_vms.rb > index 74794b6..a5460f0 100644 > --- a/src/db/migrate/006_create_vms.rb > +++ b/src/db/migrate/006_create_vms.rb > @@ -34,6 +34,7 @@ class CreateVms < ActiveRecord::Migration > t.string :boot_device, :null => false > t.integer :vnc_port > t.integer :lock_version, :default => 0 > + t.boolean :ha , :default => true > end > execute "alter table vms add constraint fk_vms_hosts > foreign key (host_id) references hosts(id)" > -- > 1.5.5.6 > > _______________________________________________ > Ovirt-devel mailing list > Ovirt-devel at redhat.com > https://www.redhat.com/mailman/listinfo/ovirt-devel