pronix.service at gmail.com
2009-Feb-03 16:45 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
From: root <root at voip.adenin.ru>
---
src/app/controllers/vm_controller.rb | 13 +++-
src/app/views/vm/show.rhtml | 16 ++++
src/db-omatic/db_omatic.rb | 126 ++++++++++++++++++++++++++++++---
src/db/migrate/006_create_vms.rb | 1 +
4 files changed, 143 insertions(+), 13 deletions(-)
diff --git a/src/app/controllers/vm_controller.rb
b/src/app/controllers/vm_controller.rb
index 56501fd..0f43680 100644
--- a/src/app/controllers/vm_controller.rb
+++ b/src/app/controllers/vm_controller.rb
@@ -20,11 +20,22 @@ require 'socket'
class VmController < ApplicationController
# GETs should be safe (see http://www.w3.org/2001/tag/doc/whenToUseGet.html)
- verify :method => :post, :only => [ :destroy, :create, :update ],
+ verify :method => :post, :only => [ :destroy, :create, :update ,
:change_ha_vm],
:redirect_to => { :controller => 'dashboard' }
before_filter :pre_vm_action, :only => [:vm_action, :cancel_queued_tasks,
:console]
+ def change_ha_vm
+ vm = Vm.find_by_id(params[:id])
+ if vm.ha
+ then vm.ha = false
+ else vm.ha = true
+ end
+ vm.save!
+ alert = "vm was change ha successfully "
+ render :json => { :object => "vm", :success => true,
:alert => alert }
+ end
+
def index
roles = "('" +
Permission::roles_for_privilege(Permission::PRIV_VIEW).join("',
'") +
diff --git a/src/app/views/vm/show.rhtml b/src/app/views/vm/show.rhtml
index f361131..b0479a6 100644
--- a/src/app/views/vm/show.rhtml
+++ b/src/app/views/vm/show.rhtml
@@ -29,6 +29,9 @@
</a>
<% end -%>
<% end %>
+ <a href="#" onClick="change_ha_vm()"
rel="facebox[.bolder]">
+ <%= image_tag "icon_x.png" %> <% if @vm.ha %>
disable_ha (cur. enabled)<% else %> enable_ha (cur. disabled) <% end
%>
+ </a>
<a href="#confirm_cancel"
rel="facebox[.bolder]">
<%= image_tag "icon_x.png" %> Cancel queued tasks
</a>
@@ -40,6 +43,19 @@
<%= confirmation_dialog("confirm_cancel", "Are you
sure?", "cancel_queued_tasks()") %>
<%= confirmation_dialog("confirm_delete", "Are you
sure?", "delete_vm()") %>
<script type="text/javascript">
+ function change_ha_vm()
+ {
+ $(document).trigger('close.facebox');
+ $.post('<%= url_for :controller => "vm", :action =>
"change_ha_vm", :id => @vm %>',
+ {x: 1},
+ function(data,status){
+ $("#vms_grid").flexReload();
+ if (data.alert) {
+ $.jGrowl(data.alert);
+ }
+ empty_summary('vms_selection', 'Virtual
Machine');
+ }, 'json');
+ }
function cancel_queued_tasks()
{
$(document).trigger('close.facebox');
diff --git a/src/db-omatic/db_omatic.rb b/src/db-omatic/db_omatic.rb
index 4afffb1..94d93fc 100755
--- a/src/db-omatic/db_omatic.rb
+++ b/src/db-omatic/db_omatic.rb
@@ -52,6 +52,8 @@ class DbOmatic < Qpid::Qmf::Console
state = Vm::STATE_PENDING
when "running"
state = Vm::STATE_RUNNING
+ # must find host for current vm and set host_id
+ # domain.broker
when "blocked"
state = Vm::STATE_SUSPENDED #?
when "paused"
@@ -74,6 +76,91 @@ class DbOmatic < Qpid::Qmf::Console
domain[:synced] = true
end
+ #find hostname from values['node'] where
values['class_type'] == 'domain'
+ def get_host_id(abank,bbank) #???????????????????? ???????????????? ????
???? ?????? ???????? - ???????????????? ?????????????????????????? ????????
????????????????????
+ begin
+ @cached_objects.keys.each do |objkey|
+ if @cached_objects[objkey][:agent_bank].to_s == abank and
@cached_objects[objkey][:broker_bank].to_s == bbank and
@cached_objects[objkey][:class_type].to_s == 'node'
+ return Host.find(:first, :conditions => ['hostname = ?', at
cached_objects[objkey]["hostname"].to_s]).id
+ break
+ end
+ end
+ rescue => ex
+ log("error in get_host_id")
+ log(ex)
+ end
+ end
+
+ def set_host(values,digit)
+ begin
+ vm = Vm.find(:first, :conditions => ['description =
?',values["name"].to_s])
+ if vm and digit
+ vm.host_id = digit
+ vm.save!
+ else
+ log("this vm not exist #{values["name"]}")
+ end
+ rescue => ex
+ puts "error when set_host for #{values["name"]}"
+ puts ex
+ end
+ end
+
+ def start_crashed_vm(vm)
+ task = VmTask.new( :user => 'db-omatic', :task_target => vm,
:action => 'start_vm', :state => 'queued')
+ task.save!
+ log("set task for start crashed vm #{vm.id}")
+ end
+
+ def fencing_by_ssh_reboot(hname)
+ begin
+ # Fix need - very insecure, and need manual configure keys, may be kerberos
ticket is better ?
+ Process.fork do
+ `ssh #{hname} -i /root/.ssh/id_rsa -o StrictHostKeyChecking=no
"/sbin/init 6" & `
+ end
+ # also can use ipmi - `ipmitool -I lan -H #{"ipmi."+hname} -U admin
-P password chassis power reset `
+ log("success fencing#{hname}")
+ # execute in shell, must be identify key for login as root to storage server
+ # after restart crashed host need manualy remove this rule from iptables
+
+ return true
+ rescue => ex
+ log("error in fencing #{hname}")
+ log(ex)
+ end
+ end
+
+ def set_domain_stopped(domain)
+ begin
+ vm = Vm.find(:first, :conditions => ['uuid = ?',
domain['uuid']])
+ if vm != nil
+
+ if vm.state == Vm::STATE_RUNNING and vm.ha
+ fencing_and_restart = true
+ fencing_by_ssh_reboot(vm.host.hostname)
+ end
+ puts fencing_and_restart
+ vm.state = Vm::STATE_STOPPED
+ vm.host_id = nil
+ vm.save!
+ domain['state'] = 'crashed'
+ if fencing_and_restart
+ log("must restart vm ")
+ start_crashed_vm(vm)
+ else
+ log("not running and not restart")
+ end
+ else
+ log('vm == nil ')
+ end
+ log("domain #{domain['id']} already stopped")
+ rescue => ex
+ log("can\'t set domain #{domain['id']} stopped")
+ log(ex)
+ end
+ end
+
+
def update_host_state(host_info, state)
db_host = Host.find(:first, :conditions => [ "hostname =
?", host_info['hostname'] ])
if db_host
@@ -116,7 +203,6 @@ class DbOmatic < Qpid::Qmf::Console
if values == nil
values = {}
@cached_objects[obj.object_id.to_s] = values
-
# Save the agent and broker bank so that we can tell what
objects
# are expired when the heartbeat for them stops.
values[:broker_bank] = obj.object_id.broker_bank
@@ -130,20 +216,38 @@ class DbOmatic < Qpid::Qmf::Console
end
domain_state_change = false
-
+ change_node = false
obj.properties.each do |key, newval|
if values[key.to_s] != newval
values[key.to_s] = newval
#log "new value for property #{key} : #{newval}"
if type == "domain" and key.to_s ==
"state"
- domain_state_change = true
+ domain_state_change = true
end
+ if type == "domain" and key.to_s == "node"
+ log("DEBUG change node = true")
+ change_node = true
+ end
+
end
end
if domain_state_change
+ log("DEBUG update_domain_state")
update_domain_state(values)
end
+ if change_node
+ values.each do |key,val|
+ if key == 'state' and val == 'running'
+ abank = values['node'].to_s.split('-')[3]
+ bbank = values['node'].to_s.split('-')[4]
+ @@host_id = get_host_id(abank,bbank)
+ set_host(values,@@host_id)
+ log("update node data for #{values['name']}")
+ break
+ end
+ end
+ end
if new_object
if type == "node"
@@ -187,11 +291,6 @@ class DbOmatic < Qpid::Qmf::Console
end
end
-
- def del_agent(agent)
- agent_disconnected(agent)
- end
-
# This method marks objects associated with the given agent as timed
out/invalid. Called either
# when the agent heartbeats out, or we get a del_agent callback.
def agent_disconnected(agent)
@@ -204,11 +303,14 @@ class DbOmatic < Qpid::Qmf::Console
if values[:timed_out] == false
if values[:class_type] == 'node'
update_host_state(values, Host::STATE_UNAVAILABLE)
+ values[:timed_out] = true
elsif values[:class_type] == 'domain'
- update_domain_state(values, Vm::STATE_UNREACHABLE)
- end
+ log("set_domain_stopped")
+ set_domain_stopped(values)
+ values[:timed_out] = true
+ @cached_objects.delete(objkey)
+ end
end
- values[:timed_out] = true
end
end
end
@@ -216,7 +318,6 @@ class DbOmatic < Qpid::Qmf::Console
# The opposite of above, this is called when an agent is alive and well and
makes sure
# all of the objects associated with it are marked as valid.
def agent_connected(agent)
-
@cached_objects.keys.each do |objkey|
if @cached_objects[objkey][:broker_bank] ==
agent.broker.broker_bank and
@cached_objects[objkey][:agent_bank] == agent.agent_bank
@@ -248,6 +349,7 @@ class DbOmatic < Qpid::Qmf::Console
db_vm = Vm.find(:all)
db_vm.each do |vm|
log "Marking vm #{vm.description} as stopped."
+ vm.host_id = nil
vm.state = Vm::STATE_STOPPED
vm.save
end
diff --git a/src/db/migrate/006_create_vms.rb b/src/db/migrate/006_create_vms.rb
index 74794b6..a5460f0 100644
--- a/src/db/migrate/006_create_vms.rb
+++ b/src/db/migrate/006_create_vms.rb
@@ -34,6 +34,7 @@ class CreateVms < ActiveRecord::Migration
t.string :boot_device, :null => false
t.integer :vnc_port
t.integer :lock_version, :default => 0
+ t.boolean :ha , :default => true
end
execute "alter table vms add constraint fk_vms_hosts
foreign key (host_id) references hosts(id)"
--
1.5.5.6
Ian Main
2009-Feb-04 19:46 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
On Tue, 3 Feb 2009 19:45:01 +0300 pronix.service at gmail.com wrote:> From: root <root at voip.adenin.ru>[snip] Thanks for reposting! I just wanted you to know that I'm madly working on LVM support right now and will review your patch as soon as I can. Thanks again, Ian
Ian Main
2009-Feb-17 22:19 UTC
[Ovirt-devel] [PATCH] Add virtual machine restart if host crashed. Use fencing by ssh and id_rsa key.
On Tue, 3 Feb 2009 19:45:01 +0300
pronix.service at gmail.com wrote:
OK, sorry it took me so long to finally review this properly!
It applied! WOOT! I get trailing whitespace warnings but that's ok :)
A few small things.. I notice you are using tabs to indent. I think you have
your editor set to 4 space tabs? In ruby we always use spaces (and most
projects I know use spaces now). Usually it's 2 spaces but in this one
it's 4 :) In my editor which is set to 8 space tabs it looks to be indented
wrong FYI.
I think the whole fencing with ssh might be a bit beyond what we are currently
trying to do and I'm not sure it's the right way to do this in a larger
scale. I do believe there are plans to use a more comprehensive project for
that in near future.
Also a better way I would think would be to use libvirt calls via qpid to ensure
VMs are destroyed?
I'm not sure I understand some of the logic in here either. Would you be
willing to explain some of the dbomatic changes? I also can't understand
the comments in Russian :). I think for consistency it would be good to have
all comments in english.. most of our developers read/speak english.
I notice also that you removed del_agent callback. While I think there is a bug
there which causes the node to be unavailable if you restart libvirt-qpid, I
don't know if this is the right way to fix it. I'm going to look at
that shortly.
Thanks again for the patch!
Ian
> From: root <root at voip.adenin.ru>
>
> ---
> src/app/controllers/vm_controller.rb | 13 +++-
> src/app/views/vm/show.rhtml | 16 ++++
> src/db-omatic/db_omatic.rb | 126
++++++++++++++++++++++++++++++---
> src/db/migrate/006_create_vms.rb | 1 +
> 4 files changed, 143 insertions(+), 13 deletions(-)
>
> diff --git a/src/app/controllers/vm_controller.rb
b/src/app/controllers/vm_controller.rb
> index 56501fd..0f43680 100644
> --- a/src/app/controllers/vm_controller.rb
> +++ b/src/app/controllers/vm_controller.rb
> @@ -20,11 +20,22 @@ require 'socket'
>
> class VmController < ApplicationController
> # GETs should be safe (see
http://www.w3.org/2001/tag/doc/whenToUseGet.html)
> - verify :method => :post, :only => [ :destroy, :create, :update ],
> + verify :method => :post, :only => [ :destroy, :create, :update ,
:change_ha_vm],
> :redirect_to => { :controller => 'dashboard' }
>
> before_filter :pre_vm_action, :only => [:vm_action,
:cancel_queued_tasks, :console]
>
> + def change_ha_vm
> + vm = Vm.find_by_id(params[:id])
> + if vm.ha
> + then vm.ha = false
> + else vm.ha = true
> + end
> + vm.save!
> + alert = "vm was change ha successfully "
> + render :json => { :object => "vm", :success =>
true, :alert => alert }
> + end
> +
> def index
> roles = "('" +
>
Permission::roles_for_privilege(Permission::PRIV_VIEW).join("',
'") +
> diff --git a/src/app/views/vm/show.rhtml b/src/app/views/vm/show.rhtml
> index f361131..b0479a6 100644
> --- a/src/app/views/vm/show.rhtml
> +++ b/src/app/views/vm/show.rhtml
> @@ -29,6 +29,9 @@
> </a>
> <% end -%>
> <% end %>
> + <a href="#" onClick="change_ha_vm()"
rel="facebox[.bolder]">
> + <%= image_tag "icon_x.png" %> <% if @vm.ha %>
disable_ha (cur. enabled)<% else %> enable_ha (cur. disabled) <% end
%>
> + </a>
> <a href="#confirm_cancel"
rel="facebox[.bolder]">
> <%= image_tag "icon_x.png" %> Cancel queued tasks
> </a>
> @@ -40,6 +43,19 @@
> <%= confirmation_dialog("confirm_cancel", "Are you
sure?", "cancel_queued_tasks()") %>
> <%= confirmation_dialog("confirm_delete", "Are you
sure?", "delete_vm()") %>
> <script type="text/javascript">
> + function change_ha_vm()
> + {
> + $(document).trigger('close.facebox');
> + $.post('<%= url_for :controller => "vm", :action
=> "change_ha_vm", :id => @vm %>',
> + {x: 1},
> + function(data,status){
> + $("#vms_grid").flexReload();
> + if (data.alert) {
> + $.jGrowl(data.alert);
> + }
> + empty_summary('vms_selection', 'Virtual
Machine');
> + }, 'json');
> + }
> function cancel_queued_tasks()
> {
> $(document).trigger('close.facebox');
> diff --git a/src/db-omatic/db_omatic.rb b/src/db-omatic/db_omatic.rb
> index 4afffb1..94d93fc 100755
> --- a/src/db-omatic/db_omatic.rb
> +++ b/src/db-omatic/db_omatic.rb
> @@ -52,6 +52,8 @@ class DbOmatic < Qpid::Qmf::Console
> state = Vm::STATE_PENDING
> when "running"
> state = Vm::STATE_RUNNING
> + # must find host for current vm and set host_id
> + # domain.broker
> when "blocked"
> state = Vm::STATE_SUSPENDED #?
> when "paused"
> @@ -74,6 +76,91 @@ class DbOmatic < Qpid::Qmf::Console
> domain[:synced] = true
> end
>
> + #find hostname from values['node'] where
values['class_type'] == 'domain'
> + def get_host_id(abank,bbank) #???????????????????? ????????????????
???? ???? ?????? ???????? - ???????????????? ?????????????????????????? ????????
????????????????????
> + begin
> + @cached_objects.keys.each do |objkey|
> + if @cached_objects[objkey][:agent_bank].to_s == abank and
@cached_objects[objkey][:broker_bank].to_s == bbank and
@cached_objects[objkey][:class_type].to_s == 'node'
> + return Host.find(:first, :conditions => ['hostname = ?', at
cached_objects[objkey]["hostname"].to_s]).id
> + break
> + end
> + end
> + rescue => ex
> + log("error in get_host_id")
> + log(ex)
> + end
> + end
> +
> + def set_host(values,digit)
> + begin
> + vm = Vm.find(:first, :conditions => ['description =
?',values["name"].to_s])
> + if vm and digit
> + vm.host_id = digit
> + vm.save!
> + else
> + log("this vm not exist #{values["name"]}")
> + end
> + rescue => ex
> + puts "error when set_host for #{values["name"]}"
> + puts ex
> + end
> + end
> +
> + def start_crashed_vm(vm)
> + task = VmTask.new( :user => 'db-omatic', :task_target =>
vm, :action => 'start_vm', :state => 'queued')
> + task.save!
> + log("set task for start crashed vm #{vm.id}")
> + end
> +
> + def fencing_by_ssh_reboot(hname)
> + begin
> + # Fix need - very insecure, and need manual configure keys, may be
kerberos ticket is better ?
> + Process.fork do
> + `ssh #{hname} -i /root/.ssh/id_rsa -o StrictHostKeyChecking=no
"/sbin/init 6" & `
> + end
> + # also can use ipmi - `ipmitool -I lan -H #{"ipmi."+hname} -U
admin -P password chassis power reset `
> + log("success fencing#{hname}")
> + # execute in shell, must be identify key for login as root to storage
server
> + # after restart crashed host need manualy remove this rule from iptables
> +
> + return true
> + rescue => ex
> + log("error in fencing #{hname}")
> + log(ex)
> + end
> + end
> +
> + def set_domain_stopped(domain)
> + begin
> + vm = Vm.find(:first, :conditions => ['uuid = ?',
domain['uuid']])
> + if vm != nil
> +
> + if vm.state == Vm::STATE_RUNNING and vm.ha
> + fencing_and_restart = true
> + fencing_by_ssh_reboot(vm.host.hostname)
> + end
> + puts fencing_and_restart
> + vm.state = Vm::STATE_STOPPED
> + vm.host_id = nil
> + vm.save!
> + domain['state'] = 'crashed'
> + if fencing_and_restart
> + log("must restart vm ")
> + start_crashed_vm(vm)
> + else
> + log("not running and not restart")
> + end
> + else
> + log('vm == nil ')
> + end
> + log("domain #{domain['id']} already stopped")
> + rescue => ex
> + log("can\'t set domain #{domain['id']}
stopped")
> + log(ex)
> + end
> + end
> +
> +
> def update_host_state(host_info, state)
> db_host = Host.find(:first, :conditions => [ "hostname =
?", host_info['hostname'] ])
> if db_host
> @@ -116,7 +203,6 @@ class DbOmatic < Qpid::Qmf::Console
> if values == nil
> values = {}
> @cached_objects[obj.object_id.to_s] = values
> -
> # Save the agent and broker bank so that we can tell what
objects
> # are expired when the heartbeat for them stops.
> values[:broker_bank] = obj.object_id.broker_bank
> @@ -130,20 +216,38 @@ class DbOmatic < Qpid::Qmf::Console
> end
>
> domain_state_change = false
> -
> + change_node = false
> obj.properties.each do |key, newval|
> if values[key.to_s] != newval
> values[key.to_s] = newval
> #log "new value for property #{key} :
#{newval}"
> if type == "domain" and key.to_s ==
"state"
> - domain_state_change = true
> + domain_state_change = true
> end
> + if type == "domain" and key.to_s == "node"
> + log("DEBUG change node = true")
> + change_node = true
> + end
> +
> end
> end
>
> if domain_state_change
> + log("DEBUG update_domain_state")
> update_domain_state(values)
> end
> + if change_node
> + values.each do |key,val|
> + if key == 'state' and val == 'running'
> + abank = values['node'].to_s.split('-')[3]
> + bbank = values['node'].to_s.split('-')[4]
> + @@host_id = get_host_id(abank,bbank)
> + set_host(values,@@host_id)
> + log("update node data for #{values['name']}")
> + break
> + end
> + end
> + end
>
> if new_object
> if type == "node"
> @@ -187,11 +291,6 @@ class DbOmatic < Qpid::Qmf::Console
> end
> end
>
> -
> - def del_agent(agent)
> - agent_disconnected(agent)
> - end
> -
> # This method marks objects associated with the given agent as timed
out/invalid. Called either
> # when the agent heartbeats out, or we get a del_agent callback.
> def agent_disconnected(agent)
> @@ -204,11 +303,14 @@ class DbOmatic < Qpid::Qmf::Console
> if values[:timed_out] == false
> if values[:class_type] == 'node'
> update_host_state(values, Host::STATE_UNAVAILABLE)
> + values[:timed_out] = true
> elsif values[:class_type] == 'domain'
> - update_domain_state(values, Vm::STATE_UNREACHABLE)
> - end
> + log("set_domain_stopped")
> + set_domain_stopped(values)
> + values[:timed_out] = true
> + @cached_objects.delete(objkey)
> + end
> end
> - values[:timed_out] = true
> end
> end
> end
> @@ -216,7 +318,6 @@ class DbOmatic < Qpid::Qmf::Console
> # The opposite of above, this is called when an agent is alive and
well and makes sure
> # all of the objects associated with it are marked as valid.
> def agent_connected(agent)
> -
> @cached_objects.keys.each do |objkey|
> if @cached_objects[objkey][:broker_bank] ==
agent.broker.broker_bank and
> @cached_objects[objkey][:agent_bank] == agent.agent_bank
> @@ -248,6 +349,7 @@ class DbOmatic < Qpid::Qmf::Console
> db_vm = Vm.find(:all)
> db_vm.each do |vm|
> log "Marking vm #{vm.description} as stopped."
> + vm.host_id = nil
> vm.state = Vm::STATE_STOPPED
> vm.save
> end
> diff --git a/src/db/migrate/006_create_vms.rb
b/src/db/migrate/006_create_vms.rb
> index 74794b6..a5460f0 100644
> --- a/src/db/migrate/006_create_vms.rb
> +++ b/src/db/migrate/006_create_vms.rb
> @@ -34,6 +34,7 @@ class CreateVms < ActiveRecord::Migration
> t.string :boot_device, :null => false
> t.integer :vnc_port
> t.integer :lock_version, :default => 0
> + t.boolean :ha , :default => true
> end
> execute "alter table vms add constraint fk_vms_hosts
> foreign key (host_id) references hosts(id)"
> --
> 1.5.5.6
>
> _______________________________________________
> Ovirt-devel mailing list
> Ovirt-devel at redhat.com
> https://www.redhat.com/mailman/listinfo/ovirt-devel