[Linux-cluster] cluster3 - service fails, doesn't failover/fence

Fri Dec 18 17:35:52 UTC 2009

I've got an otherwise well-behaved two-node + qdisk cluster3 (3.0.0)
setup running under Debian with a 2.6.30 kernel. In the past it has
fenced and failed over properly to recover from a failed node.
But yesterday one of the status checks returned 1, and the subsequent
automatic stop/start of that resource also failed. This put my
cluster service into a 'failed' state and all related components were
stopped. Everything was resolved with a manual service disable and enable.

Should the secondary have fenced in this case, or is that reserved for
only when communications in the cluster fail? I would have thought that
it would at least have tried to start the service. A clustat on either
machine showed the service as 'failed', and nothing was logged on the
non-active node.

Since a failover (rather than giving up) would be the proper thing, I'm
assuming a config issue. Any pointers?

<?xml version="1.0"?>
<!-- cluster.conf for cluster "alpha": two nodes plus a quorum disk. -->
<cluster name="alpha" config_version="42">

  <!-- two_node is "0"; the three expected votes match one vote per node
       plus the one vote declared on quorumd below. -->
  <cman two_node="0" expected_votes="3"/>

  <clusternodes>
    <clusternode name="wonder-p" votes="1" nodeid="1">
      <fence>
        <!-- Both devices are switched off before either is switched back on. -->
        <method name="single">
          <device name="pwr01" option="off"/>
          <device name="pwr02" option="off"/>
          <device name="pwr01" option="on"/>
          <device name="pwr02" option="on"/>
        </method>
      </fence>
    </clusternode>
    <clusternode name="nicks-p" votes="1" nodeid="2">
      <fence>
        <!-- Same off, off, on, on sequence as for wonder-p, using pwr03/pwr04. -->
        <method name="single">
          <device name="pwr03" option="off"/>
          <device name="pwr04" option="off"/>
          <device name="pwr03" option="on"/>
          <device name="pwr04" option="on"/>
        </method>
      </fence>
    </clusternode>
  </clusternodes>

  <!-- Quorum disk, located by label, with a single ping heuristic. -->
  <quorumd interval="1" tko="10" votes="1" label="quorumdisk">
    <heuristic program="ping 172.25.19.254 -c1 -t1"
               score="1"
               interval="2"
               tko="3"/>
  </quorumd>

  <fence_daemon post_join_delay="20"/>

  <!-- Four fence_apc_snmp devices: ports 4 and 3 on each of two PDU hosts. -->
  <fencedevices>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-2"
                 port="4"
                 name="pwr01"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-3"
                 port="4"
                 name="pwr02"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-2"
                 port="3"
                 name="pwr03"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-3"
                 port="3"
                 name="pwr04"
                 udpport="161"/>
  </fencedevices>

  <rm>

    <failoverdomains>
      <!-- Restricted, unordered domain; both nodes carry priority 1. -->
      <failoverdomain name="mailcluster" restricted="1" ordered="0">
        <failoverdomainnode name="wonder-p" priority="1"/>
        <failoverdomainnode name="nicks-p" priority="1"/>
      </failoverdomain>
    </failoverdomains>

    <!-- The MailHost service: scripts, two filesystems and one IP address,
         all declared as direct children of the service. -->
    <service name="MailHost" autostart="1" domain="mailcluster">
      <script name="MailHost-early"
              file="/etc/cluster/MailHost-misc-early"/>
      <!-- Both filesystems set force_unmount and active_monitor to "1". -->
      <fs name="mailhome"
          mountpoint="/home"
          device="LABEL=home"
          fstype="ext4"
          force_unmount="1"
          active_monitor="1"
          options="defaults,noatime,nodiratime"/>
      <fs name="mailcluster"
          mountpoint="/var/cluster"
          device="LABEL=cluster"
          fstype="ext3"
          force_unmount="1"
          active_monitor="1"
          options="defaults"/>
      <ip address="172.25.16.58" monitor_link="1"/>
      <script name="saslauthd" file="/etc/cluster/saslauthd-cluster"/>
      <script name="postfix" file="/etc/cluster/postfix-cluster"/>
      <!-- The next four scripts carry __independent_subtree="1".
           NOTE(review): presumably so that a failure of one of them is
           handled without restarting the whole service; confirm the exact
           recovery semantics against the rgmanager documentation. -->
      <script name="dovecot"
              file="/etc/cluster/dovecot-wrapper"
              __independent_subtree="1"/>
      <script name="mailman"
              file="/etc/cluster/mailman-wrapper"
              __independent_subtree="1"/>
      <script name="apache2-mailhost"
              file="/etc/cluster/apache2-mailhost"
              __independent_subtree="1"/>
      <script name="usermin"
              file="/etc/init.d/usermin"
              __independent_subtree="1"/>
      <script name="MailHost-late"
              file="/etc/cluster/MailHost-misc-late"/>
    </service>

  </rm>
</cluster>

Dec 15 12:37:00 bash Executing /etc/cluster/postfix-cluster status
Dec 15 12:37:00 bash Executing /etc/cluster/dovecot-wrapper status
Dec 15 12:37:00 bash Executing /etc/cluster/mailman-wrapper status
Dec 15 12:37:00 bash Executing /etc/cluster/apache2-mailhost status
Dec 15 12:37:00 bash Executing /etc/init.d/usermin status
Dec 15 12:37:00 bash script:usermin: status of /etc/init.d/usermin failed (returned 1)
Dec 15 12:37:01 bash Executing /etc/cluster/MailHost-misc-late status
Dec 15 12:37:01 bash Executing /etc/init.d/usermin stop
Dec 15 12:37:03 bash Executing /etc/init.d/usermin start
Dec 15 12:37:19 bash script:usermin: start of /etc/init.d/usermin failed (returned 98)
Dec 15 12:37:20 bash Executing /etc/cluster/MailHost-misc-late stop
Dec 15 12:37:21 bash Executing /etc/init.d/usermin stop
Dec 15 12:37:21 bash script:usermin: stop of /etc/init.d/usermin failed (returned 1)
Dec 15 12:37:21 bash Executing /etc/cluster/apache2-mailhost stop
Dec 15 12:37:24 bash Executing /etc/cluster/mailman-wrapper stop
Dec 15 12:37:42 bash Executing /etc/cluster/dovecot-wrapper stop
Dec 15 12:37:43 bash Executing /etc/cluster/postfix-cluster stop
Dec 15 12:37:56 bash Executing /etc/cluster/saslauthd-cluster stop
Dec 15 12:38:07 bash Executing /etc/cluster/MailHost-misc-early stop
Dec 15 12:38:08 bash Removing IPv4 address 172.25.16.58/22 from eth0
Dec 15 12:38:21 bash unmounting /var/cluster
Dec 15 12:38:21 bash Forcefully unmounting /var/cluster
Dec 15 12:38:22 bash killing process 6844 (daemon atd /var/cluster)
Dec 15 12:38:22 bash killing process 4274 (root bash /var/cluster)
Dec 15 12:38:22 bash killing process 6836 (root cron /var/cluster)
Dec 15 12:38:30 bash unmounting /var/cluster
Dec 15 12:38:32 bash unmounting /home
Dec 15 12:38:32 bash Forcefully unmounting /home
Dec 15 12:38:33 bash killing process 27678 (root bacula-fd /home)
Dec 15 12:38:41 bash unmounting /home
Dec 15 12:50:08 bash Executing /etc/cluster/MailHost-misc-late stop
Dec 15 12:50:08 bash Executing /etc/init.d/usermin stop
Dec 15 12:50:08 bash script:usermin: stop of /etc/init.d/usermin failed (returned 1)
Dec 15 12:50:08 bash Executing /etc/cluster/apache2-mailhost stop
Dec 15 12:50:09 bash Executing /etc/cluster/mailman-wrapper stop
Dec 15 12:50:09 bash script:mailman: stop of /etc/cluster/mailman-wrapper failed (returned 1)
Dec 15 12:50:09 bash Executing /etc/cluster/dovecot-wrapper stop
Dec 15 12:50:09 bash Executing /etc/cluster/postfix-cluster stop
Dec 15 12:50:09 bash Executing /etc/cluster/saslauthd-cluster stop
Dec 15 12:50:10 bash Executing /etc/cluster/MailHost-misc-early stop
Dec 15 12:50:10 bash 172.25.16.58 is not configured
Dec 15 12:50:10 bash /dev/dm-1 is not mounted
Dec 15 12:50:10 bash /dev/dm-0 is not mounted
Dec 15 12:50:20 bash Unknown file system type 'ext4' for device /dev/dm-0.  Assuming fsck is required.
Dec 15 12:50:20 bash Running fsck on /dev/dm-0
Dec 15 12:50:21 bash mounting /dev/dm-0 on /home
Dec 15 12:50:21 bash mount -t ext4 -o defaults,noatime,nodiratime /dev/dm-0 /home
Dec 15 12:50:22 bash quotaon not found in /bin:/sbin:/usr/bin:/usr/sbin
Dec 15 12:50:22 bash mounting /dev/dm-1 on /var/cluster
Dec 15 12:50:23 bash mount -t ext3 -o defaults /dev/dm-1 /var/cluster
Dec 15 12:50:23 bash quotaon not found in /bin:/sbin:/usr/bin:/usr/sbin
Dec 15 12:50:23 bash Link for eth0: Detected
Dec 15 12:50:23 bash Adding IPv4 address 172.25.16.58/22 to eth0
Dec 15 12:50:23 bash Sending gratuitous ARP: 172.25.16.58 00:30:48:c6:de:24 brd ff:ff:ff:ff:ff:ff
Dec 15 12:50:24 bash Executing /etc/cluster/MailHost-misc-early start
... startup continues fine