[Linux-cluster] rgmanager causing hard lock ups

Ryan Thomson thomsonr at ucalgary.ca
Fri Dec 9 23:38:06 UTC 2005


Hi List,

I have a four-node RHCS cluster on RHEL4U2 using the RHN RPMs, with GFS
from CVS (RHEL4) and LVM2 (clvmd) built from the source tarball (2.2.01.09).

I'm seeing some rather disturbing behavior from my cluster. I can get all
the nodes to join, fence each other properly, etc. I also have some
services set up, mainly GFS mounts and NFS exports.

However, when I now bring up the cluster and start rgmanager, the node that
tries to start one or more of the services (I can't tell which service, but
I suspect the NFS export service) hard locks with the caps lock and
scroll lock lights blinking, and the rest of the cluster becomes useless:
services don't start, and rgmanager won't stop, reload, or do anything
else on any of the nodes. Also, I have all but one of my services set to NOT
autostart, yet when I start rgmanager, they begin starting anyway.

Here is my cluster.conf file. I suspect the problem is with my NFS export
service, as that is the only one I've changed since I started seeing this
behavior:

<?xml version="1.0" ?>
<cluster config_version="99" name="biocomp_cluster">
        <fence_daemon clean_start="1" post_fail_delay="0"
post_join_delay="3"/>
        <clusternodes>
                <clusternode name="wolverine" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="apcfence" port="1"
switch="0"/>
                                </method>
                        </fence>
                </clusternode>
                <clusternode name="skunk" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="apcfence" port="2"
switch="0"/>
                                </method>
                        </fence>
                </clusternode>
                <clusternode name="cottontail" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="apcfence" port="3"
switch="0"/>
                                </method>
                        </fence>
                </clusternode>
                <clusternode name="walrus" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="apcfence" port="4"
switch="0"/>
                                </method>
                        </fence>
                </clusternode>
        </clusternodes>
        <cman/>
        <fencedevices>
                <fencedevice agent="fence_apc" ipaddr="10.1.1.254"
login="fence_user" name="apcfence" passwd="xxx"/>
        </fencedevices>
        <rm>
                <failoverdomains>
                        <failoverdomain name="Cluster Failover"
ordered="0" restricted="1">
                                <failoverdomainnode name="wolverine"
priority="1"/>
                                <failoverdomainnode name="skunk"
priority="1"/>
                                <failoverdomainnode name="cottontail"
priority="1"/>
                        </failoverdomain>
                        <failoverdomain name="Backup" ordered="0"
restricted="1">
                                <failoverdomainnode name="walrus"
priority="1"/>
                        </failoverdomain>
                </failoverdomains>
                <resources>
                        <nfsexport name="Cluster Export"/>
                        <nfsclient name="Biocomp Clients"
options="rw,sync" target="xxx.xxx.xxx.xxx/24"/>
                        <clusterfs device="/dev/BIOCOMP/docs"
force_unmount="0" fstype="gfs"
mountpoint="/projects/docs" name="Documentation"
options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/ryan"
force_unmount="0" fstype="gfs"
mountpoint="/people/ryan" name="Home - Ryan"
options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/luca"
force_unmount="0" fstype="gfs"
mountpoint="/people/luca" name="Home - Luca"
options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jlmaccal"
force_unmount="0" fstype="gfs"
mountpoint="/people/jlmaccal" name="Home - Justin"
options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_hexane"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/hexane"
name="Project - JM Hexane" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_LJ"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/LJ" name="Project -
JM LJ" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_sidechain_pmf"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/sidechain_pmf"
name="Project - JM sidechain_pmf" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_CG"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/CG" name="Project -
JM CG" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_CISS3"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/CISS3"
name="Project - JM CISS3" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_OPLS-sidechain"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/OPLS-sidechain"
name="Project - JM OPLS-sidechain" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_arg_pull"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/arg_pull"
name="Project - JM arg_pull" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_halo"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/halo" name="Project
- JM halo" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_old_bison"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/old_bison"
name="Project - JM old_bison" options="acl"/>
                        <clusterfs device="/dev/BIOCOMP/jm_CISS2"
force_unmount="0" fstype="gfs"
mountpoint="/projects/jlmaccal/CISS2"
name="Project - JM CISS2" options="acl"/>
                </resources>
                <service domain="Cluster Failover" name="cluster NAT">
                        <ip address="10.1.1.1" monitor_link="1"/>
                        <script file="/cluster/scripts/cluster_nat"
name="cluster NAT script"/>
                </service>
                <service domain="Cluster Failover" name="FDS Service">
                        <ip address="xxx.xxx.xxx.xxx" monitor_link="1"/>
                        <script file="/cluster/scripts/fds" name="FDS
script"/>
                </service>
                <service domain="Cluster Failover" exclusive="1" name="NFS
Exports">
                        <ip address="xxx.xxx.xxx.xxx" monitor_link="1"/>
                        <clusterfs ref="Documentation">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Home - Ryan">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Home - Luca">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Home - Justin">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM Hexane">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM LJ">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM sidechain_pmf">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM CG">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM CISS3">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM OPLS-sidechain">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM arg_pull">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM halo">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM old_bison">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                        <clusterfs ref="Project - JM CISS2">
                                <nfsexport ref="Cluster Export">
                                        <nfsclient ref="Biocomp Clients"/>
                                </nfsexport>
                        </clusterfs>
                </service>
                <service domain="Backup" name="Backup Mounts">
                        <clusterfs ref="Documentation"/>
                        <clusterfs ref="Home - Ryan"/>
                        <clusterfs ref="Home - Luca"/>
                        <clusterfs ref="Home - Justin"/>
                        <clusterfs ref="Project - JM Hexane"/>
                        <clusterfs ref="Project - JM LJ"/>
                        <clusterfs ref="Project - JM sidechain_pmf"/>
                        <clusterfs ref="Project - JM CG"/>
                        <clusterfs ref="Project - JM CISS3"/>
                        <clusterfs ref="Project - JM OPLS-sidechain"/>
                        <clusterfs ref="Project - JM arg_pull"/>
                        <clusterfs ref="Project - JM halo"/>
                        <clusterfs ref="Project - JM old_bison"/>
                        <clusterfs ref="Project - JM CISS2"/>
                </service>
        </rm>
</cluster>

I'm wondering whether I set up the NFS exports in the "correct" fashion or
not... It was working this way just fine until I started adding a lot of
GFS volumes and NFS exports for each one.

Any clues?

-- 
Ryan




More information about the Linux-cluster mailing list