[Linux-cluster] clurgmgrd hang/stuck

Guilherme G. Felix ggfelix at gmail.com
Wed Jul 15 21:41:37 UTC 2009


Hi all,

I'm having an odd problem with my 2-node cluster.

Everything starts up fine: I can relocate, restart, stop, and start services
without problems. I ran all the rg_test tests that were possible and
everything is working as designed.

However, for some weird reason rgmanager is freezing.

Yesterday, I raised the log_level to 7 and also started clurgmgrd with the
"-d" option. As I expected, it started printing all the status checks to my
logging facility. However, around 8 hours later the output stopped: clurgmgrd
froze completely and no longer responds to any "clusvcadm" commands. It also
stopped printing anything to my log files.

The only solution is to restart rgmanager on both nodes.

I tried attaching strace to the clurgmgrd process by its PID and got some
timeout errors, such as:

select(12, [10 11], NULL, NULL, {0, 908000}) = 0 (Timeout)

All the socket and FIFO files are in place and have the correct permissions,
though, and cman_tool and ccs_tool are working just fine.

I noted that clurgmgrd isn't forking as it is expected to.

I also ran clusvcadm under strace; here is the whole output.

[root@node1 ~]# strace clusvcadm -R "Service Web"
execve("/usr/sbin/clusvcadm", ["clusvcadm", "-R", "Service Web"], [/* 18
vars */]) = 0
brk(0)                                  = 0x83a4000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or
directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=40664, ...}) = 0
mmap2(NULL, 40664, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7fa6000
close(3)                                = 0
open("/usr/lib/libcman.so.2", O_RDONLY) = 3
read(3,
"\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0000MA\0004\0\0\0"..., 512) =
512
fstat64(3, {st_mode=S_IFREG|0755, st_size=17368, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7fa5000
mmap2(0x414000, 18952, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0)
= 0x414000
mmap2(0x418000, 4096, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3) = 0x418000
close(3)                                = 0
open("/lib/libpthread.so.0", O_RDONLY)  = 3
read(3,
"\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0000\330\265\0004\0\0\0"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=125612, ...}) = 0
mmap2(0xb59000, 90592, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0)
= 0xb59000
mmap2(0xb6c000, 8192, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12) = 0xb6c000
mmap2(0xb6e000, 4576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xb6e000
close(3)                                = 0
open("/lib/libdl.so.2", O_RDONLY)       = 3
read(3,
"\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0P:\265\0004\0\0\0"..., 512)
= 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=16428, ...}) = 0
mmap2(0xb53000, 12408, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0)
= 0xb53000
mmap2(0xb55000, 8192, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1) = 0xb55000
close(3)                                = 0
open("/usr/lib/libncurses.so.5", O_RDONLY) = 3
read(3,
"\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0`\204\252\0034\0\0\0"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=297464, ...}) = 0
mmap2(0x3a9a000, 297220, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3,
0) = 0x3a9a000
mmap2(0x3ada000, 32768, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x40) = 0x3ada000
mmap2(0x3ae2000, 2308, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3ae2000
close(3)                                = 0
open("/lib/libc.so.6", O_RDONLY)        = 3
read(3,
"\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\320\237\237\0004\0\0\0"...,
512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1606808, ...}) = 0
mmap2(0x9e4000, 1324452, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3,
0) = 0x110000
mmap2(0x24e000, 12288, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x13e) = 0x24e000
mmap2(0x251000, 9636, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x251000
close(3)                                = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7fa4000
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7fa3000
set_thread_area({entry_number:-1 -> 6, base_addr:0xb7fa36c0, limit:1048575,
seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1,
seg_not_present:0, useable:1}) = 0
mprotect(0x24e000, 8192, PROT_READ)     = 0
mprotect(0xb55000, 4096, PROT_READ)     = 0
mprotect(0xb6c000, 4096, PROT_READ)     = 0
mprotect(0x9e0000, 4096, PROT_READ)     = 0
munmap(0xb7fa6000, 40664)               = 0
set_tid_address(0xb7fa3708)             = 2488
set_robust_list(0xb7fa3710, 0xc)        = 0
futex(0xbfdb0074, FUTEX_WAKE_PRIVATE, 1) = 0
rt_sigaction(SIGRTMIN, {0xb5d3d0, [], SA_SIGINFO}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0xb5d2e0, [], SA_RESTART|SA_SIGINFO}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY}) = 0
uname({sys="Linux", node="node1", ...}) = 0
geteuid32()                             = 0
rt_sigaction(SIGPIPE, {SIG_IGN, [PIPE], SA_RESTART}, {SIG_DFL, [], 0}, 8) =
0
brk(0)                                  = 0x83a4000
brk(0x83c5000)                          = 0x83c5000
socket(PF_FILE, SOCK_STREAM, 0)         = 3
fcntl64(3, F_SETFD, FD_CLOEXEC)         = 0
connect(3, {sa_family=AF_FILE, path="/var/run/cman_client"}, 110) = 0
open("/dev/zero", O_RDONLY)             = 4
writev(3, [{"NAMC\3\0\0\20\24\0\0\0\7\0\0\0\0\0\0\0", 20}], 1) = 20
recv(3, "NAMC\24\0\0\0h\3\0\0\7\0\0@\0\0\0\0", 20, 0) = 20
read(3,
"\2\0\0\0\250\1\0\0\1\0\0\0\1\0\0\0\0\0\0\0\264\2\0\0\2\0\0\0hows"..., 852)
= 852
writev(3, [{"NAMC\3\0\0\20\24\0\0\0\7\0\0\0\0\0\0\0", 20}], 1) = 20
recv(3, "NAMC\24\0\0\0h\3\0\0\7\0\0@\0\0\0\0", 20, 0) = 20
read(3,
"\2\0\0\0\250\1\0\0\1\0\0\0\1\0\0\0\0\0\0\0\264\2\0\0\2\0\0\0hows"..., 852)
= 852
writev(3, [{"NAMC\3\0\0\20\274\1\0\0\220\0\0\0\0\0\0\0", 20},
{"x^\372\267\0\0\0\0\0P\372\267\304@A\0008Z\372\267\300\17\236\0\0\373\332\2774\374\332\277"...,
424}], 2) = 444
recv(3, "NAMC\24\0\0\0\300\1\0\0\220\0\0@\0\0\0\0", 20, 0) = 20
read(3,
"\0\0\0\0\250\1\0\0\1\0\0\0\1\0\0\0\0\0\0\0\264\2\0\0\2\0\0\0hows"..., 428)
= 428
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 0), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) =
0xb7faf000
write(1, "Local machine trying to restart "..., 54Local machine trying to
restart service:Service Web...) = 54
socket(PF_FILE, SOCK_STREAM, 0)         = 5
connect(5, {sa_family=AF_FILE, path="/var/run/cluster/rgmanager.sk"}, 110) =
0
select(6, NULL, [5], [5], NULL)         = 1 (out [5])
write(5,
"h\0\0\0\4\275\321?\22:\274\0\0\0\0h\0\23\205\202\0\0\0\0\0\0\0\0\0\0\0\0"...,
112) = 112
select(6, [5], NULL, [5], NULL <unfinished ...>

     As it wasn't doing what it was supposed to do, I ctrl+c'ed it.

     The following is my servers' information. All of them are running the
same kernel and package versions.

# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 5.3 (Tikanga)
# uname -a
Linux hows001nex 2.6.18-128.1.10.el5PAE #1 SMP Wed Apr 29 14:24:53 EDT 2009
i686 i686 i386 GNU/Linux
# rpm -qa | egrep 'cman|rgm'
rgmanager-2.0.46-1.el5
cman-2.0.98-1.el5
# cat /etc/cluster/cluster.conf
<?xml version="1.0"?>
<cluster alias="CLSCLU01" config_version="43" name="CLSCLU01">
        <fence_daemon post_fail_delay="0" post_join_delay="3"/>
        <clusternodes>
                <clusternode name="node1" nodeid="1" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="node1-rsa"/>
                                </method>
                        </fence>
                </clusternode>
                <clusternode name="node2" nodeid="2" votes="1">
                        <fence>
                                <method name="1">
                                        <device name="node2-rsa"/>
                                </method>
                        </fence>
                </clusternode>
        </clusternodes>
        <cman expected_votes="1" two_node="1"/>
        <fencedevices>
                <fencedevice agent="fence_rsa" ipaddr="node1-rsa"
login="username" name="node1-rsa" passwd="password"/>
                <fencedevice agent="fence_rsa" ipaddr="node2-rsa"
login="username" name="node2-rsa" passwd="password"/>
        </fencedevices>
        <rm log_level="7">
                <failoverdomains>
                        <failoverdomain name="WEB" ordered="1"
restricted="1">
                                <failoverdomainnode name="node1"
priority="1"/>
                                <failoverdomainnode name="node2"
priority="2"/>
                        </failoverdomain>
                </failoverdomains>
                <resources>
                        <ip address="10.9.16.40" monitor_link="1"/>
                        <ip address="10.9.16.41" monitor_link="1"/>
                        <ip address="10.9.16.45" monitor_link="1"/>
                        <ip address="10.9.16.46" monitor_link="1"/>
                </resources>
                <service autostart="1" domain="WEB" name="Service Web"
recovery="relocate">
                        <ip ref="10.9.16.40"/>
                        <ip ref="10.9.16.41"/>
                        <ip ref="10.9.16.45"/>
                        <ip ref="10.9.16.46">
                                <script file="/etc/init.d/jboss423.sh"
name="Script Jboss423">
                                        <script file="/etc/init.d/httpd"
name="Script Apache2"/>
                                        <script file="/etc/init.d/xinetd"
name="Script Xinetd">
                                                <script
file="/etc/init.d/cron-user.sh" name="Script Crond User"/>
                                        </script>
                                </script>
                        </ip>
                </service>
        </rm>
</cluster>

Thank you,

- G. Felix
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://listman.redhat.com/archives/linux-cluster/attachments/20090715/cfc3e3e0/attachment.htm>


More information about the Linux-cluster mailing list