[Linux-cluster] clurgmgrd refuses to die

Jie Gao J.Gao at isu.usyd.edu.au
Mon Jul 24 01:29:13 UTC 2006


Hi All

I have been having a persistent problem with shutting down a cluster node.

I have a two-node cluster. If Node A starts first, there is no problem
rebooting Node B at any time. But if I try to reboot Node A, the shutdown
hangs while trying to kill "clurgmgrd":

> ps -eaf |grep clurgmgrd
116:root     25824     1  0 10:45 ?        00:00:00 clurgmgrd

> strace -f kill -TERM 25824
execve("/bin/kill", ["kill", "-TERM", "25824"], [/* 28 vars */]) = 0
uname({sys="Linux", node="mix", ...})   = 0
brk(0)                                  = 0x503000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95556000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=114663, ...}) = 0
mmap(NULL, 114663, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95557000
close(3)                                = 0
open("/lib64/tls/libc.so.6", O_RDONLY)  = 3
read(3, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0`\305\21"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1493186, ...}) = 0
mmap(0x3e10100000, 2310056, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3e10100000
mprotect(0x3e1022b000, 1085352, PROT_NONE) = 0
mmap(0x3e1032a000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x12a000) = 0x3e1032a000
mmap(0x3e10330000, 16296, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3e10330000
close(3)                                = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95573000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2a95574000
mprotect(0x3e1032a000, 12288, PROT_READ) = 0
arch_prctl(ARCH_SET_FS, 0x2a95573b00)   = 0
munmap(0x2a95557000, 114663)            = 0
open("/usr/lib/locale/locale-archive", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=48516832, ...}) = 0
mmap(NULL, 48516832, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2a95575000
close(3)                                = 0
brk(0)                                  = 0x503000
brk(0x524000)                           = 0x524000
kill(25824, SIGTERM)                    = 0
exit_group(0)                           = ?
Process 28578 detached

> ps -eaf |grep clurgmgrd
116:root     25824     1  0 10:45 ?        00:00:00 clurgmgrd

> strace -p 25824
Process 25824 attached - interrupt to quit
select(7, [4 5 6], NULL, NULL, {7, 735000}) = 0 (Timeout)
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\1\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\3\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45
read(9, "\3\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\3\0\0\0", 20) = 20
read(9, "30\0", 3)                      = 3
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\2\0\0\0\0\0\0\0\210\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
select(7, [6], [6], NULL, {0, 0})       = 0 (Timeout)
select(6, [5], [5], NULL, {0, 0})       = 0 (Timeout)
select(7, [4 5 6], NULL, NULL, {10, 0}) = 0 (Timeout)
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\1\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\3\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45
read(9, "\3\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\3\0\0\0", 20) = 20
read(9, "30\0", 3)                      = 3
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\2\0\0\0\0\0\0\0\246\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
select(7, [6], [6], NULL, {0, 0})       = 0 (Timeout)
select(6, [5], [5], NULL, {0, 0})       = 0 (Timeout)
select(7, [4 5 6], NULL, NULL, {10, 0}) = 0 (Timeout)
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\1\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\3\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\31\0\0\0/cluster/@"..., 45) = 45
read(9, "\3\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\3\0\0\0", 20) = 20
read(9, "30\0", 3)                      = 3
close(9)                                = 0
socket(PF_FILE, SOCK_STREAM, 0)         = 9
connect(9, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(9, "\2\0\0\0\0\0\0\0\304\35\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(9, "\2\0\0\0\0\0\0\0\377\377\377\377\0\0\0\0\0\0\0\0", 20) = 20
close(9)                                = 0
...


What exactly is clurgmgrd trying to do?

Regards,


Jie




More information about the Linux-cluster mailing list