[Linux-cluster] RHEL 5.7: cpg_leave error retrying

Robert Hayden rhayden.public at gmail.com
Fri Sep 2 14:33:17 UTC 2011


I modified the /etc/init.d/cman script to pass the -D flag to groupd at
startup and redirect the output to a file in /tmp.  During the hang, I
see groupd looping on the cpg_leave call.  When I restart openais, it
appears that groupd gets error code 2 and then breaks out of the loop.
Looks like I need to dig into the openais cpg_leave function next.
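
For reference, here is roughly the pattern I believe groupd is stuck in --
just a minimal sketch against the openais cpg API from cpg.h, not the
actual groupd source; the function name leave_with_retry and the log
strings are placeholders modeled on the messages in the log below:

/* Sketch only: keep retrying cpg_leave() while openais answers "try again",
 * bail out on any other result.  Assumes openais-devel (openais/cpg.h). */
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <openais/cpg.h>

static int leave_with_retry(cpg_handle_t handle, const char *name)
{
        struct cpg_name gname;
        cpg_error_t error;
        int i = 0;

        memset(&gname, 0, sizeof(gname));
        gname.length = strlen(name);
        if (gname.length > sizeof(gname.value))
                gname.length = sizeof(gname.value);
        memcpy(gname.value, name, gname.length);

        for (;;) {
                error = cpg_leave(handle, &gname);
                if (error != CPG_ERR_TRY_AGAIN)
                        break;          /* anything but "try again" ends the loop */
                if (!(++i % 10))
                        printf("0:default cpg_leave error retrying\n");
                printf("cpg_leave error retry\n");
                sleep(1);               /* matches the one-second cadence in the log */
        }

        if (error != CPG_OK) {
                /* e.g. the error 2 seen once openais is restarted underneath groupd */
                printf("cpg_leave error %d\n", error);
                return -1;
        }
        return 0;
}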


Here is the groupd -D output, with the openais restart at the very end.

1314973495 cman: our nodeid 2 name node2-priv quorum 1
1314973495 setup_cpg groupd_handle 6b8b456700000000
1314973495 groupd confchg total 2 left 0 joined 1
1314973495 send_version nodeid 2 cluster 2 mode 2 compat 1
1314973495 client connection 3
1314973495 got client 3 setup
1314973495 setup fence 0
1314973495 client connection 4
1314973495 got client 4 setup
1314973495 setup dlm 1
1314973495 client connection 5
1314973495 got client 5 setup
1314973495 setup gfs 2
1314973496 got client 3 join
1314973496 0:default got join
1314973496 0:default is cpg client 6 name 0_default handle 79e2a9e300000001
1314973496 0:default cpg_join ok
1314973496 0:default waiting for first cpg event
1314973496 client connection 7
1314973496 0:default waiting for first cpg event
1314973496 0:default confchg left 0 joined 1 total 2
1314973496 0:default process_node_join 2
1314973496 0:default cpg add node 1 total 1
1314973496 0:default cpg add node 2 total 2
1314973496 0:default make_event_id 200020001 nodeid 2 memb_count 2 type 1
1314973496 0:default queue join event for nodeid 2
1314973496 0:default process_current_event 200020001 2 JOIN_BEGIN
1314973496 0:default app node init: add 2 total 1
1314973496 0:default app node init: add 1 total 2
1314973496 0:default waiting for 1 more stopped messages before JOIN_ALL_STOPPED 2
1314973496 got client 7 get_group
1314973496 0:default waiting for 1 more stopped messages before JOIN_ALL_STOPPED 2
1314973496 0:default waiting for 1 more stopped messages before JOIN_ALL_STOPPED 2
1314973496 0:default mark node 1 stopped
1314973496 0:default set global_id 10001 from 1
1314973496 0:default process_current_event 200020001 2 JOIN_ALL_STOPPED
1314973496 0:default action for app: setid default 65537
1314973496 0:default action for app: start default 1 2 2 1 2
1314973496 0:default mark node 1 started
1314973496 client connection 7
1314973496 got client 7 get_group
1314973496 client connection 7
1314973496 got client 7 get_group
1314973496 got client 3 start_done
1314973496 0:default send started
1314973496 0:default mark node 2 started
1314973496 0:default process_current_event 200020001 2 JOIN_ALL_STARTED
1314973496 0:default action for app: finish default 1
1314973497 client connection 7
1314973497 got client 7 get_group
1314973557 cman: node 0 added
1314973580 0:default confchg left 1 joined 0 total 1
1314973580 0:default confchg removed node 1 reason 2
1314973580 0:default process_node_leave 1
1314973580 0:default cpg del node 1 total 1
1314973580 0:default make_event_id 100010002 nodeid 1 memb_count 1 type 2
1314973580 0:default queue leave event for nodeid 1
1314973580 0:default process_current_event 100010002 1 LEAVE_BEGIN
1314973580 0:default action for app: stop default
1314973580 got client 3 stop_done
1314973580 0:default send stopped
1314973580 0:default waiting for 2 more stopped messages before LEAVE_ALL_STOPPED 1
1314973580 0:default mark node 1 stopped
1314973580 0:default waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
1314973580 0:default waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
1314973580 0:default mark node 2 stopped
1314973580 0:default process_current_event 100010002 1 LEAVE_ALL_STOPPED
1314973580 0:default app node leave: del 1 total 1
1314973580 0:default action for app: start default 2 3 1 2
1314973580 got client 3 start_done
1314973580 0:default send started
1314973580 0:default mark node 2 started
1314973580 0:default process_current_event 100010002 1 LEAVE_ALL_STARTED
1314973580 0:default action for app: finish default 2
1314973583 cman: node 1 removed
1314973583 add_recovery_set_cman nodeid 1
1314973591 got client 3 leave
1314973591 0:default got leave
1314973591 cpg_leave error retry
1314973592 cpg_leave error retry
1314973593 cpg_leave error retry
1314973594 cpg_leave error retry
1314973595 cpg_leave error retry
1314973596 cpg_leave error retry
1314973597 cpg_leave error retry
1314973598 cpg_leave error retry
1314973599 cpg_leave error retry
1314973600 cpg_leave error retry
1314973601 0:default cpg_leave error retrying
1314973601 cpg_leave error retry
1314973602 cpg_leave error retry
1314973603 cpg_leave error retry
1314973604 cpg_leave error retry
1314973605 cpg_leave error retry
1314973606 cpg_leave error retry
1314973607 cpg_leave error retry
1314973608 cpg_leave error retry
1314973609 cpg_leave error retry
1314973610 cpg_leave error retry
1314973611 0:default cpg_leave error retrying
1314973611 cpg_leave error retry
1314973612 cpg_leave error retry
1314973613 cpg_leave error retry
1314973614 cpg_leave error retry
1314973615 cpg_leave error retry
1314973616 cpg_leave error retry
1314973617 cpg_leave error retry
1314973618 cpg_leave error retry
1314973619 cpg_leave error retry
1314973620 cpg_leave error retry
1314973621 0:default cpg_leave error retrying
1314973621 cpg_leave error retry
1314973622 cpg_leave error retry
1314973623 cpg_leave error retry
1314973624 cpg_leave error retry
1314973625 cpg_leave error retry
1314973626 cpg_leave error retry
1314973627 cpg_leave error retry
1314973628 cpg_leave error retry
1314973629 cpg_leave error retry
1314973630 cpg_leave error retry
1314973631 0:default cpg_leave error retrying
1314973631 cpg_leave error retry
1314973632 cpg_leave error retry
1314973633 cpg_leave error retry
1314973634 cpg_leave error retry
1314973635 cpg_leave error retry
1314973636 cpg_leave error retry
1314973637 cpg_leave error retry
1314973640 0:default cpg_leave error 2
1314973640 client connection 7
1314973640 cluster is down, exiting
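
If I am reading openais's cpg.h correctly, the final "cpg_leave error 2" is
CPG_ERR_LIBRARY, i.e. groupd's library connection to aisexec went away when I
restarted openais, which is also why it then logs "cluster is down, exiting".
The one-per-second retries before that would be cpg_leave() returning
CPG_ERR_TRY_AGAIN.  Relevant values from cpg_error_t (from memory -- worth
double-checking against the installed header; they mirror the SA Forum AIS
error codes):

        CPG_OK            = 1,   /* success */
        CPG_ERR_LIBRARY   = 2,   /* connection to the executive (aisexec) lost */
        CPG_ERR_TRY_AGAIN = 6,   /* executive busy/not ready -- retry the call later */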





On Fri, Sep 2, 2011 at 8:38 AM, Robert Hayden <rhayden.public at gmail.com> wrote:
> Has anyone experienced the following error/hang/loop when attempting
> to stop rgmanager or cman on the last node of a two-node cluster?
>
> groupd[4909]: cpg_leave error retrying
>
> Basic scenario:
> RHEL 5.7 with the latest errata for cman.
> Create a two-node cluster with qdisk and a higher totem token (70000).
> Start cman on both nodes; wait for qdisk to come online with a master determined.
> Stop cman on node1; wait for it to complete.
> Stop cman on node2.
> The "cpg_leave" error is seen in the logging output.
>
> Observations:
> The "service cman stop" command hangs at "Stopping fencing" output
> If I cycle openais service with "service openais restart", then the
> "service cman stop" will complete (need to manually stop the openais
> service afterwards).
> When hung, the command "group_tool dump" hangs (any group_tool command hangs).
> The hang is inconsistent, which, in my mind, implies a timing issue.
> Inconsistent meaning that every once in a while the shutdown will
> complete (maybe 20% of the time).
> I have seen the issue when stopping both rgmanager and cman.  The
> example below has been stripped down to show the hang with cman.
> I have tested with varying the length of time to wait before stopping
> the second node, with no difference (the hang still occurs periodically).
> I have tested with the totem token and quorum_dev_poll settings
> commented out and still experienced the hang (we use the longer
> timeouts to help survive network and SAN blips).
>
>
> I have dug through some of the source code.  The message appears in
> groupd's cpg.c, in the do_cpg_leave() function, which calls the
> cpg_leave() function in the openais package.
>
> If I attach to the groupd process with gdb, I get the following stack.
>  Watching with strace, groupd is just in a looping state.
> (gdb) where
> #0  0x000000341409a510 in __nanosleep_nocancel () from /lib64/libc.so.6
> #1  0x000000341409a364 in sleep () from /lib64/libc.so.6
> #2  0x000000000040a410 in time ()
> #3  0x000000000040bd09 in time ()
> #4  0x000000000040e2cb in time ()
> #5  0x000000000040ebe0 in time ()
> #6  0x000000000040f394 in time ()
> #7  0x000000341401d994 in __libc_start_main () from /lib64/libc.so.6
> #8  0x00000000004018f9 in time ()
> #9  0x00007fff04a671c8 in ?? ()
> #10 0x0000000000000000 in ?? ()
>
> If I attach to the aisexec process with gdb, I see the following:
> (gdb) where
> #0  0x00000034140cb696 in poll () from /lib64/libc.so.6
> #1  0x0000000000405c50 in poll_run ()
> #2  0x0000000000418aae in main ()
>
>
> As you can see in the cluster.conf example below, I have attempted
> many different ways to enable more debug logging.  I do see debug
> messages from openais's cpg.c component during startup, but nothing
> is logged during the shutdown hang.
>
> I would appreciate any guidance on how to troubleshoot further,
> especially with increasing the tracing of the openais calls in cpg.c.
>
> Thanks
> Robert
>
>
> Example cluster.conf:
> <?xml version="1.0"?>
> <cluster config_version="33" name="cluster_app_1">
>        <logging to_syslog="yes" syslog_facility="local4" timestamp="on" debug="on">
>                <logger ident="CPG" debug="on"/>
>                <logger ident="CMAN" debug="on"/>
>        </logging>
>        <cman expected_nodes="2" expected_votes="3" quorum_dev_poll="70000">
>                <multicast addr="239.192.1.192"/>
>        </cman>
>        <totem token="70000"/>
>        <fence_daemon clean_start="0" log_facility="local4" post_fail_delay="10" post_join_delay="60"/>
>        <quorumd interval="1" label="rhcs_qdisk" log_facility="local4" log_level="7" min_score="1" tko="60" votes="1">
>                <heuristic interval="2" program="/bin/ping -c1 -t2 -Ibond0 10.162.106.1" score="1" tko="3"/>
>        </quorumd>
>        <clusternodes>
>                <clusternode name="node1-priv" nodeid="1" votes="1">
>                        <fence>
>                                <method name="1">
>                                        <device name="iLO_node1"/>
>                                </method>
>                        </fence>
>                        <multicast addr="239.192.1.192" interface="bond1"/>
>                </clusternode>
>                <clusternode name="node2-priv" nodeid="2" votes="1">
>                        <fence>
>                                <method name="1">
>                                        <device name="iLO_node2"/>
>                                </method>
>                        </fence>
>                        <multicast addr="239.192.1.192" interface="bond1"/>
>                </clusternode>
>        </clusternodes>
>        <fencedevices>
>                <fencedevice action="off" agent="fence_ipmilan" ipaddr="X.X.X.X" login="node1_fence" name="iLO_node1" passwd="password" power_wait="10" lanplus="1"/>
>                <fencedevice action="off" agent="fence_ipmilan" ipaddr="X.X.X.X" login="node2_fence" name="iLO_node2" passwd="password" power_wait="10" lanplus="1"/>
>        </fencedevices>
>        <rm log_level="7"/>
> </cluster>
>



