[Cluster-devel] [PATCH] gfs2_utils: Add gfs2_lockgather data gathering script

Adam Drew adrew at redhat.com
Mon Jan 9 22:52:48 UTC 2012


I wrote a simple data gathering script for GFS2 called gfs2_lockgather. It should help in situations where data about a possible locking or performance issue involving GFS2 is required. It gathers system information, DLM data, glock data, and thread dumps. The data gather can be run on a single node or a single node can run it on all nodes. The data gathered is quite good for diagnosing performance and locking issues.

- Adam

diff --git a/configure.ac b/configure.ac
index 81ffad8..3fe1a49 100644
--- a/configure.ac
+++ b/configure.ac
@@ -285,6 +285,7 @@ AC_CONFIG_FILES([Makefile
		 gfs2/tool/Makefile
		 gfs2/tune/Makefile
		 gfs2/man/Makefile
+		 gfs2/lockgather/Makefile
		 doc/Makefile
		 po/Makefile.in
		 ])
diff --git a/gfs2/Makefile.am b/gfs2/Makefile.am
index 9116bd3..08e59c4 100644
--- a/gfs2/Makefile.am
+++ b/gfs2/Makefile.am
@@ -1,4 +1,4 @@
MAINTAINERCLEANFILES	= Makefile.in

SUBDIRS			= libgfs2 convert edit fsck mkfs mount quota tool man \
-			  tune include #init.d 
+			  tune include lockgather #init.d 
diff --git a/gfs2/lockgather/Makefile.am b/gfs2/lockgather/Makefile.am
new file mode 100644
index 0000000..fe8b480
--- /dev/null
+++ b/gfs2/lockgather/Makefile.am
@@ -0,0 +1,12 @@
+MAINTAINERCLEANFILES    = Makefile.in
+
+# When an exec_prefix setting would have us install into /usr/sbin,
+# use /sbin instead.
+# Accept an existing sbindir value of /usr/sbin (probably for older automake),
+# or an empty value, for automake-1.11 and newer.
+sbindir := $(shell rpl=0; test '$(exec_prefix):$(sbindir)' = /usr:/usr/sbin \
+                        || test '$(exec_prefix):$(sbindir)' = /usr: && rpl=1; \
+                                     test $$rpl = 1 && echo /sbin || echo '$(exec_prefix)/sbin')
+
+# gfs2_lockgather is a script installed into $(sbindir); the dist_ prefix also ships it in the release tarball.
+dist_sbin_SCRIPTS           = gfs2_lockgather
diff --git a/gfs2/lockgather/gfs2_lockgather b/gfs2/lockgather/gfs2_lockgather
new file mode 100644
index 0000000..ed4a0c5
--- /dev/null
+++ b/gfs2/lockgather/gfs2_lockgather
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+#    gfs2_lockgather - A script that gathers data for diagnosing GFS2 locking issues
+#    Copyright 2012 Adam Drew <adrew at redhat.com>
+
+#    This program is free software: you can redistribute it and/or modify
+#    it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+QUIET=false
+
+#Handle arguments. Every argument is matched as one quoted word, so an empty
+#argument or one containing whitespace cannot break the comparison.
+for var in "$@"
+do
+	case "$var" in
+	#Handle running on all nodes
+	-a|--allnodes)
+		#We gather via SSH on all nodes, even the local node.
+		#We do this because determining which node name is the
+		#node running the script is too much logic to be worth it.
+		for node in $(ccs_tool lsnode | tail --lines=+5 | grep -v "Cluster name" | grep -v "Nodename" | awk '{print $1}') ; do
+			echo "Starting data gathering on $node..."
+			ssh -q -f "root@$node" '/sbin/gfs2_lockgather -q'
+			echo "gfs2_lockgather will log a message in /var/log/messages on $node when complete or if there is an error."
+		done
+		exit 0
+		;;
+
+	#Handle quiet mode
+	-q|--quiet)
+		QUIET=true
+		;;
+
+	#Handle help request
+	-h|--help|--info)
+		echo "gfs2_lockgather, version 1"
+		echo "A script that gathers data for diagnosing GFS2 locking issues."
+		echo "---------------------------------------------------------------"
+		echo "To gather on a single node invoke the script with no arguments."
+		echo "To see this message use --help, --info, or -h."
+		echo "To run with messages suppressed use --quiet or -q."
+		echo "To gather on all nodes invoke the script with --allnodes or -a."
+		echo "Only 1 instance of gfs2_lockgather may run on a node at a time."
+		echo ""
+		exit 0
+		;;
+	esac
+done
+
+#Take the gather lock. We only want one instance running at a time.
+#With noclobber set the redirect fails if the file already exists, so the
+#existence check and the creation happen in one step (no race between runs).
+if ! ( set -o noclobber ; echo $$ > /var/run/gfs2_lockgather.lock ) 2> /dev/null ; then
+	echo -ne 'Error: Lock file /var/run/gfs2_lockgather.lock found.\nAnother instance of gfs2_lockgather may be running.\nAnother node may be running a gather on this node.\n'
+	logger -t gfs2_lockgather 'Error: Lock file /var/run/gfs2_lockgather.lock found. Another instance may be running. Quitting.'
+	exit 1
+fi
+#Drop the lock on any exit path so an aborted gather cannot block later runs
+trap 'rm -f /var/run/gfs2_lockgather.lock' EXIT
+logger -t gfs2_lockgather 'Gather started.'
+
+if [ "$QUIET" == false ] ; then echo -ne '[       ]  Setting up for gather.\t\t\t\t\t\t\t\t\r' ; fi
+#Get the current datetime for unique naming
+DATETIME=$(date +%m%d%Y-%H%M%S)
+ 
+#Set up the directory structure. -p keeps a stale /tmp/debugfs left behind by
+#an aborted run from being an error; a failed mount is logged because the
+#lock dumps taken later would otherwise silently come out empty.
+mkdir -p /tmp/debugfs
+mount -t debugfs none /tmp/debugfs || logger -t gfs2_lockgather 'Warning: could not mount debugfs on /tmp/debugfs.'
+mkdir -p "/tmp/$(hostname)-${DATETIME}-gfshangdata/run1" "/tmp/$(hostname)-${DATETIME}-gfshangdata/run2"
+
+if [ "$QUIET" == false ] ; then echo -ne '[#      ]  Gathering environment data.\t\t\t\t\t\t\t\t\r'  ; fi
+#Gather some basics. Every command writes to its own file so no output is overwritten.
+clustat > "/tmp/$(hostname)-${DATETIME}-gfshangdata/clustat.out"
+cman_tool services > "/tmp/$(hostname)-${DATETIME}-gfshangdata/cman_tool-services.out"
+mount -l > "/tmp/$(hostname)-${DATETIME}-gfshangdata/mount-l.out"
+ps aux > "/tmp/$(hostname)-${DATETIME}-gfshangdata/ps-aux.out"
+uname -a > "/tmp/$(hostname)-${DATETIME}-gfshangdata/uname-a.out"
+
+#Dump every DLM file and every GFS2 glocks file found under the debugfs mount
+#into the given run directory ($1 = run1 or run2). A shell glob replaces the
+#parsing of "ls -l" output, whose column layout is not guaranteed.
+dump_locks() {
+	local out="/tmp/$(hostname)-${DATETIME}-gfshangdata/$1" src
+	for src in /tmp/debugfs/dlm/* ; do [ -e "$src" ] && dd if="$src" bs=1024M of="$out/${src##*/}" &> /dev/null ; done
+	for src in /tmp/debugfs/gfs2/* ; do [ -e "$src" ] && dd if="$src/glocks" bs=1024M of="$out/${src##*/}-glocks" &> /dev/null ; done
+}
+
+if [ "$QUIET" == false ] ; then echo -ne '[##     ]  Gathering GFS2 and DLM lock data: pass 1\t\t\t\t\t\t\t\t\r'  ; fi
+dump_locks run1
+
+#Enable sysrq before the trigger below is written
+echo 1 > /proc/sys/kernel/sysrq
+
+#Thread Dump: read the kernel log directly, which is much faster than waiting
+#for syslog to dump the thread dumps to the messages log. The output is stored
+#inside the gather directory so that it ends up in the tarball.
+if [ "$QUIET" == false ] ; then echo -ne '[###    ]  Gathering thread dumps.\t\t\t\t\t\t\t\t\r'  ; fi
+( cat /proc/kmsg > "/tmp/$(hostname)-${DATETIME}-gfshangdata/thread-dumps" &
+  echo 't' > /proc/sysrq-trigger ; sleep 10 ; kill $! )
+if [ "$QUIET" == false ] ; then echo -ne '[####   ]  Gathering GFS2 and DLM lock data: pass 2.\t\t\t\t\t\t\t\t\r' ; fi
+dump_locks run2
+
+if [ "$QUIET" == false ] ; then echo -ne '[#####  ]  Gathering messages logs\t\t\t\t\t\t\t\t\r' ; fi
+#Get the messages log file
+cp /var/log/messages "/tmp/$(hostname)-${DATETIME}-gfshangdata/"
+
+#Tar up the results and clean up temporary files
+if [ "$QUIET" == false ] ; then echo -ne '[###### ]   Cleaning up... 80%.\t\t\t\t\t\t\t\t\r' ; fi
+#The gathered data is only deleted once tar succeeded; on failure it is left in place
+tar cjf "/tmp/$(hostname)-${DATETIME}-gfshangdata.tar.bz" "/tmp/$(hostname)-${DATETIME}-gfshangdata/" &> /dev/null && rm -rf "/tmp/$(hostname)-${DATETIME}-gfshangdata"
+#rmdir (not rm -rf) so a failed umount can never recurse into a mounted debugfs
+umount /tmp/debugfs/ && rmdir /tmp/debugfs
+rm -f  /var/run/gfs2_lockgather.lock
+logger -t gfs2_lockgather "Gather completed. File is /tmp/$(hostname)-${DATETIME}-gfshangdata.tar.bz"
+if [ "$QUIET" == false ] ; then echo -ne "[#######]  Done. File is /tmp/$(hostname)-${DATETIME}-gfshangdata.tar.bz\r\t\t\t\t\t\t\t\t\r\n"  ; fi
+exit 0
diff --git a/gfs2/man/Makefile.am b/gfs2/man/Makefile.am
index 0f132d6..648ed84 100644
--- a/gfs2/man/Makefile.am
+++ b/gfs2/man/Makefile.am
@@ -9,5 +9,6 @@ dist_man_MANS		= fsck.gfs2.8 \
			  gfs2_quota.8 \
			  gfs2_tool.8 \
			  mkfs.gfs2.8 \
+			  gfs2_lockgather.8 \
			  mount.gfs2.8 \
			  tunegfs2.8
diff --git a/gfs2/man/gfs2_lockgather.8 b/gfs2/man/gfs2_lockgather.8
new file mode 100644
index 0000000..3cd8b9c
--- /dev/null
+++ b/gfs2/man/gfs2_lockgather.8
@@ -0,0 +1,26 @@
+.TH gfs2_lockgather 8
+
+.SH NAME
+gfs2_lockgather - Gathers data for diagnosing GFS2 locking issues
+
+.SH SYNOPSIS
+.B gfs2_lockgather
+[\fIOPTIONS\fR]
+
+.SH DESCRIPTION
+gfs2_lockgather will gather data that is useful for diagnosing performance and locking issues 
+involving GFS2 filesystems. The script gathers basic system and cluster data such as cluster status,
+mounted filesystems, the process list, kernel version, thread dumps from all processes, and 2 passes of glock and DLM locking data. After
+the data is gathered it is stored in a tarball under /tmp. The script can be invoked to gather
+data from a single node, or to gather data from all nodes via ssh.     
+.SH OPTIONS
+.TP
+\fB-h, --help, --info\fP
+Display help and usage information.
+.TP
+\fB-q, --quiet\fP
+Quiet mode. Run with output suppressed.
+.TP
+\fB-a, --allnodes\fP
+Gather data from all nodes via ssh.
+







More information about the Cluster-devel mailing list