[Cluster-devel] cluster/rgmanager/src/resources lvm.sh

jbrassow at sourceware.org
Wed Apr 18 18:14:56 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	jbrassow at sourceware.org	2007-04-18 19:14:56

Modified files:
	rgmanager/src/resources: lvm.sh 

Log message:
	Bug 236580: [HA LVM]: Bringing site back on-line after failure causes pr...
	
	Setup:
	- 2 interconnected sites
	- each site has a disk and a machine
	- LVM mirroring is used to mirror the disks from the sites
	
	When one site fails, the LVM service happily moves over to the
	second site - removing the disk that belonged to the failed site
	from the VG.  However, when the failed site is restored and the
	service attempts to move back to the original machine, it fails
	because of conflicts in the LVM metadata on the disks.
	
	This fix allows the LV to be reactivated on the original node
	by filtering out the devices that have stale metadata (i.e.
	the device that was removed during the failure).
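
For illustration, the effect of the new filter-based activation is roughly
the following (a minimal sketch; the VG/LV name vg0/ha_lv is hypothetical
and error handling is omitted):

    # Collect the PVs that LVM still considers valid.  A device that died
    # and came back (stale metadata) no longer shows up in this list.
    valid_pvs=( $(pvs --noheadings -o pv_name | grep -v Warning) )

    # Build a device filter that accepts only those PVs and rejects
    # everything else, then reactivate the LV with that filter in place.
    filter=""
    for pv in "${valid_pvs[@]}"; do
        filter="${filter}\"a|${pv}|\","
    done
    lvchange -ay --config "devices{filter=[${filter}\"r|.*|\"]}" vg0/ha_lv

    # Finish with vgscan to reset LVM's cache/filter.
    vgscan

Because the filter only accepts devices that pvs still reports as valid,
the device that came back with stale metadata is never read, so activation
no longer fails on the metadata conflict.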

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/lvm.sh.diff?cvsroot=cluster&r1=1.4&r2=1.5

--- cluster/rgmanager/src/resources/lvm.sh	2007/04/05 15:08:20	1.4
+++ cluster/rgmanager/src/resources/lvm.sh	2007/04/18 18:14:56	1.5
@@ -149,6 +149,78 @@
 	return $OCF_ERR_GENERIC
 }
 
+# lvm_exec_resilient
+#
+# Sometimes, devices can come back.  Their metadata will conflict
+# with the good devices that remain.  This function filters out those
+# failed devices when executing the given command
+#
+# Finishing with vgscan resets the cache/filter
+lvm_exec_resilient()
+{
+	declare command=$1
+	declare all_pvs
+
+	ocf_log notice "Making resilient : $command"
+
+	if [ -z $command ]; then
+		ocf_log err "lvm_exec_resilient: Arguments not supplied"
+		return $OCF_ERR_ARGS
+	fi
+
+	# pvs will print out only those devices that are valid
+	# If a device dies and comes back, it will not appear
+	# in pvs output (but you will get a Warning).
+	all_pvs=(`pvs --noheadings -o pv_name | grep -v Warning`)
+
+	# Now we use those valid devices in a filter which we set up.
+	# The device will then be activated because there are no
+	# metadata conflicts.
+        command=$command" --config devices{filter=[";
+	for i in ${all_pvs[*]}; do
+		command=$command'"a|'$i'|",'
+	done
+	command=$command"\"r|.*|\"]}"
+
+	ocf_log notice "Resilient command: $command"
+	if ! $command ; then
+		ocf_log err "lvm_exec_resilient failed"
+		vgscan
+		return $OCF_ERR_GENERIC
+	else
+		vgscan
+		return $OCF_SUCCESS
+	fi
+}
+
+# lv_activate_resilient
+#
+# Sometimes, devices can come back.  Their metadata will conflict
+# with the good devices that remain.  We must filter out those
+# failed devices when trying to reactivate
+lv_activate_resilient()
+{
+	declare action=$1
+	declare lv_path=$2
+	declare op="-ay"
+
+	if [ -z $action ] || [ -z $lv_path ]; then
+		ocf_log err "lv_activate_resilient: Arguments not supplied"
+		return $OCF_ERR_ARGS
+	fi
+
+	if [ $action != "start" ]; then
+	        op="-an"
+	fi
+
+	if ! lvm_exec_resilient "lvchange $op $lv_path" ; then
+		ocf_log err "lv_activate_resilient $action failed on $lv_path"
+		return $OCF_ERR_GENERIC
+	else
+		return $OCF_SUCCESS
+	fi
+}
+
 # lv_status
 #
 # Is the LV active?
@@ -203,7 +275,7 @@
 		ocf_log err "WARNING: $my_name does not own $lv_path"
 		ocf_log err "WARNING: Attempting shutdown of $lv_path"
 
-		lvchange -an $lv_path
+		lv_activate_resilient "stop" $lv_path
 		return $OCF_ERR_GENERIC
 	fi
 
@@ -229,15 +301,14 @@
 			ocf_log err "Unable to add tag to $lv_path"
 			return $OCF_ERR_GENERIC
 		fi
-		lvchange -ay $lv_path
-		if [ $? -ne 0 ]; then
+
+		if ! lv_activate_resilient $action $lv_path; then
 			ocf_log err "Unable to activate $lv_path"
 			return $OCF_ERR_GENERIC
 		fi
 	else
 		ocf_log notice "Deactivating $lv_path"
-		lvchange -an $lv_path
-		if [ $? -ne 0 ]; then
+		if ! lv_activate_resilient $action $lv_path; then
 			ocf_log err "Unable to deactivate $lv_path"
 			return $OCF_ERR_GENERIC
 		fi



