#!/bin/bash
# lvcheck, version 1.1
# Maintainer: Andreas Dilger <adilger@sun.com>
# Maintainer: Bryan Kadzban <bryan.kadzban@is-a-geek.net>

# Other credits:
# Concept and original script by Theodore Tso <tytso@mit.edu>
# on_ac_power is mostly from Debian's powermgmt-base package
# Better XFS support from Eric Sandeen <sandeen@redhat.com>

# Released under the GNU General Public License, either version 2 or
# (at your option) any later version.

# Overview:
# Run this from cron periodically (e.g. once per week). If the machine
# is on AC power, it will run the checks; otherwise they will all be
# skipped. (If the script can't tell whether the machine is on AC power,
# it will use a setting in the configuration file (/etc/lvcheck.conf) to
# decide whether to continue with the checks, or abort.)
#
# The script will then decide which logical volumes are active, and can
# therefore be checked via an LVM snapshot. Each of these LVs will be
# queried to find its last-check day, and if that was more than $INTERVAL
# days ago (where INTERVAL is set in the configuration file as well), or
# if the last-check day can't be determined, then the script will take an
# LVM snapshot of that LV and run fsck on the snapshot. The snapshot will
# be set to use 1/500 the space of the source LV. After fsck finishes,
# the snapshot is destroyed. (Snapshots are checked serially.)
#
# Any LV that passes fsck should have its last-check time updated (in
# the real superblock, not the snapshot's superblock); any LV whose
# fsck fails will send an email notification to a configurable user
# ($EMAIL). This $EMAIL setting is optional, but its use is highly
# recommended, since if any LV fails, it will need to be checked
# manually, offline. Relevant messages are also sent to syslog.

# Set default values for configuration params. Changes to these values will
# be overwritten on an upgrade! To change these values, edit /etc/lvcheck.conf.
EMAIL='root'		# recipient of fsck failure reports (empty disables mail)
INTERVAL=30		# days between checks of the same LV
AC_UNKNOWN="CONTINUE"	# CONTINUE or ABORT when the power source is unknown
MINSNAP=256		# smallest snapshot to create, in megabytes
MINFREE=0		# megabytes to always leave free in the volume group

# "-n" as the first argument puts "echo" in front of every fsck command, so
# the commands are printed instead of run, and nothing is sent to syslog.
[ "$1" == "-n" ] && NOCHECK="echo"

# pull in configuration -- overwrite the defaults above if the file exists
[ -r /etc/lvcheck.conf ] && . /etc/lvcheck.conf

# Also look for a config file under the prefix the script was started from:
# strip a trailing /bin or /sbin component from the script's directory. The
# pattern is anchored so that only the last path component is removed, and
# the result is quoted so that prefixes containing whitespace still work.
CHECKPATH=$(dirname "$0" | sed -e 's:/s*bin$::')
[ -r "$CHECKPATH/etc/lvcheck.conf" ] && . "$CHECKPATH/etc/lvcheck.conf"

# send $2 to syslog, with severity $1
# severities are emerg/alert/crit/err/warning/notice/info/debug
#
# Messages of severity warning or higher are also copied to stderr (via
# logger -s). An unrecognised severity is reported on stderr, and the message
# is still handed to logger unchanged. When $NOCHECK is set, nothing is sent
# to syslog at all.
function log() {
	local sev="$1"
	local msg="$2"
	local arg=

	# log warning-or-higher messages to stderr as well
	case "$sev" in
	emerg|alert|crit|err|warning)
		arg=-s
		;;
	notice|info|debug)
		;;
	*)
		# a diagnostic about our own misuse belongs on stderr
		echo "error: unknown log severity '$sev'" >&2
		;;
	esac

	# $arg is left unquoted on purpose: when empty it must vanish entirely
	[ "$NOCHECK" ] || logger -t lvcheck $arg -p user."$sev" -- "$msg"
}

# determine whether the machine is on AC power
#
# Returns 0 when a non-battery power supply reports itself online, 1 when
# at least one supply state is known and none is online. When nothing can
# be determined, $AC_UNKNOWN decides: CONTINUE returns 0, ABORT returns 1,
# and any other value logs an error and exits the script with status 1.
function on_ac_power() {
	local any_known=no
	# keep the loop and scratch variables out of the global namespace
	local psu ac type online

	# try sysfs power class first
	if [ -d /sys/class/power_supply ]; then
		for psu in /sys/class/power_supply/*; do
			if [ -r "$psu/type" ]; then
				type=$(cat "$psu/type")

				# ignore batteries
				[ "$type" = "Battery" ] && continue

				# the "online" attribute may be absent; treat that as
				# unknown instead of letting cat complain on stderr
				online=$(cat "$psu/online" 2>/dev/null)

				[ "$online" = 1 ] && return 0
				[ "$online" = 0 ] && any_known=yes
			fi
		done

		[ "$any_known" = "yes" ] && return 1
	fi

	# else fall back to AC adapters in /proc
	if [ -d /proc/acpi/ac_adapter ]; then
		for ac in /proc/acpi/ac_adapter/*; do
			if [ -r "$ac/state" ]; then
				grep -q on-line "$ac/state" && return 0
				grep -q off-line "$ac/state" && any_known=yes
			elif [ -r "$ac/status" ]; then
				grep -q on-line "$ac/status" && return 0
				grep -q off-line "$ac/status" && any_known=yes
			fi
		done

		[ "$any_known" = "yes" ] && return 1
	fi

	# nothing told us either way: let the configuration decide
	if [ "$AC_UNKNOWN" == "CONTINUE" ]; then
		return 0	# assume on AC power
	elif [ "$AC_UNKNOWN" == "ABORT" ]; then
		return 1	# assume on battery
	else
		log err "Invalid value for AC_UNKNOWN in the config file"
		exit 1
	fi
}

# Try to make sure the filesystem on $1 (of type $2) gets checked at the
# next reboot. Only the ext2/3/4 family is handled: its mount count is set
# to 16000 with tune2fs (presumably far above any configured maximum).
# Other filesystem types only produce a log entry, or nothing for XFS.
function try_force_check() {
	local target="$1"
	local kind="$2"

	if [[ "$kind" == ext[234] ]]; then
		tune2fs -C 16000 "$target"
	elif [[ "$kind" == xfs ]]; then
		# XFS does not enforce check intervals; let email suffice.
		:
	else
		log warning "$target: don't know how to force a check on $kind."
	fi
}

# Record a successful check on the real device $1 (filesystem type $2):
# for the ext2/3/4 family, reset the mount count to 0 and stamp the
# last-check time with the current time. Other types are left untouched.
function try_delay_checks() {
	local target="$1"
	local kind="$2"

	if [[ "$kind" == ext[234] ]]; then
		tune2fs -C 0 -T now "$target"
	elif [[ "$kind" == xfs ]]; then
		# XFS does not enforce check intervals; nothing to delay
		:
	else
		log info "$target: don't know how to delay check on $kind."
	fi
}

# print the date that $1 was last checked, in a format that date(1) will
# accept, or "Unknown" if we don't know how to find that date.
#
# $1 is the device and $2 its filesystem type. "Unknown" is also printed
# when dumpe2fs fails or its output has no "Last checked:" line, so that
# the caller never receives an empty string.
function try_get_check_date() {
	local dev="$1"
	local fstype="$2"
	local stamp=

	case "$fstype" in
	ext2|ext3|ext4)
		# keep only the value that follows the "Last checked:" label
		stamp=$(dumpe2fs -h "$dev" 2>/dev/null |
			sed -n -e 's/Last checked:[[:space:]]*//p')
		;;
	*)
		# XFS does not save the last-checked date
		# TODO: add support for various other FSes
		;;
	esac

	echo "${stamp:-Unknown}"
}

# do any extra checks for filesystem type $2, on device $1
#
# Returns 0 when the LV should be snapshotted and checked, 1 when it must
# be skipped (external journal, journal device, swap, or a filesystem type
# that perform_check has no checker for).
function should_still_check() {
	local dev="$1"
	local fstype="$2"

	case "$fstype" in
	ext2|ext3|ext4)
		if tune2fs -l "$dev" | grep -q "Journal device"; then
			log warning "skip $dev: using external journal."
			return 1
		fi
		;;
	reiserfs|xfs|jfs)
		# perform_check has a checker for these; no extra conditions
		;;
	jbd*)
		log debug "skip $dev: using external journal."
		return 1
		;;
	swap)
		log debug "skip $dev: is a swap device."
		return 1
		;;
	*)
		# no checker available: really skip, as the message says, rather
		# than snapshotting the LV only to run no check on it
		log warning "skip $dev: can't check $fstype passively: assuming OK."
		return 1
		;;
	esac

	return 0
}

# Run a read-only style check of the filesystem on snapshot device $1
# (filesystem type $2), appending the checker output to log file $3 through
# logsave. Every checker is started under nice, and $NOCHECK, when set, is
# placed in front of the whole command line. Returns the status of the
# checker pipeline, except for reiserfs, which always returns 0; types with
# no checker return 0 without running anything.
function perform_check() {
	local snapdev="$1"
	local kind="$2"
	local logfile="$3"

	# command prefix shared by every checker invocation; $NOCHECK stays
	# unquoted so that it disappears when empty
	local -a run=($NOCHECK nice logsave -as "$logfile")

	if [[ "$kind" == ext[234] ]]; then
		# Pass 1: preen, so the orphaned-inode list is cleared before
		# the real check; otherwise that cleanup would count as an
		# "error" exit in pass 2. -C 0 is kept for interactive runs
		# (logsave -s strips the progress bar). Status is ignored.
		"${run[@]}" e2fsck -p -C 0 "$snapdev"

		# Pass 2: the real, forced check. -y puts more detail about
		# any errors into the log; the snapshot is writable, so
		# changes made by e2fsck shouldn't break anything.
		"${run[@]}" e2fsck -fy -C 0 "$snapdev"
		return $?
	elif [[ "$kind" == reiserfs ]]; then
		echo Yes | "${run[@]}" fsck.reiserfs --check "$snapdev"
		# apparently can't fail? let's hope not...
		return 0
	elif [[ "$kind" == xfs ]]; then
		"${run[@]}" xfs_repair -n "$snapdev"
		return $?
	elif [[ "$kind" == jfs ]]; then
		"${run[@]}" fsck.jfs -fn "$snapdev"
		return $?
	fi
}

# do everything needed to check and reset dates and counters on /dev/$1/$2.
#
#   $1 - volume group name
#   $2 - logical volume name
#   $3 - filesystem type
#   $4 - snapshot size, in units of MB
#
# Reads the globals INTERVAL, EMAIL and NOCHECK. Returns 0 when the LV is
# skipped, 1 when a stale snapshot cannot be removed or the new snapshot
# cannot be created, and otherwise the status of the final lvremove.
function check_fs() {
	local vg="$1"
	local lv="$2"
	local fstype="$3"
	local snapsize="$4" # in units of MB

	local lvdev="/dev/$vg/$lv"
	local errlog="/var/log/lvcheck/$vg-$lv-$(date +%Y%m%d)"
	local snaplvbase="$lv-lvcheck-temp"
	local snaplv="$snaplvbase-$(date +'%Y%m%d')"
	local lvtemp check_date check_day today

	# clean up any left-over snapshot LVs
	for lvtemp in "/dev/$vg/$snaplvbase"*; do
		# an unmatched glob stays literal, hence the existence test
		if [ -e "$lvtemp" ]; then
			# Assume script won't run more than one at a time?
			log warning "stale $lvtemp: trying to remove old snapshot."

			if ! lvremove -f "$lvtemp"; then
				log err "error $lvtemp: could not delete."
				return 1
			fi
		fi
	done

	# see whether FS needs any extra checks that might disqualify it
	should_still_check "$lvdev" "$fstype" || return 0

	# get the last check time
	check_date=$(try_get_check_date "$lvdev" "$fstype")

	# if the date is unknown, run fsck every time the script runs. sigh.
	# An empty date is treated the same way: passing it to date(1) would
	# yield "today + INTERVAL" and the LV would never be checked.
	if [ -n "$check_date" ] && [ "$check_date" != "Unknown" ]; then
		# add $INTERVAL days, and throw away the time portion
		check_day=$(date --date="$check_date $INTERVAL days" +'%Y%m%d')

		# get today's date, and skip the check if it's not within the interval
		# (an unparsable date leaves check_day empty: check anyway)
		today=$(date +'%Y%m%d')
		if [ -n "$check_day" ] && [ "$check_day" -gt "$today" ]; then
			log debug "skip $lvdev: just checked on $check_date."
			return 0
		fi
	fi

	# create new snapshot LV; without it there is nothing to check, and a
	# failed fsck on a missing device must not be blamed on the real LV
	if ! lvcreate -s -L "${snapsize}M" -n "$snaplv" "$vg/$lv"; then
		log err "error $lvdev: could not create snapshot $snaplv."
		return 1
	fi

	if perform_check "/dev/$vg/$snaplv" "$fstype" "$errlog"; then
		log info "$lvdev: Background check succeeded."
		[ -z "$NOCHECK" ] && try_delay_checks "$lvdev" "$fstype"
	else
		log err "error $lvdev: Background check failed! Run offline!"
		[ -z "$NOCHECK" ] && try_force_check "$lvdev" "$fstype"

		if [ -n "$EMAIL" ] && [ -z "$NOCHECK" ]; then
			# $EMAIL is left unquoted so it may list several recipients
			mail -s "Fsck $lvdev failed" $EMAIL < "$errlog"
		fi
	fi

	lvremove -f "$vg/$snaplv"
}

# check whether the machine is on AC power: if not, skip fsck
on_ac_power || exit 0

# parse up lvscan output. A line is expected to look like
#   ACTIVE   [Original|Snapshot] '/dev/VG/LV' [size] inherit
# so the device is taken from between the single quotes: a fixed field
# number would yield "Original" for any LV that already has a snapshot.
# Snapshot LVs themselves are left out.
lvscan 2>&1 |
awk -F"'" '$1 ~ /ACTIVE/ && $1 !~ /Snapshot/ && $2 != "" { print $2 }' |
while read -r DEV; do
	if [ ! -b "$DEV" ]; then
		if [ ! -e "$DEV" ]; then
			log info "skip $DEV: no longer exists."
		else
			log info "skip $DEV: not a block device."
		fi
		continue
	fi

	# get the FS type: have blkid print the bare TYPE value
	FSTYPE=$(blkid -s TYPE -o value "$DEV")
	if [ -z "$FSTYPE" ]; then
		log info "skip $DEV: can't determine device type."
		continue
	fi

	# get the volume group and logical volume names (lvs pads its output)
	VG=$(lvs --noheadings -o vg_name "$DEV" | tr -d '[:space:]')
	LV=$(lvs --noheadings -o lv_name "$DEV" | tr -d '[:space:]')

	# get the free space and LV size (in megs), guess at the snapshot size,
	# and see how much the admin will let us use (keeping MINFREE available).
	# Lower-case "m" asks lvs for base-2 units, matching how lvcreate -L
	# treats its size argument; upper-case "M" would report SI megabytes.
	SPACE=$(lvs --noheadings --nosuffix --units m -o vg_free "$DEV" |
		tr -d '[:space:]<')
	SIZE=$(lvs --noheadings --nosuffix --units m -o lv_size "$DEV" |
		tr -d '[:space:]<')

	# drop the fractional part, whichever radix character the locale uses
	SPACE=${SPACE%%[.,]*}
	SIZE=${SIZE%%[.,]*}

	# anything non-numeric would break the arithmetic below and leave
	# values from the previous LV in place
	if ! [[ "$SPACE" =~ ^[0-9]+$ && "$SIZE" =~ ^[0-9]+$ ]]; then
		log warning "skip $DEV: can't determine LV size or free space."
		continue
	fi

	SNAPSIZE=$((SIZE / 500))
	AVAIL=$((SPACE - MINFREE))

	# if we don't even have MINSNAP space available, skip the LV
	if [ "$MINSNAP" -gt "$AVAIL" ] || [ "$AVAIL" -le 0 ]; then
		log warning "skip $DEV: need ${MINSNAP}M free space in volume group."
		continue
	fi

	# make snapshot large enough to handle e.g. journal and other updates
	[ "$SNAPSIZE" -lt "$MINSNAP" ] && SNAPSIZE="$MINSNAP"

	# limit snapshot to available space (VG space minus min-free)
	[ "$SNAPSIZE" -gt "$AVAIL" ] && SNAPSIZE="$AVAIL"

	# don't need to check SNAPSIZE again: MINSNAP <= AVAIL, MINSNAP <= SNAPSIZE,
	# and SNAPSIZE <= AVAIL, combined, means SNAPSIZE must be between MINSNAP
	# and AVAIL, which is what we need -- assuming AVAIL > 0

	check_fs "$VG" "$LV" "$FSTYPE" "$SNAPSIZE"
done