#!/bin/sh
#
# lvcheck
# Released under the GNU General Public License, either version 2 or
# (at your option) any later version.

# Overview:
#
# Run this from cron periodically (e.g. once per week). If the
# machine is on AC power, it will run the checks; otherwise they will
# all be skipped. (If the script can't tell whether the machine is
# on AC power, it will use a setting in the configuration file
# (/etc/lvcheck.conf) to decide whether to continue with the checks,
# or abort.)
#
# The script will then decide which logical volumes are active, and
# can therefore be checked via an LVM snapshot. Each of these LVs
# will be queried to find its last-check day, and if that was more
# than $INTERVAL days ago (where INTERVAL is set in the configuration
# file as well), or if the last-check day can't be determined, then
# the script will take an LVM snapshot of that LV and run fsck on the
# snapshot. The snapshot will be set to use 1/500 the space of the
# source LV. After fsck finishes, the snapshot is destroyed.
# (Snapshots are checked serially.)
#
# Any LV that passes fsck should have its last-check time updated (in
# the real superblock, not the snapshot's superblock); any LV whose
# fsck fails will send an email notification to a configurable user
# ($EMAIL). This $EMAIL setting is optional, but its use is highly
# recommended, since if any LV fails, it will need to be checked
# manually, offline. Relevant messages are also sent to syslog.

# Set default values for configuration params. Changes to these values
# will be overwritten on an upgrade! To change these values, use
# /etc/lvcheck.conf.
EMAIL='root'            # recipient(s) of failure reports; empty disables mail
INTERVAL=30             # minimum number of days between checks of one LV
AC_UNKNOWN="CONTINUE"   # CONTINUE or ABORT when the power source is unknown

# send $2 to syslog, with severity $1
# severities are emerg/alert/crit/err/warning/notice/info/debug
log() {
    local sev="$1"
    local msg="$2"
    local arg=

    # log warning-or-higher messages to stderr as well
    case "$sev" in
        emerg|alert|crit|err|warning) arg=-s ;;
    esac

    # $arg is deliberately unquoted so that it disappears when empty
    logger $arg -p "user.${sev}" -- "$msg"
}

# determine whether the machine is on AC power
# returns 0 if on AC (or assumed so), 1 if on battery (or assumed so);
# exits the script if the state is unknown and AC_UNKNOWN is invalid.
on_ac_power() {
    local any_known=no
    local psu ac psu_type online statefile

    # try sysfs power class first
    if [ -d /sys/class/power_supply ] ; then
        for psu in /sys/class/power_supply/* ; do
            [ -r "${psu}/type" ] || continue
            psu_type=$(cat "${psu}/type")

            # ignore batteries
            [ "$psu_type" = "Battery" ] && continue

            # a supply without a readable "online" attribute tells us nothing
            [ -r "${psu}/online" ] || continue
            online=$(cat "${psu}/online")

            [ "$online" = 1 ] && return 0
            [ "$online" = 0 ] && any_known=yes
        done

        [ "$any_known" = "yes" ] && return 1
    fi

    # else fall back to AC adapters in /proc
    if [ -d /proc/acpi/ac_adapter ] ; then
        for ac in /proc/acpi/ac_adapter/* ; do
            if [ -r "${ac}/state" ] ; then
                statefile="${ac}/state"
            elif [ -r "${ac}/status" ] ; then
                statefile="${ac}/status"
            else
                continue
            fi

            grep -q on-line "$statefile" && return 0
            grep -q off-line "$statefile" && any_known=yes
        done

        [ "$any_known" = "yes" ] && return 1
    fi

    # nothing conclusive: let the configuration decide
    case "$AC_UNKNOWN" in
        CONTINUE)
            return 0    # assume on AC power
            ;;
        ABORT)
            return 1    # assume on battery
            ;;
        *)
            log "err" "Invalid value for AC_UNKNOWN in the config file"
            exit 1
            ;;
    esac
}

# attempt to force a check of $1 (filesystem type $2) on the next reboot
try_force_check() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
        ext2|ext3|ext4)
            # huge mount count + ancient last-check date trip both triggers
            tune2fs -C 16000 -T "19000101" "$dev"
            ;;
        *)
            log "warning" "Don't know how to force a check on $fstype..."
            ;;
    esac
}

# attempt to set the last-check time on $1 to now, and the mount count to 0.
# $2 is the filesystem type.
try_delay_checks() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
        ext2|ext3|ext4)
            tune2fs -C 0 -T now "$dev"
            ;;
        reiserfs)
            # do nothing?
            ;;
        *)
            log "warning" "Don't know how to delay checks on $fstype..."
            ;;
    esac
}

# print the date that $1 (filesystem type $2) was last checked, in a format
# that date(1) will accept, or "Unknown" if we don't know how to find that
# date or the lookup produced nothing.
try_get_check_date() {
    local dev="$1"
    local fstype="$2"
    local checked=

    case "$fstype" in
        ext2|ext3|ext4)
            checked=$(dumpe2fs -h "$dev" 2>/dev/null | \
                sed -n -e 's/Last checked:[[:space:]]*//p')
            ;;
        *)
            # TODO: add support for various FSes here
            ;;
    esac

    # an empty answer must not be mistaken for a date: date(1) would read
    # "" + N days as "N days from now" and the check would be skipped.
    printf '%s\n' "${checked:-Unknown}"
}

# check the FS on $1 (filesystem type $2) passively, saving output to $3.
# returns the checker's exit status, or 0 when no checker is known.
perform_check() {
    local dev="$1"
    local fstype="$2"
    local tmpfile="$3"

    case "$fstype" in
        ext2|ext3|ext4)
            nice logsave -as "${tmpfile}" e2fsck -fn "$dev"
            return $?
            ;;
        reiserfs)
            echo Yes | nice logsave -as "${tmpfile}" fsck.reiserfs --check "$dev"
            # apparently can't fail?  let's hope not...
            return 0
            ;;
        xfs)
            nice logsave -as "${tmpfile}" xfs_check "$dev"
            return $?
            ;;
        jfs)
            nice logsave -as "${tmpfile}" fsck.jfs -fn "$dev"
            return $?
            ;;
        *)
            log "warning" "Don't know how to check $fstype filesystems passively: assuming OK."
            # be explicit: "assuming OK" must not depend on logger's status
            return 0
            ;;
    esac
}

# do everything needed to check and reset dates and counters on /dev/$1/$2.
# $3 is the filesystem type, $4 the snapshot size in megabytes.
# returns non-zero without touching the LV if no snapshot could be made.
check_fs() {
    local vg="$1"
    local lv="$2"
    local fstype="$3"
    local snapsize="$4"
    local today tmpfile errlog snaplvbase snaplv lvtemp

    today=$(date +'%Y%m%d')
    errlog="/var/log/lvcheck-${vg}@${lv}-${today}"
    snaplvbase="${lv}-lvcheck-temp"
    snaplv="${snaplvbase}-${today}"

    if ! tmpfile=$(mktemp -t e2fsck.log.XXXXXXXXXX) ; then
        log "err" "Can't create a temporary log file: skipping /dev/${vg}/${lv}."
        return 1
    fi

    # clean up any left-over snapshot LVs
    for lvtemp in "/dev/${vg}/${snaplvbase}"* ; do
        if [ -e "$lvtemp" ] ; then
            # Assume the script won't run more than one instance at a time?
            log "warning" "Found stale snapshot $lvtemp: deleting."
            lvremove -f "${vg}/${lvtemp##*/}"
        fi
    done

    # and create this one.  -L takes a size; the caller computed megabytes.
    if ! lvcreate -s -L "${snapsize}M" -n "${snaplv}" "${vg}/${lv}" ; then
        # without a snapshot there is nothing to fsck; do NOT report this
        # as a filesystem failure or force a check on the real LV.
        log "err" "Can't create snapshot ${vg}/${snaplv}: skipping check of /dev/${vg}/${lv}."
        rm -f "$tmpfile"
        return 1
    fi

    if perform_check "/dev/${vg}/${snaplv}" "${fstype}" "${tmpfile}" ; then
        log "info" "Background scrubbing of /dev/${vg}/${lv} succeeded."
        try_delay_checks "/dev/${vg}/${lv}" "$fstype"
    else
        log "err" "Background scrubbing of /dev/${vg}/${lv} failed: run fsck offline soon!"
        try_force_check "/dev/${vg}/${lv}" "$fstype"

        if [ -n "$EMAIL" ] ; then
            # $EMAIL is deliberately unquoted: it may list several recipients
            mail -s "Fsck of /dev/${vg}/${lv} failed!" $EMAIL < "$tmpfile"
        fi

        # save the log file in /var/log in case mail is disabled
        mv "$tmpfile" "$errlog"
    fi

    rm -f "$tmpfile"
    lvremove -f "${vg}/${snaplv}"
}

# filter: reduce one lvs numeric field (e.g. "   1234.56") to its integer part
to_whole_number() {
    sed -e 's/^[[:space:]]*//' -e 's/[.,].*$//' -e 's/[[:space:]]*$//'
}

# filter: read lvscan output, print the device path of every ACTIVE LV that
# is not itself a snapshot.  The path is the single-quoted field, which
# stays in the same place whether or not a status word such as "Original"
# precedes it.
parse_active_lvs() {
    awk -F"'" '$1 ~ /ACTIVE/ && $1 !~ /Snapshot/ { print $2 }'
}

# check the LV at device path $1 if its last check is more than $INTERVAL
# days old (or unknown) and the VG has room for a snapshot.
check_lv_if_due() {
    local dev="$1"
    local fstype check_date check_day today space size snapsize vg lv

    # ask for the bare value: no eval, and no stale type from a previous LV
    fstype=$(blkid -s TYPE -o value "$dev" 2>/dev/null)

    # get the last-check time
    check_date=$(try_get_check_date "$dev" "$fstype")

    # if the date is unknown, run fsck every time the script runs.  sigh.
    if [ "$check_date" != "Unknown" ] ; then
        # add $INTERVAL days, and throw away the time portion; if date(1)
        # can't parse the value, treat it like an unknown date.
        if check_day=$(date --date="$check_date $INTERVAL days" +'%Y%m%d' 2>/dev/null) ; then
            # get today's date, and skip the check if it's not within the interval
            today=$(date +'%Y%m%d')
            [ "$check_day" -gt "$today" ] && return 0
        fi
    fi

    # get the free space and LV size (in whole megs)
    space=$(lvs --noheadings --units M --nosuffix -o vg_free "$dev" | to_whole_number)
    size=$(lvs --noheadings --units M --nosuffix -o lv_size "$dev" | to_whole_number)

    case "$space" in
        ''|*[!0-9]*) space= ;;
    esac
    case "$size" in
        ''|*[!0-9]*) size= ;;
    esac
    if [ -z "$space" ] || [ -z "$size" ] ; then
        log "err" "Can't determine the size of $dev or the free space in its VG."
        return 0
    fi

    # 1/500 of the LV, but never a zero-sized snapshot
    snapsize=$((size / 500))
    [ "$snapsize" -lt 1 ] && snapsize=1

    if [ "$snapsize" -gt "$space" ] ; then
        log "err" "Can't take a snapshot of $dev: not enough free space in the VG."
        return 0
    fi

    # get the volume group and logical volume names (lvs pads its output)
    vg=$(lvs --noheadings -o vg_name "$dev" | tr -d '[:space:]')
    lv=$(lvs --noheadings -o lv_name "$dev" | tr -d '[:space:]')

    # check it
    check_fs "$vg" "$lv" "$fstype" "$snapsize"
    return 0
}

# pull in configuration -- overwrite the defaults above if the file exists
[ -r /etc/lvcheck.conf ] && . /etc/lvcheck.conf

# check whether the machine is on AC power: if not, skip fsck
on_ac_power || exit 0

# parse up lvscan output and visit each active LV in turn
lvscan 2>&1 | parse_active_lvs |
while read -r DEV ; do
    [ -n "$DEV" ] || continue
    # keep the tools run per LV from eating the rest of the device list
    check_lv_if_due "$DEV" < /dev/null
done