#!/bin/sh
#
# lvcheck, version 1.0
# Maintainer: Bryan Kadzban
# Other credits:
#  Concept and original script by Theodore Tso
#  on_ac_power is mostly from Debian's powermgmt-base package
#  Lots of help (ideas, initial XFS/JFS support, etc.) from
#   Andreas Dilger
#  Better XFS support from Eric Sandeen
# Released under the GNU General Public License, either version 2 or
# (at your option) any later version.

# Overview:
#
# Run this from cron periodically (e.g. once per week). If the
# machine is on AC power, it will run the checks; otherwise they will
# all be skipped. (If the script can't tell whether the machine is
# on AC power, it will use a setting in the configuration file
# (/etc/lvcheck.conf) to decide whether to continue with the checks,
# or abort.)
#
# The script will then decide which logical volumes are active, and
# can therefore be checked via an LVM snapshot. Each of these LVs
# will be queried to find its last-check day, and if that was more
# than $INTERVAL days ago (where INTERVAL is set in the configuration
# file as well), or if the last-check day can't be determined, then
# the script will take an LVM snapshot of that LV and run fsck on the
# snapshot. The snapshot will be set to use 1/500 the space of the
# source LV. After fsck finishes, the snapshot is destroyed.
# (Snapshots are checked serially.)
#
# Any LV that passes fsck should have its last-check time updated (in
# the real superblock, not the snapshot's superblock); any LV whose
# fsck fails will send an email notification to a configurable user
# ($EMAIL). This $EMAIL setting is optional, but its use is highly
# recommended, since if any LV fails, it will need to be checked
# manually, offline. Relevant messages are also sent to syslog.

# Set default values for configuration params. Changes to these values
# will be overwritten on an upgrade! To change these values, use
# /etc/lvcheck.conf.
EMAIL='root'
INTERVAL=30
AC_UNKNOWN="CONTINUE"
MINSNAP=256
MINFREE=0

# Send a message to syslog through logger(1), tagged "lvcheck".
#   $1 - severity: emerg/alert/crit/err/warning/notice/info/debug
#   $2 - message text
# Messages of severity warning or higher are copied to stderr as well
# (logger -s). The deprecated logger spellings panic/error/warn are
# treated like emerg/err/warning.
# Returns the exit status of logger.
log() {
  local sev="$1"
  local msg="$2"

  case "$sev" in
    emerg|panic|alert|crit|err|error|warning|warn)
      logger -t lvcheck -s -p "user.${sev}" -- "$msg"
      ;;
    *)
      logger -t lvcheck -p "user.${sev}" -- "$msg"
      ;;
  esac
}

# Determine whether the machine is on AC power.
# Reads the global AC_UNKNOWN when no power source can be identified.
# Returns 0 when on AC power (or when unknown and AC_UNKNOWN=CONTINUE),
# 1 when on battery (or when unknown and AC_UNKNOWN=ABORT).
# Exits the script with status 1 if AC_UNKNOWN holds any other value.
on_ac_power() {
  local any_known=no
  local psu psu_type online ac

  # try sysfs power class first
  if [ -d /sys/class/power_supply ]; then
    for psu in /sys/class/power_supply/*; do
      [ -r "${psu}/type" ] || continue
      psu_type="$(cat "${psu}/type")"
      # ignore batteries
      [ "$psu_type" = "Battery" ] && continue
      # a supply without a readable "online" attribute tells us nothing
      [ -r "${psu}/online" ] || continue
      online="$(cat "${psu}/online")"
      [ "$online" = 1 ] && return 0
      [ "$online" = 0 ] && any_known=yes
    done
    [ "$any_known" = "yes" ] && return 1
  fi

  # else fall back to AC adapters in /proc
  if [ -d /proc/acpi/ac_adapter ]; then
    for ac in /proc/acpi/ac_adapter/*; do
      if [ -r "${ac}/state" ]; then
        grep -q on-line "${ac}/state" && return 0
        grep -q off-line "${ac}/state" && any_known=yes
      elif [ -r "${ac}/status" ]; then
        grep -q on-line "${ac}/status" && return 0
        grep -q off-line "${ac}/status" && any_known=yes
      fi
    done
    [ "$any_known" = "yes" ] && return 1
  fi

  # nothing conclusive: let the configuration decide
  case "$AC_UNKNOWN" in
    CONTINUE)
      return 0 # assume on AC power
      ;;
    ABORT)
      return 1 # assume on battery
      ;;
    *)
      log "err" "Invalid value for AC_UNKNOWN in the config file"
      exit 1
      ;;
  esac
}

# Attempt to force a check of a filesystem on the next reboot.
#   $1 - device holding the filesystem
#   $2 - filesystem type
# For ext2/ext3/ext4 the mount count is set to 16000 with tune2fs;
# other types only produce a warning (XFS is silently accepted).
try_force_check() {
  local dev="$1"
  local fstype="$2"

  case "$fstype" in
    ext2|ext3|ext4)
      tune2fs -C 16000 "$dev"
      ;;
    xfs)
      # XFS does not enforce check intervals; let email suffice.
      ;;
    *)
      log "warning" "Don't know how to force a check on $fstype..."
      ;;
  esac
}

# attempt to set the last-check time on $1 to now, and the mount count to 0.
# try_delay_checks: mark a filesystem as freshly checked.
#   $1 - device holding the real (non-snapshot) filesystem
#   $2 - filesystem type
# For ext2/ext3/ext4 the mount count is reset to 0 and the last-check
# time set to now with tune2fs; other types only produce a warning
# (XFS is silently accepted).
try_delay_checks() {
  local dev="$1"
  local fstype="$2"

  case "$fstype" in
    ext2|ext3|ext4)
      tune2fs -C 0 -T now "$dev"
      ;;
    xfs)
      # XFS does not enforce check intervals; nothing to delay
      ;;
    *)
      log "warning" "Don't know how to delay checks on $fstype..."
      ;;
  esac
}

# Print the date a filesystem was last checked, in a format that date(1)
# will accept, or "Unknown" if that date cannot be found.
#   $1 - device holding the filesystem
#   $2 - filesystem type
# "Unknown" is also printed when dumpe2fs fails or reports no
# "Last checked:" line, so that callers never see an empty date.
try_get_check_date() {
  local dev="$1"
  local fstype="$2"
  local checked=

  case "$fstype" in
    ext2|ext3|ext4)
      checked="$(dumpe2fs -h "$dev" 2>/dev/null \
        | sed -n -e 's/^Last checked:[[:space:]]*//p')"
      ;;
    *)
      # XFS does not save the last-checked date
      # TODO: add support for various other FSes
      ;;
  esac

  if [ -n "$checked" ]; then
    printf '%s\n' "$checked"
  else
    echo "Unknown"
  fi
}

# Check a filesystem passively, appending the checker's output to a log.
#   $1 - device to check (expected to be a writable snapshot)
#   $2 - filesystem type
#   $3 - file that receives the output (via logsave -as)
# Returns the checker's exit status for ext2/ext3/ext4, xfs and jfs;
# always 0 for reiserfs and for filesystem types with no known checker.
perform_check() {
  local dev="$1"
  local fstype="$2"
  local tmpfile="$3"

  case "$fstype" in
    ext2|ext3|ext4)
      # first clear the orphaned-inode list, to avoid unnecessary FS changes
      # in the next step (which would cause an "error" exit from e2fsck).
      # -C 0 is present for cases where the script is run interactively
      # (logsave -s strips out the progress bar). ignore the return status
      # of this e2fsck, as it doesn't matter.
      nice logsave -as "${tmpfile}" e2fsck -p -C 0 "$dev"

      # then do the real check; -y is here to give more info on any errors
      # that may be present on the FS, in the log file. the snapshot is
      # writable, so it shouldn't break anything if e2fsck changes it.
      nice logsave -as "${tmpfile}" e2fsck -fy -C 0 "$dev"
      return $?
      ;;
    reiserfs)
      echo Yes | nice logsave -as "${tmpfile}" fsck.reiserfs --check "$dev"
      # apparently can't fail? let's hope not...
      return 0
      ;;
    xfs)
      nice logsave -as "${tmpfile}" xfs_repair -n "$dev"
      return $?
      ;;
    jfs)
      nice logsave -as "${tmpfile}" fsck.jfs -fn "$dev"
      return $?
      ;;
    *)
      log "warning" "Don't know how to check $fstype filesystems passively: assuming OK."
      # "assuming OK" must not depend on whether logging itself succeeded
      return 0
      ;;
  esac
}

# do everything needed to check and reset dates and counters on /dev/$1/$2.
# check_fs: snapshot one logical volume, check the snapshot, then update
# the real filesystem's counters (or raise the alarm) and drop the snapshot.
#   $1 - volume group name
#   $2 - logical volume name
#   $3 - filesystem type
#   $4 - snapshot size, in megabytes
# Returns 1 if a stale snapshot cannot be removed, or if the temporary log
# or the snapshot cannot be created; otherwise the exit status of the
# final lvremove.
check_fs() {
  local vg="$1"
  local lv="$2"
  local fstype="$3"
  local snapsize="$4"
  local errlog="/var/log/lvcheck-${vg}@${lv}"
  local snaplvbase="${lv}-lvcheck-temp"
  local snaplv tmpfile lvtemp

  snaplv="${snaplvbase}-$(date +'%Y%m%d')"

  # clean up any left-over snapshot LVs
  for lvtemp in "/dev/${vg}/${snaplvbase}"*; do
    [ -e "$lvtemp" ] || continue
    # Assume the script won't run more than one instance at a time?
    log "warning" "Found stale snapshot $lvtemp: attempting to remove."
    # hand lvremove the canonical vg/lv form
    if ! lvremove -f "${lvtemp#/dev/}"; then
      log "err" "Could not delete stale snapshot $lvtemp"
      return 1
    fi
  done

  # the log file is created only once it is needed, so that the early
  # return above cannot leave it behind
  if ! tmpfile="$(mktemp -t lvcheck.log.XXXXXXXXXX)"; then
    log "err" "Could not create a temporary log file for /dev/${vg}/${lv}"
    return 1
  fi

  # and create this one; the size is in megabytes, so it must go through
  # -L with a unit suffix (-l would count extents instead)
  if ! lvcreate -s -L "${snapsize}M" -n "${snaplv}" "${vg}/${lv}"; then
    # without a snapshot nothing was checked: do not report an fsck failure
    log "err" "Could not create snapshot ${snaplv} of /dev/${vg}/${lv}; skipping check"
    rm -f "$tmpfile"
    return 1
  fi

  if perform_check "/dev/${vg}/${snaplv}" "${fstype}" "${tmpfile}"; then
    log "info" "Background scrubbing of /dev/${vg}/${lv} succeeded."
    try_delay_checks "/dev/${vg}/${lv}" "$fstype"
  else
    log "err" "Background scrubbing of /dev/${vg}/${lv} failed: run fsck offline soon!"
    try_force_check "/dev/${vg}/${lv}" "$fstype"

    if [ -n "$EMAIL" ]; then
      # $EMAIL stays unquoted so that it may list several recipients
      mail -s "Fsck of /dev/${vg}/${lv} failed!" $EMAIL < "$tmpfile"
    fi

    # save the log file in /var/log in case mail is disabled
    {
      echo ""
      printf ' Check on %s\n' "$(date +'%Y-%m-%d')"
      echo "======================="
      cat "$tmpfile"
    } >>"$errlog"
  fi

  rm -f "$tmpfile"
  lvremove -f "${vg}/${snaplv}"
}

# Print one lvs report field for a device with all whitespace removed
# (lvs pads its columns). Sizes are reported in units of 1024*1024 bytes.
#   $1 - lvs field name (e.g. vg_name, lv_size)
#   $2 - device path
lvs_field() {
  lvs --noheadings --units m --nosuffix -o "$1" "$2" | tr -d '[:space:]'
}

# pull in configuration -- overwrite the defaults above if the file exists
[ -r /etc/lvcheck.conf ] && . /etc/lvcheck.conf

# check whether the machine is on AC power: if not, skip fsck
on_ac_power || exit 0

# parse up lvscan output: the device name is the text between the single
# quotes. The status words before it must say ACTIVE; snapshot LVs are
# skipped.
lvscan 2>&1 \
  | awk -F"'" '$1 ~ /ACTIVE/ && $1 !~ /Snapshot/ { print $2 }' \
  | while read -r DEV; do
  [ -n "$DEV" ] || continue

  # get the FS type: blkid prints TYPE="blah". Extract it without eval so
  # that a device with no detectable filesystem cannot inherit the type
  # of the previous one.
  TYPE="$(blkid -s TYPE "$DEV" | sed -n -e 's/.*TYPE="\([^"]*\)".*/\1/p')"
  if [ -z "$TYPE" ]; then
    log "info" "No filesystem type detected on ${DEV}; skipping"
    continue
  fi

  # get the last-check time
  check_date="$(try_get_check_date "$DEV" "$TYPE")"

  # if the date is unknown, run fsck every time the script runs. sigh.
  if [ -n "$check_date" ] && [ "$check_date" != "Unknown" ]; then
    # add $INTERVAL days, and throw away the time portion
    check_day="$(date --date="$check_date $INTERVAL days" +'%Y%m%d' 2>/dev/null)"

    # get today's date, and skip the check if it's not within the interval.
    # a date that could not be parsed leaves check_day empty: check anyway.
    today="$(date +'%Y%m%d')"
    if [ -n "$check_day" ] && [ "$check_day" -gt "$today" ]; then
      continue
    fi
  fi

  # get the volume group and logical volume names
  VG="$(lvs_field vg_name "$DEV")"
  LV="$(lvs_field lv_name "$DEV")"

  # get the free space and LV size (in megs), dropping the fractional part
  # that lvs prints so the values are usable in integer arithmetic
  SPACE="$(lvs_field vg_free "$DEV")"
  SIZE="$(lvs_field lv_size "$DEV")"
  SPACE="${SPACE%%[.,]*}"
  SIZE="${SIZE%%[.,]*}"

  if [ -z "$VG" ] || [ -z "$LV" ]; then
    log "warning" "Could not determine volume group or LV name for ${DEV}; skipping"
    continue
  fi
  case "$SPACE" in
    ''|*[!0-9]*) SPACE= ;;
  esac
  case "$SIZE" in
    ''|*[!0-9]*) SIZE= ;;
  esac
  if [ -z "$SPACE" ] || [ -z "$SIZE" ]; then
    log "warning" "Could not determine size or free space for ${DEV}; skipping"
    continue
  fi

  # guess at the snapshot size, and see how much the admin will let us use
  # (keeping MINFREE available)
  SNAPSIZE=$((SIZE / 500))
  AVAIL=$((SPACE - MINFREE))

  # if we don't even have MINSNAP space available, skip the LV
  if [ "$MINSNAP" -gt "$AVAIL" ] || [ "$AVAIL" -le 0 ]; then
    log "warning" "Not enough free space on volume group for ${DEV}; skipping"
    continue
  fi

  # make snapshot large enough to handle e.g. journal and other updates
  [ "$SNAPSIZE" -lt "$MINSNAP" ] && SNAPSIZE="$MINSNAP"

  # limit snapshot to available space (VG space minus min-free)
  [ "$SNAPSIZE" -gt "$AVAIL" ] && SNAPSIZE="$AVAIL"

  # don't need to check SNAPSIZE again: MINSNAP <= AVAIL, MINSNAP <= SNAPSIZE,
  # and SNAPSIZE <= AVAIL, combined, means SNAPSIZE must be between MINSNAP
  # and AVAIL, which is what we need -- assuming AVAIL > 0

  # check it; stdin is detached so that nothing run by check_fs can eat the
  # device list this loop is reading
  check_fs "$VG" "$LV" "$TYPE" "$SNAPSIZE" </dev/null
done