#!/bin/bash

# lvcheck, version 1.1
# Maintainer: Andreas Dilger
# Maintainer: Bryan Kadzban

# Other credits:
#  Concept and original script by Theodore Tso
#  on_ac_power is mostly from Debian's powermgmt-base package
#  Better XFS support from Eric Sandeen

# Released under the GNU General Public License, either version 2 or
# (at your option) any later version.

# Overview:
#  Run this from cron periodically (e.g. once per week).  If the machine
#  is on AC power, it will run the checks; otherwise they will all be
#  skipped.  (If the script can't tell whether the machine is on AC power,
#  it will use a setting in the configuration file (/etc/lvcheck.conf) to
#  decide whether to continue with the checks, or abort.)
#
#  The script will then decide which logical volumes are active, and can
#  therefore be checked via an LVM snapshot.  Each of these LVs will be
#  queried to find its last-check day, and if that was more than $INTERVAL
#  days ago (where INTERVAL is set in the configuration file as well), or
#  if the last-check day can't be determined, then the script will take an
#  LVM snapshot of that LV and run fsck on the snapshot.  The snapshot will
#  be set to use 1/500 the space of the source LV.  After fsck finishes,
#  the snapshot is destroyed.  (Snapshots are checked serially.)
#
#  Any LV that passes fsck should have its last-check time updated (in
#  the real superblock, not the snapshot's superblock); any LV whose
#  fsck fails will send an email notification to a configurable user
#  ($EMAIL).  This $EMAIL setting is optional, but its use is highly
#  recommended, since if any LV fails, it will need to be checked
#  manually, offline.  Relevant messages are also sent to syslog.

# Set default values for configuration params.  Changes to these values will
# be overwritten on an upgrade!  To change these values, edit /etc/lvcheck.conf.
# Built-in defaults; each can be overridden from the configuration file.
#   EMAIL       recipient(s) of the failure report (empty disables mail)
#   INTERVAL    number of days added to the last-check date before an LV
#               is checked again
#   AC_UNKNOWN  "CONTINUE" or "ABORT": what to do when the power source
#               cannot be determined
#   MINSNAP     smallest snapshot size to create, in megabytes
#   MINFREE     megabytes to always leave unallocated in the volume group
EMAIL='root'
INTERVAL=30
AC_UNKNOWN="CONTINUE"
MINSNAP=256
MINFREE=0

# "-n" selects dry-run mode: every fsck command is prefixed with "echo"
# instead of being run, and nothing is sent to syslog or mail.
[ "$1" == "-n" ] && NOCHECK="echo"

# pull in configuration -- overwrite the defaults above if the file exists
[ -r /etc/lvcheck.conf ] && . /etc/lvcheck.conf
# also look for a config file relative to the install prefix of this script
CHECKPATH=$(dirname "$0" | sed -e 's:/s*bin::')
[ -r "$CHECKPATH/etc/lvcheck.conf" ] && . "$CHECKPATH/etc/lvcheck.conf"

# send $2 to syslog, with severity $1
# severities are emerg/alert/crit/err/warning/notice/info/debug
# Nothing is logged in dry-run mode (NOCHECK set).
function log() {
    local sev="$1"
    local msg="$2"
    local -a opts=(-t lvcheck)

    case "$sev" in
    emerg|alert|crit|err|warning)
        # log warning-or-higher messages to stderr as well
        opts+=(-s)
        ;;
    notice|info|debug)
        ;;
    *)
        # diagnostics belong on stderr, never in captured stdout
        echo "error: unknown log severity '$sev'" >&2
        ;;
    esac

    [ "$NOCHECK" ] || logger "${opts[@]}" -p user."$sev" -- "$msg"
}

# return 0 if $1 is a non-empty string of decimal digits, 1 otherwise
function is_uint() {
    case "$1" in
    ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# determine whether the machine is on AC power
# Returns 0 when on AC (or assumed to be), 1 when on battery (or assumed
# to be).  Exits the script if AC_UNKNOWN holds an invalid value.
function on_ac_power() {
    local any_known=no
    local psu psu_type online ac state_file

    # try sysfs power class first
    if [ -d /sys/class/power_supply ]; then
        for psu in /sys/class/power_supply/*; do
            [ -r "$psu/type" ] || continue
            psu_type=$(cat "$psu/type")
            # ignore batteries
            [ "$psu_type" = "Battery" ] && continue
            # supplies without an "online" attribute tell us nothing
            [ -r "$psu/online" ] || continue
            online=$(cat "$psu/online")
            [ "$online" = 1 ] && return 0
            [ "$online" = 0 ] && any_known=yes
        done
        [ "$any_known" = "yes" ] && return 1
    fi

    # else fall back to AC adapters in /proc
    if [ -d /proc/acpi/ac_adapter ]; then
        for ac in /proc/acpi/ac_adapter/*; do
            if [ -r "$ac/state" ]; then
                state_file="$ac/state"
            elif [ -r "$ac/status" ]; then
                state_file="$ac/status"
            else
                continue
            fi
            grep -q on-line "$state_file" && return 0
            grep -q off-line "$state_file" && any_known=yes
        done
        [ "$any_known" = "yes" ] && return 1
    fi

    # nothing could tell us: let the configuration decide
    case "$AC_UNKNOWN" in
    CONTINUE)
        return 0    # assume on AC power
        ;;
    ABORT)
        return 1    # assume on battery
        ;;
    *)
        log err "Invalid value for AC_UNKNOWN in the config file"
        exit 1
        ;;
    esac
}

# attempt to force a check of $1 (filesystem type $2) on the next reboot
function try_force_check() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
    ext2|ext3|ext4)
        # a very high mount count makes the boot-time fsck kick in
        tune2fs -C 16000 "$dev"
        ;;
    xfs)
        # XFS does not enforce check intervals; let email suffice.
        ;;
    *)
        log warning "$dev: don't know how to force a check on $fstype."
        ;;
    esac
}

# attempt to set the last-check time on $1 to now, and the mount count to 0.
function try_delay_checks() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
    ext2|ext3|ext4)
        tune2fs -C 0 -T now "$dev"
        ;;
    xfs)
        # XFS does not enforce check intervals; nothing to delay
        ;;
    *)
        log info "$dev: don't know how to delay check on $fstype."
        ;;
    esac
}

# print the date that $1 was last checked, in a format that date(1) will
# accept, or "Unknown" if we don't know how to find that date.
# Prints nothing if dumpe2fs yields no "Last checked:" line.
function try_get_check_date() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
    ext2|ext3|ext4)
        dumpe2fs -h "$dev" 2>/dev/null |
            sed -n -e 's/Last checked:[[:space:]]*//p'
        ;;
    *)
        # XFS does not save the last-checked date
        # TODO: add support for various other FSes
        echo "Unknown"
        ;;
    esac
}

# do any extra checks for filesystem type $2, on device $1
# Returns 0 if the filesystem should be checked, 1 if it must be skipped.
function should_still_check() {
    local dev="$1"
    local fstype="$2"

    case "$fstype" in
    ext2|ext3|ext4)
        if tune2fs -l "$dev" | grep -q "Journal device"; then
            log warning "skip $dev: using external journal."
            return 1
        fi
        ;;
    reiserfs|xfs|jfs)
        # perform_check knows how to check these; nothing extra to verify
        ;;
    jbd*)
        log debug "skip $dev: using external journal."
        return 1
        ;;
    swap)
        log debug "skip $dev: is a swap device."
        return 1
        ;;
    *)
        # really skip: perform_check has no checker for this type, so
        # snapshotting it would only waste space and report a fake success
        log warning "skip $dev: can't check $fstype passively: assuming OK."
        return 1
        ;;
    esac

    return 0
}

# check the FS on $1 (type $2) passively, saving output to $3.
# Returns the checker's exit status; non-zero for an unsupported type.
function perform_check() {
    local dev="$1"
    local fstype="$2"
    local errlog="$3"

    case "$fstype" in
    ext2|ext3|ext4)
        # first clear the orphaned-inode list, to avoid unnecessary FS
        # changes in the next step (which would cause an "error" exit
        # from e2fsck).  -C 0 is present for cases where the script is
        # run interactively (logsave -s strips out the progress bar).
        # ignore the return status of this e2fsck, as it doesn't matter.
        $NOCHECK nice logsave -as "$errlog" e2fsck -p -C 0 "$dev"

        # then do the real check; -y is here to give more info on any
        # errors that may be present on the FS, in the log file.  the
        # snapshot is writable, so it shouldn't break anything if
        # e2fsck changes it.
        $NOCHECK nice logsave -as "$errlog" e2fsck -fy -C 0 "$dev"
        return $?
        ;;
    reiserfs)
        echo Yes | $NOCHECK nice logsave -as "$errlog" fsck.reiserfs --check "$dev"
        # apparently can't fail?  let's hope not...
        return 0
        ;;
    xfs)
        $NOCHECK nice logsave -as "$errlog" xfs_repair -n "$dev"
        return $?
        ;;
    jfs)
        $NOCHECK nice logsave -as "$errlog" fsck.jfs -fn "$dev"
        return $?
        ;;
    *)
        # never report success for a filesystem nobody looked at
        log warning "$dev: no passive checker known for $fstype."
        return 1
        ;;
    esac
}

# decide whether a filesystem last checked on date $1 is due for a check.
# Returns 0 (due) when $1 is "Unknown", empty, or unparsable, or when
# $1 + $INTERVAL days is not after today; returns 1 (not due) otherwise.
function check_is_due() {
    local check_date="$1"
    local check_day today

    # if the date is unknown, run fsck every time the script runs.  sigh.
    case "$check_date" in
    ''|Unknown) return 0 ;;
    esac

    # add $INTERVAL days, and throw away the time portion
    check_day=$(date --date="$check_date $INTERVAL days" +'%Y%m%d') || return 0
    today=$(date +'%Y%m%d')

    if [ "$check_day" -gt "$today" ]; then
        return 1
    fi
    return 0
}

# print the snapshot size (in megabytes) to use for an LV of $1 megabytes
# when $2 megabytes may be taken from the volume group.  The size is 1/500
# of the LV, raised to $MINSNAP and capped at $2.
# Returns 1 (printing nothing) if not even $MINSNAP megabytes are available.
function pick_snapsize() {
    local size="$1"
    local avail="$2"
    local snap

    if [ "$MINSNAP" -gt "$avail" ] || [ "$avail" -le 0 ]; then
        return 1
    fi

    snap=$(( size / 500 ))
    # make snapshot large enough to handle e.g. journal and other updates
    if [ "$snap" -lt "$MINSNAP" ]; then
        snap="$MINSNAP"
    fi
    # limit snapshot to available space (VG space minus min-free)
    if [ "$snap" -gt "$avail" ]; then
        snap="$avail"
    fi
    echo "$snap"
}

# do everything needed to check and reset dates and counters on /dev/$1/$2.
#   $1 volume group, $2 logical volume, $3 filesystem type,
#   $4 snapshot size in megabytes
# Returns 0 if the LV was skipped or passed its check, 1 otherwise.
# Note: in dry-run mode the snapshot is still created and removed; only the
# fsck commands themselves are echoed.
function check_fs() {
    local vg="$1"
    local lv="$2"
    local fstype="$3"
    local snapsize="$4" # in units of MB
    local lvdev="/dev/$vg/$lv"
    local today
    today=$(date +'%Y%m%d')
    local errlog="/var/log/lvcheck/$vg-$lv-$today"
    local snaplvbase="$lv-lvcheck-temp"
    local snaplv="$snaplvbase-$today"
    local lvtemp check_date
    local rc=0

    # clean up any left-over snapshot LVs
    for lvtemp in "/dev/$vg/$snaplvbase"*; do
        # an unmatched glob stays literal; skip it
        [ -e "$lvtemp" ] || continue
        # Assume script won't run more than one at a time?
        log warning "stale $lvtemp: trying to remove old snapshot."
        if ! lvremove -f "$lvtemp"; then
            log err "error $lvtemp: could not delete."
            return 1
        fi
    done

    # see whether FS needs any extra checks that might disqualify it
    should_still_check "$lvdev" "$fstype" || return 0

    # get the last check time, and skip the LV if it is within the interval
    check_date=$(try_get_check_date "$lvdev" "$fstype")
    if ! check_is_due "$check_date"; then
        log debug "skip $lvdev: just checked on $check_date."
        return 0
    fi

    # logsave cannot write its log if the directory is missing
    if ! mkdir -p "${errlog%/*}"; then
        log warning "$lvdev: could not create log directory ${errlog%/*}."
    fi

    # create new snapshot LV; without it there is nothing to check, and a
    # failure here must not be reported as filesystem corruption
    if ! lvcreate -s -L "${snapsize}M" -n "$snaplv" "$vg/$lv"; then
        log err "error $lvdev: could not create snapshot $snaplv."
        return 1
    fi

    if perform_check "/dev/$vg/$snaplv" "$fstype" "$errlog"; then
        log info "$lvdev: Background check succeeded."
        if [ -z "$NOCHECK" ]; then
            try_delay_checks "$lvdev" "$fstype"
        fi
    else
        rc=1
        log err "error $lvdev: Background check failed! Run offline!"
        if [ -z "$NOCHECK" ]; then
            try_force_check "$lvdev" "$fstype"
            if [ -n "$EMAIL" ]; then
                # $EMAIL is left unquoted, as before, so that a
                # space-separated list of recipients keeps working
                if [ -r "$errlog" ]; then
                    # shellcheck disable=SC2086
                    mail -s "Fsck $lvdev failed" $EMAIL < "$errlog"
                else
                    # shellcheck disable=SC2086
                    echo "No fsck log was written to $errlog." |
                        mail -s "Fsck $lvdev failed" $EMAIL
                fi
            fi
        fi
    fi

    if ! lvremove -f "$vg/$snaplv"; then
        log err "error /dev/$vg/$snaplv: could not delete."
        return 1
    fi

    return "$rc"
}

# walk all active logical volumes and check each one that is due.
function main() {
    local DEV FSTYPE VG LV SPACE SIZE AVAIL SNAPSIZE

    # check whether the machine is on AC power: if not, skip fsck
    on_ac_power || exit 0

    # parse up lvscan output: second column of every ACTIVE line
    lvscan 2>&1 | awk '/ACTIVE/ {print $2;}' | while read -r DEV; do
        # remove the single quotes around the device name
        DEV=${DEV//\'/}

        if [ ! -b "$DEV" ]; then
            if [ ! -e "$DEV" ]; then
                log info "skip $DEV: no longer exists."
            else
                log info "skip $DEV: not a block device."
            fi
            continue
        fi

        # get the FS type
        FSTYPE=$(blkid -s TYPE -o value "$DEV")
        if [ -z "$FSTYPE" ]; then
            log info "skip $DEV: can't determine device type."
            continue
        fi

        # get the volume group and logical volume names, the free space in
        # the VG and the LV size.  Lower-case "m" asks for units of 1024*1024
        # bytes, which is what lvcreate -L means by "M"; LC_ALL=C keeps the
        # decimal separator a "."
        VG='' LV='' SPACE='' SIZE=''
        read -r VG LV SPACE SIZE < <(LC_ALL=C lvs --noheadings --nosuffix \
            --units m -o vg_name,lv_name,vg_free,lv_size "$DEV")
        # drop the fractional part
        SPACE=${SPACE%%.*}
        SIZE=${SIZE%%.*}
        if [ -z "$VG" ] || [ -z "$LV" ] ||
            ! is_uint "$SPACE" || ! is_uint "$SIZE"; then
            log warning "skip $DEV: can't determine volume group, name or size."
            continue
        fi

        # see how much the admin will let us use (keeping MINFREE available)
        AVAIL=$(( SPACE - MINFREE ))

        # if we don't even have MINSNAP space available, skip the LV
        if ! SNAPSIZE=$(pick_snapsize "$SIZE" "$AVAIL"); then
            log warning "skip $DEV: need ${MINSNAP}M free space in volume group."
            continue
        fi

        check_fs "$VG" "$LV" "$FSTYPE" "$SNAPSIZE"
    done
}

main "$@"