[Cluster-devel] [GFS2 PATCH v6 11/26] gfs2: log error reform

Tue Aug 20 14:09:36 UTC 2019

Bob,

On Thu, May 23, 2019 at 3:05 PM Bob Peterson <rpeterso at redhat.com> wrote:
> Before this patch, gfs2 kept track of journal io errors in two
> places sd_log_error and the SDF_AIL1_IO_ERROR flag in sd_flags.
> This patch consolidates the two into sd_log_error so that it
> reflects the first error encountered writing to the journal.
> In future patches, we will take advantage of this by checking
> this value rather than having to check both when reacting to
> io errors.
>
> In addition, this fixes a tight loop in unmount: If buffers
> get on the ail1 list and an io error occurs elsewhere, the
> ail1 list would never be cleared because they were always busy.
> So unmount would hang, waiting for the ail1 list to empty.
>
> Signed-off-by: Bob Peterson <rpeterso at redhat.com>
> ---
>  fs/gfs2/incore.h |  7 +++----
>  fs/gfs2/log.c    | 20 +++++++++++++++-----
>  fs/gfs2/quota.c  |  2 +-
>  3 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index b261168be298..39cec5361ba5 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -620,9 +620,8 @@ enum {
>         SDF_RORECOVERY          = 7, /* read only recovery */
>         SDF_SKIP_DLM_UNLOCK     = 8,
>         SDF_FORCE_AIL_FLUSH     = 9,
> -       SDF_AIL1_IO_ERROR       = 10,
> -       SDF_FS_FROZEN           = 11,
> -       SDF_WITHDRAWING         = 12, /* Will withdraw eventually */
> +       SDF_FS_FROZEN           = 10,
> +       SDF_WITHDRAWING         = 11, /* Will withdraw eventually */
>  };
>
>  enum gfs2_freeze_state {
> @@ -831,7 +830,7 @@ struct gfs2_sbd {
>         atomic_t sd_log_in_flight;
>         struct bio *sd_log_bio;
>         wait_queue_head_t sd_log_flush_wait;
> -       int sd_log_error;
> +       int sd_log_error; /* First log error */
>
>         atomic_t sd_reserving_log;
>         wait_queue_head_t sd_reserving_log_wait;
> diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
> index 0fe11bde796b..9784763fbb4e 100644
> --- a/fs/gfs2/log.c
> +++ b/fs/gfs2/log.c
> @@ -108,8 +108,7 @@ __acquires(&sdp->sd_ail_lock)
>
>                 if (!buffer_busy(bh)) {
>                         if (!buffer_uptodate(bh) &&
> -                           !test_and_set_bit(SDF_AIL1_IO_ERROR,
> -                                             &sdp->sd_flags)) {
> +                           !cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
>                                 gfs2_io_error_bh(sdp, bh);
>                                 set_bit(SDF_WITHDRAWING, &sdp->sd_flags);
>                         }
> @@ -203,10 +202,21 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
>                                          bd_ail_st_list) {
>                 bh = bd->bd_bh;
>                 gfs2_assert(sdp, bd->bd_tr == tr);
> -               if (buffer_busy(bh))
> +               /**
> +                * If another process flagged an io error, e.g. writing to the
> +                * journal, error all other bhs and move them off the ail1 to
> +                * prevent a tight loop when unmount tries to flush ail1,
> +                * regardless of whether they're still busy. If no outside
> +                * errors were found and the buffer is busy, move to the next.
> +                * If the ail buffer is not busy and caught an error, flag it
> +                * for others.
> +                */
> +               if (sdp->sd_log_error) {
> +                       gfs2_io_error_bh(sdp, bh);

some of the error handling here is still sketchy: the only place where
sd_log_error is set without withdrawing the filesystem is
quotad_error. If the filesystem has already been marked
SDF_WITHDRAWING or SDF_WITHDRAWN, gfs2_io_error_bh will be a no-op. It
seems that we want to set SDF_WITHDRAWING here for the quotad_error
case instead of calling gfs2_io_error_bh?

> +               } else if (buffer_busy(bh)) {
>                         continue;
> -               if (!buffer_uptodate(bh) &&
> -                   !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
> +               } else if (!buffer_uptodate(bh) &&
> +                          !cmpxchg(&sdp->sd_log_error, 0, -EIO)) {
>                         gfs2_io_error_bh(sdp, bh);
>                         set_bit(SDF_WITHDRAWING, &sdp->sd_flags);
>                 }
> diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> index a8dfc86fd682..8871fca9102f 100644
> --- a/fs/gfs2/quota.c
> +++ b/fs/gfs2/quota.c
> @@ -1480,7 +1480,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
>                 return;
>         if (!gfs2_withdrawn(sdp)) {
>                 fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
> -               sdp->sd_log_error = error;
> +               cmpxchg(&sdp->sd_log_error, 0, error);
>                 wake_up(&sdp->sd_logd_waitq);
>         }
>  }
> --
> 2.21.0
>

Thanks,
Andreas