[dm-devel] [PATCH] dm-writecache: fix incorrect flush sequence when doing commit
Maged Mokhtar
mmokhtar at petasan.org
Mon Jan 13 19:47:22 UTC 2020
On 08/01/2020 17:46, Mikulas Patocka wrote:
> When commiting state, the function writecache_flush does the following:
> 1. write metadata (writecache_commit_flushed)
> 2. flush disk cache (writecache_commit_flushed)
> 3. wait for data writes to complete (writecache_wait_for_ios)
> 4. increase superblock seq_count
> 5. write the superblock
> 6. flush disk cache
>
> It may happen that at step 3, when we wait for some write to finish, the
> disk may report the write as finished, but the write only hit the disk
> cache and it is not yet stored in persistent storage. At step 5 we write
> the superblock - it may happen that the superblock is written before the
> write that we waited for in step 3. If the machine crashes, it may result
> in incorect data being returned after reboot.
>
> In order to fix the bug, we must swap steps 2 and 3 in the above sequence,
> so that we first wait for writes to complete and then flush the disk
> cache.
>
> Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>
> Cc: stable at vger.kernel.org # 4.18+
> Fixes: 48debafe4f2f ("dm: add writecache target")
>
> ---
> drivers/md/dm-writecache.c | 42 +++++++++++++++++++++---------------------
> 1 file changed, 21 insertions(+), 21 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-08 15:14:51.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c 2020-01-08 16:15:43.000000000 +0100
> @@ -442,7 +442,13 @@ static void writecache_notify_io(unsigne
> complete(&endio->c);
> }
>
> -static void ssd_commit_flushed(struct dm_writecache *wc)
> +static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
> +{
> + wait_event(wc->bio_in_progress_wait[direction],
> + !atomic_read(&wc->bio_in_progress[direction]));
> +}
> +
> +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> {
> struct dm_io_region region;
> struct dm_io_request req;
> @@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm
> writecache_notify_io(0, &endio);
> wait_for_completion_io(&endio.c);
>
> + if (wait_for_ios)
> + writecache_wait_for_ios(wc, WRITE);
> +
> writecache_disk_flush(wc, wc->ssd_dev);
>
> memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
> }
>
> -static void writecache_commit_flushed(struct dm_writecache *wc)
> +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> {
> if (WC_MODE_PMEM(wc))
> wmb();
> else
> - ssd_commit_flushed(wc);
> + ssd_commit_flushed(wc, wait_for_ios);
> }
>
> static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
> @@ -522,12 +531,6 @@ static void writecache_disk_flush(struct
> writecache_error(wc, r, "error flushing metadata: %d", r);
> }
>
> -static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
> -{
> - wait_event(wc->bio_in_progress_wait[direction],
> - !atomic_read(&wc->bio_in_progress[direction]));
> -}
> -
> #define WFE_RETURN_FOLLOWING 1
> #define WFE_LOWEST_SEQ 2
>
> @@ -724,15 +727,12 @@ static void writecache_flush(struct dm_w
> e = e2;
> cond_resched();
> }
> - writecache_commit_flushed(wc);
> -
> - if (!WC_MODE_PMEM(wc))
> - writecache_wait_for_ios(wc, WRITE);
> + writecache_commit_flushed(wc, true);
>
> wc->seq_count++;
> pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
> writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
>
> wc->overwrote_committed = false;
>
> @@ -756,7 +756,7 @@ static void writecache_flush(struct dm_w
> }
>
> if (need_flush_after_free)
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
> }
>
> static void writecache_flush_work(struct work_struct *work)
> @@ -809,7 +809,7 @@ static void writecache_discard(struct dm
> }
>
> if (discarded_something)
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
> }
>
> static bool writecache_wait_for_writeback(struct dm_writecache *wc)
> @@ -958,7 +958,7 @@ erase_this:
>
> if (need_flush) {
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
> }
>
> wc_unlock(wc);
> @@ -1342,7 +1342,7 @@ static void __writecache_endio_pmem(stru
> wc->writeback_size--;
> n_walked++;
> if (unlikely(n_walked >= ENDIO_LATENCY)) {
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
> wc_unlock(wc);
> wc_lock(wc);
> n_walked = 0;
> @@ -1423,7 +1423,7 @@ pop_from_list:
> writecache_wait_for_ios(wc, READ);
> }
>
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
>
> wc_unlock(wc);
> }
> @@ -1766,10 +1766,10 @@ static int init_memory(struct dm_writeca
> write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
>
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
> pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
> writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
> - writecache_commit_flushed(wc);
> + writecache_commit_flushed(wc, false);
>
> return 0;
> }
>
Hi Mikulas,
Swapping of the steps does indeed make sense.
/Maged
More information about the dm-devel
mailing list