[dm-devel] [PATCH] dm-writecache: improve performance of large linear writes on SSDs

Maged Mokhtar mmokhtar at petasan.org
Thu Jan 16 22:47:08 UTC 2020



> From: Mikulas Patocka <mpatocka redhat com>
> To: Mike Snitzer <msnitzer redhat com>, Nikhil Kshirsagar <nkshirsa redhat com>
> Cc: dm-devel redhat com
> Subject: [dm-devel] [PATCH] dm-writecache: improve performance of large linear writes on SSDs
> Date: Wed, 15 Jan 2020 04:35:22 -0500 (EST)
> 
> When dm-writecache is used with an SSD as a cache device, it would submit
> a separate bio for each written block. The I/Os would be merged by the
> disk scheduler, but this merging degrades performance.
> 
> This patch makes dm-writecache submit larger bios - we can submit a large
> bio as long as there is consecutive free space on the cache device.
> 
> Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):
> 
> fio --bs=512k --iodepth=32 --size=400M --direct=1 --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test
> 
> block   old     new
> size    MiB/s   MiB/s
> ----------------------
> 512     181     700
> 1k      347     1256
> 2k      644     2020
> 4k      1183    2759
> 8k      1852    3333
> 16k     2469    3509
> 32k     2974    3670
> 64k     3404    3810
> 
> Signed-off-by: Mikulas Patocka <mpatocka redhat com>
> 
> ---
>   drivers/md/dm-writecache.c |   28 ++++++++++++++++++++++++----
>   1 file changed, 24 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c    2020-01-14 16:11:09.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c    2020-01-14 21:42:44.000000000 +0100
> @@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
>       wc->freelist_size++;
>   }
> 
> -static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
> +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
>   {
>       struct wc_entry *e;
> 
> @@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
>           if (unlikely(!wc->current_free))
>               return NULL;
>           e = wc->current_free;
> +        if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
> +            return NULL;
>           next = rb_next(&e->rb_node);
>           rb_erase(&e->rb_node, &wc->freetree);
>           if (unlikely(!next))
> @@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
>           if (unlikely(list_empty(&wc->freelist)))
>               return NULL;
>           e = container_of(wc->freelist.next, struct wc_entry, lru);
> +        if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
> +            return NULL;
>           list_del(&e->lru);
>       }
>       wc->freelist_size--;
> @@ -1194,7 +1198,7 @@ read_next_block:
>                       goto bio_copy;
>                   }
>               }
> -            e = writecache_pop_from_freelist(wc);
> +            e = writecache_pop_from_freelist(wc, (sector_t)-1);
>               if (unlikely(!e)) {
>                   writecache_wait_on_freelist(wc);
>                   continue;
> @@ -1206,9 +1210,25 @@ bio_copy:
>               if (WC_MODE_PMEM(wc)) {
>                   bio_copy_block(wc, bio, memory_data(wc, e));
>               } else {
> -                dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
> +                unsigned bio_size = wc->block_size;
> +                sector_t start_cache_sec = cache_sector(wc, e);
> +                sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
> +
> +                while (bio_size < bio->bi_iter.bi_size) {
> +                    struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
> +                    if (!f)
> +                        break;
> +                    write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
> +                    writecache_insert_entry(wc, f);
> +                    wc->uncommitted_blocks++;
> +                    bio_size += wc->block_size;
> +                    current_cache_sec += wc->block_size >> SECTOR_SHIFT;
> +                }
> +
>                   bio_set_dev(bio, wc->ssd_dev->bdev);
> -                bio->bi_iter.bi_sector = cache_sector(wc, e);
> +                bio->bi_iter.bi_sector = start_cache_sec;
> +                dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
> +
>                   if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
>                       wc->uncommitted_blocks = 0;
>                       queue_work(wc->writeback_wq, &wc->flush_work);


The speed gain looks quite good.
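
For anyone wanting to reproduce the numbers: the fio run targets
/dev/mapper/cache, which I assume was created along these lines (SSD mode
with /dev/ram0 as the cache device; the origin device name and the 4096
block size below are my own placeholders, not taken from the mail):

dmsetup create cache --table "0 $(blockdev --getsz /dev/sdX) writecache s /dev/sdX /dev/ram0 4096 0"

Since fio's --bs is fixed at 512k, I read the block-size column in the
table above as the writecache block size (the fourth table argument),
varied from 512 up to 64k.
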
One concern is whether, over time, the free list becomes fragmented,
meaning it may become harder for the current free entry to be followed by
consecutive free blocks.
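
To make the concern concrete, here is a small stand-alone sketch (my own
toy code and numbers, not the driver's) that walks the head of a
hypothetical freelist and counts how many blocks pass the same
"next free block is physically adjacent" test that expected_sector
implements, assuming 4k blocks (8 sectors each):

/* toy model of the contiguity check in writecache_pop_from_freelist(),
 * not kernel code */
#include <stdio.h>

#define BLOCK_SECTORS 8	/* 4k block = 8 x 512-byte sectors */

static int merged_blocks(const long *freelist, int n)
{
	int merged = 1;
	long expected = freelist[0] + BLOCK_SECTORS;

	for (int i = 1; i < n; i++) {
		if (freelist[i] != expected)	/* expected_sector mismatch */
			break;
		expected += BLOCK_SECTORS;
		merged++;
	}
	return merged;
}

int main(void)
{
	long fresh[]      = { 0, 8, 16, 24, 32, 40, 48, 56 };	/* freshly created cache */
	long fragmented[] = { 0, 128, 8, 16, 24, 32, 40, 48 };	/* after some reuse */

	printf("fresh freelist:      %d blocks in one bio\n", merged_blocks(fresh, 8));
	printf("fragmented freelist: %d blocks in one bio\n", merged_blocks(fragmented, 8));
	return 0;
}

On a fresh cache the whole 32k range above goes out as a single bio; once
the ordering is disturbed, the very first mismatch drops it back to one
4k block per bio, i.e. the pre-patch behaviour.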




