[dm-devel] [PATCH] dm-writecache: improve performance of large linear writes on SSDs

Mikulas Patocka mpatocka at redhat.com
Wed Jan 15 09:35:22 UTC 2020


When dm-writecache is used with SSD as a cache device, it would submit
separate bio for each written block. The I/Os would be merged by the disk
scheduler, but this merging degrades performance.

This patch makes dm-writecache submit larger bios - we can submit large
bio as long as there is consecutive free space on the cache device.

Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):

fio --bs=512k --iodepth=32 --size=400M --direct=1 --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test

block	old	new
size	MiB/s	MiB/s
---------------------
512	181	700
1k	347	1256
2k	644	2020
4k	1183	2759
8k	1852	3333
16k	2469	3509
32k	2974	3670
64k	3404	3810

Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>

---
 drivers/md/dm-writecache.c |   28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c	2020-01-14 16:11:09.000000000 +0100
+++ linux-2.6/drivers/md/dm-writecache.c	2020-01-14 21:42:44.000000000 +0100
@@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
 	wc->freelist_size++;
 }
 
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
 	struct wc_entry *e;
 
@@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
 		if (unlikely(!wc->current_free))
 			return NULL;
 		e = wc->current_free;
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		next = rb_next(&e->rb_node);
 		rb_erase(&e->rb_node, &wc->freetree);
 		if (unlikely(!next))
@@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
 		if (unlikely(list_empty(&wc->freelist)))
 			return NULL;
 		e = container_of(wc->freelist.next, struct wc_entry, lru);
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		list_del(&e->lru);
 	}
 	wc->freelist_size--;
@@ -1194,7 +1198,7 @@ read_next_block:
 					goto bio_copy;
 				}
 			}
-			e = writecache_pop_from_freelist(wc);
+			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
 				writecache_wait_on_freelist(wc);
 				continue;
@@ -1206,9 +1210,25 @@ bio_copy:
 			if (WC_MODE_PMEM(wc)) {
 				bio_copy_block(wc, bio, memory_data(wc, e));
 			} else {
-				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				unsigned bio_size = wc->block_size;
+				sector_t start_cache_sec = cache_sector(wc, e);
+				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+				while (bio_size < bio->bi_iter.bi_size) {
+					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+					if (!f)
+						break;
+					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
+					writecache_insert_entry(wc, f);
+					wc->uncommitted_blocks++;
+					bio_size += wc->block_size;
+					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+				}
+
 				bio_set_dev(bio, wc->ssd_dev->bdev);
-				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				bio->bi_iter.bi_sector = start_cache_sec;
+				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
 					wc->uncommitted_blocks = 0;
 					queue_work(wc->writeback_wq, &wc->flush_work);




More information about the dm-devel mailing list