[dm-devel] [PATCH] dm-switch target

Mikulas Patocka mpatocka at redhat.com
Tue Sep 25 21:50:31 UTC 2012


Hi

This is the dm-switch target to be included in the next kernel.

It is equivalent to the last code sent by Jim Ramsay with the exception 
that REQ_FLUSH processing was removed (because hardware has no write-back 
cache).

Mikulas

---

dm-switch target

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>
Signed-off-by: Jim Ramsay <jim_ramsay at dell.com>

---
 drivers/md/Kconfig     |   11 +
 drivers/md/Makefile    |    1 
 drivers/md/dm-switch.c |  520 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 532 insertions(+)

Index: linux-3.5.4-fast/drivers/md/Kconfig
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Kconfig	2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Kconfig	2012-09-25 22:21:56.000000000 +0200
@@ -417,4 +417,15 @@ config DM_VERITY2
 
 source "drivers/md/enhanceio/Kconfig"
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  This target maps fixed-size regions of I/O onto a set of devices.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD
Index: linux-3.5.4-fast/drivers/md/Makefile
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Makefile	2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Makefile	2012-09-25 22:21:56.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_ZEROED)		+= dm-zeroed.o
 obj-$(CONFIG_DM_ENHANCEIO)	+= enhanceio/
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
Index: linux-3.5.4-fast/drivers/md/dm-switch.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.4-fast/drivers/md/dm-switch.c	2012-09-25 22:21:53.000000000 +0200
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2010-2012 by Dell Inc.  All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ *     file:    dm-switch.c
+ *     authors: Kevin_OKelley at dell.com
+ *              Jim_Ramsay at dell.com
+ *              Narendran_Ganapathy at dell.com
+ *		mpatocka at redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ *   (1) a large number of address regions
+ *   (2) a fixed size equal across all address regions
+ *   (3) no pattern that allows for a compact description with something like
+ *       the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One underlying device that I/O can be switched to: the dm_dev reference
+ * taken in the constructor and the sector offset added to remapped bios.
+ */
+struct switch_dev {
+	struct dm_dev *dmdev;	/* Device obtained with dm_get_device() */
+	sector_t start;		/* Starting offset on dmdev, in sectors */
+};
+
+typedef unsigned long pt_entry;	/* One word of packed page table fields */
+
+/* Per-target switch context; allocated together with its device array. */
+struct switch_ctx {
+	unsigned dev_count;		/* Number of devices */
+	unsigned page_size;		/* Page size in 512B sectors */
+	unsigned long n_pages;		/* Number of pages */
+	signed char page_size_bits;	/* log2 of page_size or -1 */
+
+	unsigned char pte_size;		/* Page table entry size in bits */
+	unsigned char pte_fields;	/* Number of entries per pt_entry */
+	signed char pte_fields_bits;	/* log2 of pte_fields or -1 */
+	pt_entry *page_table;		/* Page table */
+
+	/* Devices to switch between; C99 flexible array of dev_count items */
+	struct switch_dev dev_list[];
+};
+
+/* Find the page_table word (*index) and bit offset (*bit) of a page's entry */
+static inline void switch_get_position(struct switch_ctx *pctx,
+				       unsigned long page,
+				       unsigned long *index, unsigned *bit)
+{
+	unsigned field;
+
+	if (pctx->pte_fields_bits < 0) {
+		*index = page / pctx->pte_fields;
+		field = page % pctx->pte_fields;
+	} else {
+		*index = page >> pctx->pte_fields_bits;
+		field = page & (pctx->pte_fields - 1);
+	}
+	*bit = field * pctx->pte_size;
+}
+
+/* Return the dev_list index that the page table selects for a sector */
+static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
+					    sector_t sector)
+{
+	sector_t page = sector;
+	unsigned long word;
+	unsigned shift, dev;
+	pt_entry pte;
+
+	if (pctx->page_size_bits < 0)
+		sector_div(page, pctx->page_size);
+	else
+		page >>= pctx->page_size_bits;
+
+	switch_get_position(pctx, page, &word, &shift);
+	pte = ACCESS_ONCE(pctx->page_table[word]);
+	dev = (pte >> shift) & ((1 << pctx->pte_size) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(dev >= pctx->dev_count))
+		return 0;
+	return dev;
+}
+
+/* Store device number `value` in the page table entry for `page` */
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+				    unsigned value)
+{
+	unsigned long word;
+	unsigned shift;
+	pt_entry mask, pte;
+
+	switch_get_position(pctx, page, &word, &shift);
+	mask = (((pt_entry)1 << pctx->pte_size) - 1) << shift;
+	/* Read-modify-write: clear the old field, then merge the new one */
+	pte = (pctx->page_table[word] & ~mask) | ((pt_entry)value << shift);
+	pctx->page_table[word] = pte;
+}
+
+/*
+ * Constructor: Called each time a dmsetup command creates a dm device.  The
+ * target parameter will already have the table, type, begin and len fields
+ * filled in.  Arguments: <dev_count> <page_size> followed by dev_count pairs
+ * of <dev_path> <offset>.  The page table is initialised round-robin across
+ * the devices.  Returns 0 on success or a negative errno with ti->error set
+ * (dm_set_target_max_io_len() failures only propagate its return code).
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	unsigned a;
+	int n;
+	int r;
+	unsigned dev_count;
+	struct switch_ctx *pctx;
+	sector_t dev_size;
+	unsigned long e;
+
+	if (argc < 4) {
+		ti->error = "Insufficient arguments";
+		r = -EINVAL;
+		goto error;
+	}
+	if (kstrtouint(argv[0], 10, &dev_count) ||
+	    !dev_count ||
+	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+		ti->error = "Invalid device count";
+		r = -EINVAL;
+		goto error;
+	}
+	if ((argc & 1) || dev_count != (argc - 2) / 2) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto error;
+	}
+	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
+		       GFP_KERNEL);
+	if (!pctx) {
+		ti->error = "Cannot allocate redirect context";
+		r = -ENOMEM;
+		goto error;
+	}
+	pctx->dev_count = dev_count;
+	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
+	    !pctx->page_size) {
+		ti->error = "Invalid page size";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	if (!(pctx->page_size & (pctx->page_size - 1)))
+		pctx->page_size_bits = __ffs(pctx->page_size);
+	else
+		pctx->page_size_bits = -1;
+
+	pctx->pte_size = 1;
+	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
+	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
+		pctx->pte_size++;
+
+	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
+	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
+		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
+	else
+		pctx->pte_fields_bits = -1;
+
+	dev_size = ti->len;
+	if (sector_div(dev_size, pctx->page_size))
+		dev_size++;
+
+	pctx->n_pages = dev_size;
+	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	if (sector_div(dev_size, pctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, pctx->page_size);
+	if (r)
+		goto error_kfree;
+
+	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
+	if (!pctx->page_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	a = 0;
+	for (e = 0; e < pctx->n_pages; e++) {
+		switch_page_table_write(pctx, e, a);
+		a++;
+		if (a >= pctx->dev_count)
+			a = 0;
+	}
+
+	/*
+	 * Take a reference to each device and check that it is large enough
+	 * to hold the whole target at the requested starting offset.
+	 */
+	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
+		struct dm_dev *dm;
+		sector_t bdev_size;
+		unsigned long long start;
+
+		if (kstrtoull(argv[a + 1], 10, &start) ||
+		    start != (sector_t)start) {
+			ti->error = "Invalid device starting offset";
+			r = -EINVAL;
+			n--;
+			goto error_release_n;
+		}
+		r = dm_get_device
+		    (ti, argv[a], dm_table_get_mode(ti->table), &dm);
+		if (r) {
+			ti->error = "Device lookup failed";
+			n--;
+			goto error_release_n;
+		}
+		pctx->dev_list[n].dmdev = dm;
+		pctx->dev_list[n].start = start;
+
+		bdev_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;
+
+		if (start > bdev_size || ti->len > bdev_size - start) {
+			ti->error = "Device is too small";
+			r = -EINVAL;
+			goto error_release_n;
+		}
+	}
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	ti->private = pctx;
+
+	return 0;
+
+error_release_n:		/* De-reference all devices  */
+	for (; n >= 0; n--)
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+	vfree(pctx->page_table);
+error_kfree:
+	kfree(pctx);
+
+error:
+	return r;
+}
+
+/*
+ * Destructor: release the devices, page table and context (not the dm_target).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *ctx = ti->private;
+	struct switch_dev *dev = ctx->dev_list;
+	struct switch_dev *end = dev + ctx->dev_count;
+
+	for (; dev < end; dev++)
+		dm_put_device(ti, dev->dmdev);
+	vfree(ctx->page_table);
+	kfree(ctx);
+}
+
+/* Map: redirect the bio to the device that the page table selects for it */
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *pctx = ti->private;
+	sector_t target_offset = bio->bi_sector - ti->begin;
+	struct switch_dev *dest;
+
+	dest = &pctx->dev_list[switch_get_deviceidx(pctx, target_offset)];
+
+	bio->bi_bdev = dest->dmdev->bdev;
+	bio->bi_sector = dest->start + target_offset;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers as fast as possible, because the message
+ * interface is used to load the whole table.
+ *
+ * hex_table maps each byte value to its hex digit value, or to 255 if the
+ * byte is not a hex digit.  Time to load 1000000 entries, compared with a
+ * parser that tests character ranges with conditions:
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+
+static const unsigned char hex_table[256] = {
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+};
+
+/*
+ * parse_hex - parse a run of hexadecimal digits.
+ * @string: text to parse
+ * @result: receives the parsed value, or 0 if no digit was consumed
+ * @end:    receives a pointer to the first character that was not consumed
+ *
+ * Upper- and lower-case digits are accepted.  Parsing stops at the first
+ * character that is not a hex digit, so a terminating NUL always ends the
+ * loop; callers detect malformed input by inspecting **end.
+ *
+ * No overflow check is made: digits that do not fit in sector_t are silently
+ * shifted out.
+ */
+static inline void parse_hex(const char *string, sector_t *result, const char **end)
+{
+	unsigned char d;
+	sector_t r = 0;
+
+	/* hex_table yields 0-15 for hex digits and 255 for anything else */
+	while ((d = hex_table[(unsigned char)*string]) < 16) {
+		r = (r << 4) | d;
+		string++;
+	}
+
+	*end = string;
+	*result = r;
+}
+
+/*
+ * Message "set-table" takes [<page>]:<device> arguments (both hex); an omitted
+ * <page> means the previous page plus one.  Returns 0 or -EINVAL.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *pctx = ti->private;
+	sector_t page = 0, dev;
+	const char *s;
+	unsigned arg;
+	int r = -EINVAL;
+
+	mutex_lock(&message_mutex);
+
+	if (!argc || strcasecmp(argv[0], "set-table")) {
+		DMWARN("unrecognised message received.");
+		goto unlock;
+	}
+
+	for (arg = 1; arg < argc; arg++) {
+		s = argv[arg];
+		/* A leading ':' selects the page after the previous one */
+		if (*s == ':') {
+			page++;
+		} else {
+			parse_hex(s, &page, &s);
+			if (unlikely(*s != ':'))
+				goto bad_argument;
+		}
+		/* The device number must be non-empty and end the argument */
+		s++;
+		if (unlikely(!*s))
+			goto bad_argument;
+		parse_hex(s, &dev, &s);
+		if (unlikely(*s))
+			goto bad_argument;
+		if (unlikely(page >= pctx->n_pages)) {
+			DMWARN("invalid set-table page");
+			goto unlock;
+		}
+		if (unlikely(dev >= pctx->dev_count)) {
+			DMWARN("invalid set-table device");
+			goto unlock;
+		}
+		switch_page_table_write(pctx, page, dev);
+	}
+	r = 0;
+	goto unlock;
+bad_argument:
+	DMWARN("invalid set-table argument");
+unlock:
+	mutex_unlock(&message_mutex);
+	return r;
+}
+
+/*
+ * Status: INFO reports an empty string; TABLE reports the constructor
+ * arguments "<dev_count> <page_size>" followed by " <dev> <offset>" pairs.
+ */
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *pctx = ti->private;
+	struct switch_dev *dev, *end;
+	unsigned sz = 0;	/* Used by DMEMIT() */
+
+	result[0] = '\0';
+
+	/* Only the table line carries data; every other type stays empty */
+	if (type != STATUSTYPE_TABLE)
+		return 0;
+
+	DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+
+	end = pctx->dev_list + pctx->dev_count;
+	for (dev = pctx->dev_list; dev < end; dev++) {
+		DMEMIT(" %s %llu", dev->dmdev->name,
+		       (unsigned long long)dev->start);
+	}
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Every ioctl is forwarded, unmodified, to whichever device the page table
+ * currently selects for sector 0 of the target.
+ * Returns whatever __blkdev_driver_ioctl() returns.
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned first;
+	struct dm_dev *dest;
+
+	/* Look up the device that currently serves the first page */
+	first = switch_get_deviceidx(pctx, 0);
+	dest = pctx->dev_list[first].dmdev;
+
+	return __blkdev_driver_ioctl(dest->bdev, dest->mode, cmd, arg);
+}
+
+/* Report each device's reachable area to fn; stop at the first non-zero result */
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned n;
+	int ret = 0;
+
+	/* The area starts at the device's own offset, not at ti->begin */
+	for (n = 0; !ret && n < pctx->dev_count; n++)
+		ret = fn(ti, pctx->dev_list[n].dmdev,
+			 pctx->dev_list[n].start, ti->len, data);
+
+	return ret;
+}
+
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,			/* Constructor */
+	.dtr = switch_dtr,			/* Destructor */
+	.map = switch_map,			/* Remaps each bio */
+	.message = switch_message,		/* Handles "set-table" */
+	.status = switch_status,		/* Emits the table line */
+	.ioctl = switch_ioctl,			/* Forwards to sector 0's device */
+	.iterate_devices = switch_iterate_devices,
+};
+
+/* Module entry point: register the "switch" target with device-mapper. */
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r) {
+		DMERR("dm_register_target() failed %d", r);
+		return r;
+	}
+	return 0;
+}
+
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);	/* Undo dm_switch_init() */
+}
+
+module_init(dm_switch_init);	/* Registers the switch target */
+module_exit(dm_switch_exit);	/* Unregisters the switch target */
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley at dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay at dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka at redhat.com>");
+MODULE_LICENSE("GPL");




More information about the dm-devel mailing list