[dm-devel] [PATCH] dm-switch target
Mikulas Patocka
mpatocka at redhat.com
Tue Sep 25 21:50:31 UTC 2012
Hi
This is the dm-switch target to be included in the next kernel.
It is equivalent to the last code sent by Jim Ramsay with the exception
that REQ_FLUSH processing was removed (because hardware has no write-back
cache).
Mikulas
---
dm-switch target
Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.
Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>
Signed-off-by: Jim Ramsay <jim_ramsay at dell.com>
---
drivers/md/Kconfig | 11 +
drivers/md/Makefile | 1
drivers/md/dm-switch.c | 520 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 532 insertions(+)
Index: linux-3.5.4-fast/drivers/md/Kconfig
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Kconfig 2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Kconfig 2012-09-25 22:21:56.000000000 +0200
@@ -417,4 +417,15 @@ config DM_VERITY2
source "drivers/md/enhanceio/Kconfig"
+config DM_SWITCH
+ tristate "Switch target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ ---help---
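+ This device-mapper target maps I/O to a set of underlying block
+ devices using a table of fixed-size address regions. It is meant
+ for cases where there is a large number of regions and no pattern
+ that would allow a compact description with a target such as
+ dm-stripe. The device used for each region can be changed at run
+ time by sending the target a "set-table" message.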
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-switch.
+
+ If unsure, say N.
+
endif # MD
Index: linux-3.5.4-fast/drivers/md/Makefile
===================================================================
--- linux-3.5.4-fast.orig/drivers/md/Makefile 2012-09-25 22:15:36.000000000 +0200
+++ linux-3.5.4-fast/drivers/md/Makefile 2012-09-25 22:21:56.000000000 +0200
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_ZEROED) += dm-zeroed.o
obj-$(CONFIG_DM_ENHANCEIO) += enhanceio/
+obj-$(CONFIG_DM_SWITCH) += dm-switch.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
Index: linux-3.5.4-fast/drivers/md/dm-switch.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-3.5.4-fast/drivers/md/dm-switch.c 2012-09-25 22:21:53.000000000 +0200
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2010-2012 by Dell Inc. All rights reserved.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ * file: dm-switch.c
+ * authors: Kevin_OKelley at dell.com
+ * Jim_Ramsay at dell.com
+ * Narendran_Ganapathy at dell.com
+ * mpatocka at redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ * (1) a large number of address regions
+ * (2) a fixed size equal across all address regions
+ * (3) no pattern that allows for a compact description with something like
+ * the dm-stripe target.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device-mapper.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One underlying device that the switch target can route I/O to.  The
+ * constructor fills in one of these for every <dev_path> <offset> argument
+ * pair and stores them in the dev_list[] array of struct switch_ctx.
+ */
+struct switch_dev {
+ struct dm_dev *dmdev; /* Reference obtained with dm_get_device() */
+ sector_t start; /* Sector offset added to I/O remapped to this device */
+};
+
+/*
+ * One word of the page table.  Every word packs pte_fields device indices
+ * of pte_size bits each (see struct switch_ctx).
+ */
+typedef unsigned long pt_entry;
+
+/*
+ * Switch context header: allocated by the constructor for one target
+ * instance and stored in ti->private.
+ */
+struct switch_ctx {
+ unsigned dev_count; /* Number of devices */
+ unsigned page_size; /* Page size in 512B sectors */
+ unsigned long n_pages; /* Number of pages: target length / page_size, rounded up */
+ signed char page_size_bits; /* log2 of page_size or -1 if not a power of two */
+
+ unsigned char pte_size; /* Page table entry size in bits (enough to hold any device index) */
+ unsigned char pte_fields; /* Number of entries per pt_entry */
+ signed char pte_fields_bits; /* log2 of pte_fields or -1 if not a power of two */
+ pt_entry *page_table; /* Page table, allocated with vmalloc() */
+
+ /* Array of dm devices to switch between; dev_count elements */
+ struct switch_dev dev_list[0];
+};
+
+/*
+ * Locate the page table field that belongs to a page.
+ *
+ * @pctx:  switch context
+ * @page:  page number
+ * @index: receives the index of the pt_entry word holding the field
+ * @bit:   receives the bit offset of the field inside that word
+ */
+static inline void switch_get_position(struct switch_ctx *pctx,
+				       unsigned long page,
+				       unsigned long *index,
+				       unsigned *bit)
+{
+	unsigned long word;
+	unsigned field;
+
+	if (pctx->pte_fields_bits < 0) {
+		/* Fields per word is not a power of two: divide. */
+		word = page / pctx->pte_fields;
+		field = page % pctx->pte_fields;
+	} else {
+		/* Power of two: shift and mask instead of dividing. */
+		word = page >> pctx->pte_fields_bits;
+		field = page & (pctx->pte_fields - 1);
+	}
+
+	*index = word;
+	*bit = field * pctx->pte_size;
+}
+
+/*
+ * Look up which device serves a sector.
+ *
+ * @pctx:   switch context
+ * @sector: sector offset relative to the start of the target
+ *
+ * Returns an index into pctx->dev_list[].  The result is always below
+ * pctx->dev_count; an out-of-range table value is replaced by 0.
+ */
+static inline unsigned switch_get_deviceidx(struct switch_ctx *pctx,
+					    sector_t sector)
+{
+	sector_t page = sector;
+	unsigned long word;
+	unsigned shift, dev;
+	pt_entry pte;
+
+	/* Convert the sector to a page number. */
+	if (pctx->page_size_bits < 0)
+		sector_div(page, pctx->page_size);
+	else
+		page >>= pctx->page_size_bits;
+
+	switch_get_position(pctx, page, &word, &shift);
+
+	/* Read the word exactly once; it may be rewritten concurrently. */
+	pte = ACCESS_ONCE(pctx->page_table[word]);
+	dev = (pte >> shift) & ((1 << pctx->pte_size) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(dev >= pctx->dev_count))
+		return 0;
+
+	return dev;
+}
+
+/*
+ * Store a device index in the page table.
+ *
+ * @pctx:  switch context
+ * @page:  page number whose entry is replaced
+ * @value: device index to store
+ *
+ * Only the field belonging to @page changes; the other fields packed into
+ * the same pt_entry word are preserved.
+ */
+static void switch_page_table_write(struct switch_ctx *pctx, unsigned long page,
+				    unsigned value)
+{
+	unsigned long word;
+	unsigned shift;
+	pt_entry field_mask;
+	pt_entry *slot;
+
+	switch_get_position(pctx, page, &word, &shift);
+
+	field_mask = (((pt_entry)1 << pctx->pte_size) - 1) << shift;
+	slot = &pctx->page_table[word];
+	*slot = (*slot & ~field_mask) | ((pt_entry)value << shift);
+}
+
+/*
+ * Constructor: builds the context for one "switch" target.
+ *
+ * Arguments:
+ *	<dev_count> <page_size> [<dev_path> <offset>]{dev_count}
+ *
+ *	<dev_count>	number of underlying devices, at least 1
+ *	<page_size>	size of one page (address region) in 512B sectors
+ *	<dev_path>	an underlying block device
+ *	<offset>	sector on that device which corresponds to the start
+ *			of the target
+ *
+ * The page table is initialised so that consecutive pages are assigned to
+ * the devices round-robin; it can be changed later with the "set-table"
+ * message.
+ *
+ * Returns 0 on success and stores the context in ti->private.  On failure a
+ * negative errno is returned and every device reference and allocation made
+ * so far is released; ti->error is set for the failures detected here.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	unsigned a;
+	unsigned n;
+	int r;
+	unsigned dev_count;
+	struct switch_ctx *pctx;
+	sector_t dev_size;
+	unsigned long e;
+
+	if (argc < 4) {
+		ti->error = "Insufficient arguments";
+		r = -EINVAL;
+		goto error;
+	}
+	if (kstrtouint(argv[0], 10, &dev_count) ||
+	    !dev_count ||
+	    dev_count > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+		ti->error = "Invalid device count";
+		r = -EINVAL;
+		goto error;
+	}
+	/*
+	 * Exactly one <dev_path> <offset> pair per device must follow the two
+	 * leading arguments.  A dangling extra argument is rejected too
+	 * instead of being silently ignored.
+	 */
+	if ((argc - 2) % 2 || dev_count != (argc - 2) / 2) {
+		ti->error = "Invalid argument count";
+		r = -EINVAL;
+		goto error;
+	}
+	pctx = kmalloc(sizeof(struct switch_ctx) + (dev_count * sizeof(struct switch_dev)),
+		       GFP_KERNEL);
+	if (!pctx) {
+		ti->error = "Cannot allocate redirect context";
+		r = -ENOMEM;
+		goto error;
+	}
+	pctx->dev_count = dev_count;
+	if (kstrtouint(argv[1], 10, &pctx->page_size) ||
+	    !pctx->page_size) {
+		ti->error = "Invalid page size";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* A power-of-two page size lets the I/O path shift instead of divide. */
+	if (!(pctx->page_size & (pctx->page_size - 1)))
+		pctx->page_size_bits = __ffs(pctx->page_size);
+	else
+		pctx->page_size_bits = -1;
+
+	/* Smallest field width (in bits) able to hold every device index. */
+	pctx->pte_size = 1;
+	while (pctx->pte_size < sizeof(pt_entry) * 8 &&
+	       (pt_entry)1 << pctx->pte_size < pctx->dev_count)
+		pctx->pte_size++;
+
+	pctx->pte_fields = (sizeof(pt_entry) * 8) / pctx->pte_size;
+	if (!(pctx->pte_fields & (pctx->pte_fields - 1)))
+		pctx->pte_fields_bits = __ffs(pctx->pte_fields);
+	else
+		pctx->pte_fields_bits = -1;
+
+	/* Number of pages = target length / page size, rounded up. */
+	dev_size = ti->len;
+	if (sector_div(dev_size, pctx->page_size))
+		dev_size++;
+
+	pctx->n_pages = dev_size;
+	if (pctx->n_pages != dev_size || pctx->n_pages >= ULONG_MAX) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* Number of pt_entry words = pages / fields per word, rounded up. */
+	if (sector_div(dev_size, pctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entry)) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, pctx->page_size);
+	if (r)
+		goto error_kfree;
+
+	pctx->page_table = vmalloc(dev_size * sizeof(pt_entry));
+	if (!pctx->page_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	/* Default mapping: spread the pages round-robin over the devices. */
+	a = 0;
+	for (e = 0; e < pctx->n_pages; e++) {
+		switch_page_table_write(pctx, e, a);
+		a++;
+		if (a >= pctx->dev_count)
+			a = 0;
+	}
+
+	/*
+	 * Take a reference on each device beneath the target and check that
+	 * it is large enough.  Whenever control reaches error_release_n, n is
+	 * the number of references that have been taken.
+	 */
+	for (n = 0, a = 2; n < pctx->dev_count; n++, a += 2) {
+		struct dm_dev *dm;
+		sector_t underlying_size;
+		unsigned long long start;
+
+		if (kstrtoull(argv[a + 1], 10, &start) ||
+		    start != (sector_t)start) {
+			ti->error = "Invalid device starting offset";
+			r = -EINVAL;
+			goto error_release_n;
+		}
+		r = dm_get_device
+			(ti, argv[a], dm_table_get_mode(ti->table), &dm);
+		if (r) {
+			ti->error = "Device lookup failed";
+			goto error_release_n;
+		}
+		pctx->dev_list[n].dmdev = dm;
+		pctx->dev_list[n].start = start;
+
+		underlying_size = i_size_read(dm->bdev->bd_inode) >> SECTOR_SHIFT;
+
+		/*
+		 * I/O is remapped to start + offset with offset < ti->len, so
+		 * the device must hold at least start + ti->len sectors.  The
+		 * test is written so that it cannot overflow.
+		 */
+		if (start > underlying_size ||
+		    ti->len > underlying_size - start) {
+			ti->error = "Device is too small";
+			r = -EINVAL;
+			n++;	/* device n was acquired above; release it too */
+			goto error_release_n;
+		}
+	}
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	ti->private = pctx;
+
+	return 0;
+
+error_release_n: /* De-reference all devices */
+	while (n--)
+		dm_put_device(ti, pctx->dev_list[n].dmdev);
+
+	vfree(pctx->page_table);
+error_kfree:
+	kfree(pctx);
+
+error:
+	return r;
+}
+
+/*
+ * Destructor: releases what the constructor stored in ti->private (the
+ * device references, the page table and the context itself).  The dm_target
+ * is not freed here.
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *pctx = ti->private;
+	struct switch_dev *dev = pctx->dev_list;
+	struct switch_dev *end = dev + pctx->dev_count;
+
+	for (; dev != end; dev++)
+		dm_put_device(ti, dev->dmdev);
+
+	vfree(pctx->page_table);
+	kfree(pctx);
+}
+
+/*
+ * Map function: redirects a bio to the device that the page table selects
+ * for the bio's starting sector.
+ *
+ * Always returns DM_MAPIO_REMAPPED.
+ */
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *pctx = ti->private;
+	/* Sector relative to the start of this target. */
+	sector_t rel = bio->bi_sector - ti->begin;
+	struct switch_dev *dev = &pctx->dev_list[switch_get_deviceidx(pctx, rel)];
+
+	bio->bi_bdev = dev->dmdev->bdev;
+	bio->bi_sector = dev->start + rel;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers as fast as possible, because a message is
+ * used to load the whole page table.
+ *
+ * A table-based hex parser is slightly faster than a condition-based one.
+ * Time to load 1000000 entries:
+ *
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ *
+ * hex_table is indexed by character code.  It holds the digit value (0-15)
+ * for '0'-'9', 'A'-'F' and 'a'-'f', and 255 for every other character.
+ */
+
+static const unsigned char hex_table[256] = {
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+0,1,2,3,4,5,6,7,8,9,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,10,11,12,13,14,15,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,
+255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
+};
+
+/*
+ * Parse an unsigned hexadecimal number.
+ *
+ * @string: text to parse; upper and lower case digits are accepted
+ * @result: receives the value of the digits consumed (0 if there were none)
+ * @end:    receives a pointer to the first character that was not consumed
+ *
+ * Parsing stops at the first character that is not a hex digit.  It also
+ * stops in front of a digit that would no longer fit into sector_t, so an
+ * overlong number leaves *end pointing at a hex digit rather than wrapping
+ * around silently; a caller that checks the character at *end against the
+ * terminator it expects therefore rejects such input.
+ *
+ * The disabled condition-based variant of the loop was dropped; only the
+ * table-driven parser remains.
+ */
+static inline void parse_hex(const char *string, sector_t *result, const char **end)
+{
+	/* Mask of the top four bits: if any is set, r << 4 would lose data. */
+	const sector_t top_nibble = (sector_t)0xf << (sizeof(sector_t) * 8 - 4);
+	sector_t r = 0;
+	unsigned char d;
+
+	while ((d = hex_table[(unsigned char)*string]) < 16) {
+		if (r & top_nibble)
+			break;
+		r = (r << 4) | d;
+		string++;
+	}
+
+	*end = string;
+	*result = r;
+}
+
+/*
+ * Message handler.  The only message understood is
+ *
+ *	set-table [<page>]:<device> ...
+ *
+ * where <page> and <device> are hexadecimal numbers.  Each argument stores
+ * device index <device> in the page table entry of page <page>.  When <page>
+ * is omitted, the page number used by the previous argument plus one is
+ * taken (the counter starts at 0).
+ *
+ * Arguments are applied in order; when an invalid one is found, the entries
+ * written by the arguments before it stay in effect.
+ *
+ * Returns 0 on success and -EINVAL for an unknown message or an invalid
+ * argument.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+
+	struct switch_ctx *pctx = ti->private;
+	sector_t page = 0;
+	unsigned arg;
+	int r = -EINVAL;
+
+	mutex_lock(&message_mutex);
+
+	if (!argc || strcasecmp(argv[0], "set-table")) {
+		DMWARN("unrecognised message received.");
+		goto unlock;
+	}
+
+	for (arg = 1; arg < argc; arg++) {
+		const char *cursor = argv[arg];
+		sector_t device;
+
+		if (*cursor != ':') {
+			/* Explicit page number, must be followed by ':'. */
+			parse_hex(cursor, &page, &cursor);
+			if (unlikely(*cursor != ':'))
+				goto bad_argument;
+		} else {
+			/* Page number omitted: continue with the next page. */
+			page++;
+		}
+
+		/* Skip the ':'; a device number must follow. */
+		cursor++;
+		if (unlikely(!*cursor))
+			goto bad_argument;
+
+		/* The device number must extend to the end of the argument. */
+		parse_hex(cursor, &device, &cursor);
+		if (unlikely(*cursor))
+			goto bad_argument;
+
+		if (unlikely(page >= pctx->n_pages)) {
+			DMWARN("invalid set-table page");
+			goto unlock;
+		}
+		if (unlikely(device >= pctx->dev_count)) {
+			DMWARN("invalid set-table device");
+			goto unlock;
+		}
+
+		switch_page_table_write(pctx, page, device);
+	}
+
+	r = 0;
+	goto unlock;
+
+bad_argument:
+	DMWARN("invalid set-table argument");
+unlock:
+	mutex_unlock(&message_mutex);
+	return r;
+}
+
+/*
+ * Status function.
+ *
+ * For STATUSTYPE_TABLE the result is the constructor argument list:
+ * "<dev_count> <page_size>" followed by " <dev_name> <offset>" for each
+ * device.  For every other status type the result is an empty string.
+ *
+ * Always returns 0.
+ */
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned sz = 0;	/* output position, advanced by DMEMIT() */
+	unsigned i;
+
+	result[0] = '\0';
+
+	if (type != STATUSTYPE_TABLE)
+		return 0;
+
+	DMEMIT("%u %u", pctx->dev_count, pctx->page_size);
+	for (i = 0; i < pctx->dev_count; i++) {
+		DMEMIT(" %s %llu", pctx->dev_list[i].dmdev->name,
+		       (unsigned long long)pctx->dev_list[i].start);
+	}
+
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Every ioctl is handed unchanged to the device that the page table
+ * currently selects for sector 0 of the target.  Returns whatever
+ * __blkdev_driver_ioctl() returns for that device.
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *pctx = ti->private;
+	struct switch_dev *dev = &pctx->dev_list[switch_get_deviceidx(pctx, 0)];
+
+	return __blkdev_driver_ioctl(dev->dmdev->bdev, dev->dmdev->mode,
+				     cmd, arg);
+}
+
+/*
+ * Call fn for every underlying device of the target.
+ *
+ * @ti:   the target
+ * @fn:   callout invoked once per device
+ * @data: opaque pointer passed through to fn
+ *
+ * Each device is reported with the range the target uses on it: ti->len
+ * sectors beginning at that device's own start offset.  (ti->begin is a
+ * position in the mapped device, not on the underlying one, so it must not
+ * be passed as the start.)
+ *
+ * Returns the first non-zero value returned by fn, or 0 if fn returned 0
+ * for every device.
+ */
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *pctx = ti->private;
+	unsigned n;
+	int ret;
+
+	for (n = 0; n < pctx->dev_count; n++) {
+		ret = fn(ti, pctx->dev_list[n].dmdev,
+			 pctx->dev_list[n].start, ti->len, data);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Target type descriptor for the "switch" target.  It is handed to
+ * dm_register_target() by dm_switch_init() and to dm_unregister_target() by
+ * dm_switch_exit().
+ */
+static struct target_type switch_target = {
+ .name = "switch",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = switch_ctr,
+ .dtr = switch_dtr,
+ .map = switch_map,
+ .message = switch_message,
+ .status = switch_status,
+ .ioctl = switch_ioctl,
+ .iterate_devices = switch_iterate_devices,
+};
+
+/*
+ * Module initialisation: registers the "switch" target type.
+ *
+ * Returns 0 on success or the error returned by dm_register_target().
+ *
+ * The function is only referenced through module_init() in this file, so it
+ * is static to keep it out of the global namespace.
+ */
+static int __init dm_switch_init(void)
+{
+	int r;
+
+	r = dm_register_target(&switch_target);
+	if (r)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+/*
+ * Module exit: unregisters the "switch" target type.
+ *
+ * The function is only referenced through module_exit() in this file, so it
+ * is static; __exit marks it as exit-only code, like __init on the
+ * initialisation function.
+ */
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+/* Entry points run when the module is loaded and unloaded. */
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+/* Module description, authors and license. */
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley at dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay at dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka at redhat.com>");
+MODULE_LICENSE("GPL");
More information about the dm-devel
mailing list