[dm-devel] [PATCH 11/14] dm-zoned: support arbitrary number of devices
Damien Le Moal
Damien.LeMoal at wdc.com
Sun May 31 23:54:56 UTC 2020
On 2020/05/31 22:06, Hannes Reinecke wrote:
> On 5/31/20 11:10 AM, Damien Le Moal wrote:
>> On Fri, 2020-05-29 at 19:39 +0200, Hannes Reinecke wrote:
>>> Remove the hard-coded limit of two devices and support an unlimited
>>> number of additional zoned devices.
>>> With that we need to increase the device-mapper version number to
>>> 3.0.0 as we've modified the interface.
>>>
>>> Signed-off-by: Hannes Reinecke <hare at suse.de>
>>> ---
>>> drivers/md/dm-zoned-metadata.c | 15 +++++-
>>> drivers/md/dm-zoned-target.c | 106 ++++++++++++++++++++++++-----------------
>>> 2 files changed, 75 insertions(+), 46 deletions(-)
>>>
>>> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
>>> index 044c152eb756..221163ae5f68 100644
>>> --- a/drivers/md/dm-zoned-metadata.c
>>> +++ b/drivers/md/dm-zoned-metadata.c
>>> @@ -1523,7 +1523,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>>> */
>>> zmd->sb[0].zone = dmz_get(zmd, 0);
>>>
>>> - zoned_dev = &zmd->dev[1];
>>> + for (i = 1; i < zmd->nr_devs; i++) {
>>> + zoned_dev = &zmd->dev[i];
>>> +
>>> + ret = blkdev_report_zones(zoned_dev->bdev, 0,
>>> + BLK_ALL_ZONES,
>>> + dmz_init_zone, zoned_dev);
>>> + if (ret < 0) {
>>> + DMDEBUG("(%s): Failed to report zones, error %d",
>>> + zmd->devname, ret);
>>> + dmz_drop_zones(zmd);
>>> + return ret;
>>> + }
>>> + }
>>> + return 0;
>>> }
>>>
>>> /*
>>> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
>>> index aa3d26d16441..4a51738d4b0d 100644
>>> --- a/drivers/md/dm-zoned-target.c
>>> +++ b/drivers/md/dm-zoned-target.c
>>> @@ -13,8 +13,6 @@
>>>
>>> #define DMZ_MIN_BIOS 8192
>>>
>>> -#define DMZ_MAX_DEVS 2
>>> -
>>> /*
>>> * Zone BIO context.
>>> */
>>> @@ -40,10 +38,10 @@ struct dm_chunk_work {
>>> * Target descriptor.
>>> */
>>> struct dmz_target {
>>> - struct dm_dev *ddev[DMZ_MAX_DEVS];
>>> + struct dm_dev **ddev;
>>> unsigned int nr_ddevs;
>>>
>>> - unsigned long flags;
>>> + unsigned int flags;
>>>
>>> /* Zoned block device information */
>>> struct dmz_dev *dev;
>>> @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti)
>>> struct dmz_target *dmz = ti->private;
>>> int i;
>>>
>>> - for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> + for (i = 0; i < dmz->nr_ddevs; i++) {
>>> if (dmz->ddev[i]) {
>>> dm_put_device(ti, dmz->ddev[i]);
>>> dmz->ddev[i] = NULL;
>>> @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>> struct dmz_target *dmz = ti->private;
>>> struct dmz_dev *reg_dev, *zoned_dev;
>>> struct request_queue *q;
>>> + sector_t zone_nr_sectors = 0;
>>> + int i;
>>>
>>> /*
>>> - * When we have two devices, the first one must be a regular block
>>> - * device and the second a zoned block device.
>>> + * When we have more than one device, the first one must be a
>>> + * regular block device and the others zoned block devices.
>>> */
>>> - if (dmz->ddev[0] && dmz->ddev[1]) {
>>> + if (dmz->nr_ddevs > 1) {
>>> reg_dev = &dmz->dev[0];
>>> if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) {
>>> ti->error = "Primary disk is not a regular device";
>>> return -EINVAL;
>>> }
>>> - zoned_dev = &dmz->dev[1];
>>> - if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> - ti->error = "Secondary disk is not a zoned device";
>>> - return -EINVAL;
>>> + for (i = 1; i < dmz->nr_ddevs; i++) {
>>> + zoned_dev = &dmz->dev[i];
>>> + if (zoned_dev->flags & DMZ_BDEV_REGULAR) {
>>> + ti->error = "Secondary disk is not a zoned device";
>>> + return -EINVAL;
>>> + }
>>> + q = bdev_get_queue(zoned_dev->bdev);
>>> + if (zone_nr_sectors &&
>>> + zone_nr_sectors != blk_queue_zone_sectors(q)) {
>>> + ti->error = "Zone nr sectors mismatch";
>>> + return -EINVAL;
>>> + }
>>> + zone_nr_sectors = blk_queue_zone_sectors(q);
>>> + zoned_dev->zone_nr_sectors = zone_nr_sectors;
>>> + zoned_dev->nr_zones =
>>> + blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>> }
>>> } else {
>>> reg_dev = NULL;
>>> @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti)
>>> ti->error = "Disk is not a zoned device";
>>> return -EINVAL;
>>> }
>>> + q = bdev_get_queue(zoned_dev->bdev);
>>> + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>> }
>>> - q = bdev_get_queue(zoned_dev->bdev);
>>> - zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q);
>>> - zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk);
>>>
>>> if (reg_dev) {
>>> - reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors;
>>> + sector_t zone_offset;
>>> +
>>> + reg_dev->zone_nr_sectors = zone_nr_sectors;
>>> reg_dev->nr_zones =
>>> DIV_ROUND_UP_SECTOR_T(reg_dev->capacity,
>>> reg_dev->zone_nr_sectors);
>>> - zoned_dev->zone_offset = reg_dev->nr_zones;
>>> + reg_dev->zone_offset = 0;
>>> + zone_offset = reg_dev->nr_zones;
>>> + for (i = 1; i < dmz->nr_ddevs; i++) {
>>> + dmz->dev[i].zone_offset = zone_offset;
>>> + zone_offset += dmz->dev[i].nr_zones;
>>> + }
>>> }
>>> return 0;
>>> }
>>> @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>>> int ret, i;
>>>
>>> /* Check arguments */
>>> - if (argc < 1 || argc > 2) {
>>> + if (argc < 1) {
>>> ti->error = "Invalid argument count";
>>> return -EINVAL;
>>> }
>>> @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
>>> ti->error = "Unable to allocate the zoned target descriptor";
>>> return -ENOMEM;
>>> }
>>> - dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL);
>>> + dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL);
>>> if (!dmz->dev) {
>>> ti->error = "Unable to allocate the zoned device descriptors";
>>> kfree(dmz);
>>> return -ENOMEM;
>>> }
>>> + dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL);
>>> + if (!dmz->ddev) {
>>> + ti->error = "Unable to allocate the dm device descriptors";
>>> + ret = -ENOMEM;
>>> + goto err;
>>> + }
>>> dmz->nr_ddevs = argc;
>>> +
>>> ti->private = dmz;
>>>
>>> /* Get the target zoned block device */
>>> - ret = dmz_get_zoned_device(ti, argv[0], 0, argc);
>>> - if (ret)
>>> - goto err;
>>> -
>>> - if (argc == 2) {
>>> - ret = dmz_get_zoned_device(ti, argv[1], 1, argc);
>>> - if (ret) {
>>> - dmz_put_zoned_device(ti);
>>> - goto err;
>>> - }
>>> + for (i = 0; i < argc; i++) {
>>> + ret = dmz_get_zoned_device(ti, argv[i], i, argc);
>>> + if (ret)
>>> + goto err_dev;
>>> }
>>> ret = dmz_fixup_devices(ti);
>>> - if (ret) {
>>> - dmz_put_zoned_device(ti);
>>> - goto err;
>>> - }
>>> + if (ret)
>>> + goto err_dev;
>>>
>>> /* Initialize metadata */
>>> ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata,
>>> @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti,
>>> struct dmz_target *dmz = ti->private;
>>> unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
>>> sector_t capacity;
>>> - int r;
>>> + int i, r;
>>>
>>> - capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1);
>>> - r = fn(ti, dmz->ddev[0], 0, capacity, data);
>>> - if (!r && dmz->ddev[1]) {
>>> - capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1);
>>> - r = fn(ti, dmz->ddev[1], 0, capacity, data);
>>> + for (i = 0; i < dmz->nr_ddevs; i++) {
>>> + capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
>>> + r = fn(ti, dmz->ddev[i], 0, capacity, data);
>>> + if (r)
>>> + break;
>>> }
>>> return r;
>>> }
>>> @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type,
>>> dmz_nr_zones(dmz->metadata),
>>> dmz_nr_unmap_cache_zones(dmz->metadata),
>>> dmz_nr_cache_zones(dmz->metadata));
>>> - for (i = 0; i < DMZ_MAX_DEVS; i++) {
>>> - if (!dmz->ddev[i])
>>> - continue;
>>> + for (i = 0; i < dmz->nr_ddevs; i++) {
>>> /*
>>> * For a multi-device setup the first device
>>> * contains only cache zones.
>>> @@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, status_type_t type,
>>> dev = &dmz->dev[0];
>>> format_dev_t(buf, dev->bdev->bd_dev);
>>> DMEMIT("%s", buf);
>>> - if (dmz->dev[1].bdev) {
>>> - dev = &dmz->dev[1];
>>> + for (i = 1; i < dmz->nr_ddevs; i++) {
>>> + dev = &dmz->dev[i];
>>> format_dev_t(buf, dev->bdev->bd_dev);
>>> DMEMIT(" %s", buf);
>>> }
>>> @@ -1133,7 +1149,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
>>>
>>> static struct target_type dmz_type = {
>>> .name = "zoned",
>>> - .version = {2, 0, 0},
>>> + .version = {3, 0, 0},
>>> .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
>>> .module = THIS_MODULE,
>>> .ctr = dmz_ctr,
>>
>> Looks all good to me, but thinking more about it, don't we need to add
>> a device index in the super blocks ? The reason is that if the drive
>> configuration changes between stop/start (drives removed, added or
>> changed slots), the drive names will change and while the userspace
>> will still be able to find the group of drives constituting the target
>> (using UUID), there is no obvious way to find out what the original
>> drive order was. Since the kernel side relies on the drive being passed
>> to the ctr function in the order of the mapping, we need to preserve
>> that. Or change also the kernel side to use the index in the super
>> block to put each drive in its correct dmz->dev[] slot.
>>
> Already taken care of; here's where the tertiary superblocks come in.
> Each superblock carries its own position (in the 'sb_block' field).
> This is the _absolute_ position within the entire setup, not the
> relative per-device block number.
> And it also has the absolute number of blocks in the 'nr_chunks' field.
>
> Hence we know exactly where this superblock (and, by implication, the
> zones following this superblock) should end up. And we know how large
> the entire setup will be. So we can insert the superblock at the right
> position and then check if we have enough zones for the entire
> device.
I do not get it though. Where is that checked ? At least in this patch, drives
are initialized in the order of the ctr arguments, and this loop:
+ for (i = 1; i < dmz->nr_ddevs; i++) {
+ dmz->dev[i].zone_offset = zone_offset;
+ zone_offset += dmz->dev[i].nr_zones;
+ }
in dmz_fixup_devices() sets the zone offset for each device in the same order.
So for a given chunk mapped to a zone identified by its ID, if the device order
changes, zone ID will change and the chunk will not be mapped to the correct
zone. What am I missing here ?
>
> Not sure if the dmzadm does it, though; but should be easy enough to
> implement.
>
> Cheers,
>
> Hannes
>
--
Damien Le Moal
Western Digital Research
More information about the dm-devel
mailing list