[dm-devel] [PATCH 11/11] dm-zoned: metadata version 2

Hannes Reinecke hare at suse.de
Tue Apr 7 08:34:56 UTC 2020


On 4/7/20 5:22 AM, Damien Le Moal wrote:
> On 2020/04/07 2:29, Hannes Reinecke wrote:
>> Implement handling for metadata version 2. The new metadata adds
>> a label and UUID for the device mapper device, and additional UUID
>> for the underlying block devices.
>> It also allows for an additional regular drive to be used for
>> emulating random access zones. The emulated zones will be placed
>> logically in front of the zones from the zoned block device, causing
>> the superblocks and metadata to be stored on that device.
>> The first zone of the original zoned device will be used to hold
>> another, tertiary copy of the metadata. This copy carries a
>> generation number of 0 and is never updated; it's just used
>> for identification.
>> The zones are spaced equidistant, which means that any eventual
>> runt zones will be counted, too. They are marked as 'offline' and
>> ignored when selecting zones.
>>
>> Signed-off-by: Hannes Reinecke <hare at suse.de>
>> ---
>>   drivers/md/dm-zoned-metadata.c | 269 +++++++++++++++++++++++++++------
>>   drivers/md/dm-zoned-target.c   | 128 +++++++++-------
>>   drivers/md/dm-zoned.h          |   7 +-
>>   3 files changed, 304 insertions(+), 100 deletions(-)
>>
>> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
>> index e036ba4bfe57..e498e1aee915 100644
>> --- a/drivers/md/dm-zoned-metadata.c
>> +++ b/drivers/md/dm-zoned-metadata.c
>> @@ -16,7 +16,7 @@
>>   /*
>>    * Metadata version.
>>    */
>> -#define DMZ_META_VER	1
>> +#define DMZ_META_VER	2
>>   
>>   /*
>>    * On-disk super block magic.
>> @@ -69,8 +69,17 @@ struct dmz_super {
>>   	/* Checksum */
>>   	__le32		crc;			/*  48 */
>>   
>> +	/* DM-Zoned label */
>> +	u8		dmz_label[32];		/*  80 */
>> +
>> +	/* DM-Zoned UUID */
>> +	u8		dmz_uuid[16];		/*  96 */
>> +
>> +	/* Device UUID */
>> +	u8		dev_uuid[16];		/* 112 */
>> +
>>   	/* Padding to full 512B sector */
>> -	u8		reserved[464];		/* 512 */
>> +	u8		reserved[400];		/* 512 */
>>   };
>>   
>>   /*
>> @@ -135,6 +144,8 @@ struct dmz_metadata {
>>   	struct dmz_dev		*dev;
>>   
>>   	char			devname[BDEVNAME_SIZE];
>> +	char			label[BDEVNAME_SIZE];
>> +	uuid_t			uuid;
>>   
>>   	sector_t		zone_bitmap_size;
>>   	unsigned int		zone_nr_bitmap_blocks;
>> @@ -161,8 +172,9 @@ struct dmz_metadata {
>>   	/* Zone information array */
>>   	struct dm_zone		*zones;
>>   
>> -	struct dmz_sb		sb[2];
>> +	struct dmz_sb		sb[3];
>>   	unsigned int		mblk_primary;
>> +	unsigned int		sb_version;
>>   	u64			sb_gen;
>>   	unsigned int		min_nr_mblks;
>>   	unsigned int		max_nr_mblks;
>> @@ -195,16 +207,16 @@ struct dmz_metadata {
>>   };
>>   
>>   #define dmz_zmd_info(zmd, format, args...)	\
>> -	DMINFO("(%s): " format, (zmd)->devname, ## args)
>> +	DMINFO("(%s): " format, (zmd)->label, ## args)
>>   
>>   #define dmz_zmd_err(zmd, format, args...)	\
>> -	DMERR("(%s): " format, (zmd)->devname, ## args)
>> +	DMERR("(%s): " format, (zmd)->label, ## args)
>>   
>>   #define dmz_zmd_warn(zmd, format, args...)	\
>> -	DMWARN("(%s): " format, (zmd)->devname, ## args)
>> +	DMWARN("(%s): " format, (zmd)->label, ## args)
>>   
>>   #define dmz_zmd_debug(zmd, format, args...)	\
>> -	DMDEBUG("(%s): " format, (zmd)->devname, ## args)
>> +	DMDEBUG("(%s): " format, (zmd)->label, ## args)
>>   /*
>>    * Various accessors
>>    */
>> @@ -215,16 +227,41 @@ unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
>>   
>>   sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
>>   {
>> -	return (sector_t)dmz_id(zmd, zone) << zmd->zone_nr_sectors_shift;
>> +	unsigned int zone_id;
>> +
>> +	if (WARN_ON(!zone))
>> +		return 0;
>> +
>> +	zone_id = zone->id;
>> +	if (zmd->dev[0].zone_offset &&
>> +	    (zone_id >= zmd->dev[0].zone_offset))
>> +		zone_id -= zmd->dev[0].zone_offset;
>> +	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
>>   }
>>   
>>   sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
>>   {
>> -	return (sector_t)dmz_id(zmd, zone) << zmd->zone_nr_blocks_shift;
>> +	unsigned int zone_id;
>> +
>> +	if (WARN_ON(!zone))
>> +		return 0;
>> +
>> +	zone_id = zone->id;
>> +	if (zmd->dev[0].zone_offset &&
>> +	    (zone_id >= zmd->dev[0].zone_offset))
>> +		zone_id -= zmd->dev[0].zone_offset;
>> +	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
> 
> The zone_id calculation is repeated. May be add a simple helper
> dmz_dev_zone_id() for it ?
> 
Ok.
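
Something like this, perhaps (an untested sketch, using the name you
suggested):

static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd,
				    struct dm_zone *zone)
{
	unsigned int zone_id = zone->id;

	/* Zones of the zoned device sit behind the emulated zones */
	if (zmd->dev[0].zone_offset && zone_id >= zmd->dev[0].zone_offset)
		zone_id -= zmd->dev[0].zone_offset;

	return zone_id;
}

with dmz_start_sect() and dmz_start_block() then reducing to a single
shift of dmz_dev_zone_id(zmd, zone) by zone_nr_sectors_shift resp.
zone_nr_blocks_shift.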

>>   }
>>   
>>   struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone)
>>   {
>> +	if (WARN_ON(!zone))
>> +		return &zmd->dev[0];
>> +
>> +	if (zmd->dev[0].zone_offset &&
>> +	    zone->id < zmd->dev[0].zone_offset)
>> +			return &zmd->dev[1];
> 
> Isn't that return &zmd->dev[0]; ? if zone->id < zmd->dev[0].zone_offset then you
> are within the first dev, no ?
> 
Yes, but the first device is the second :-)
Remember, dev[0] is the zoned device, and dev[1] is the regular device.
But dev[1] is arranged in front of dev[0] :-)

>> +
>>   	return &zmd->dev[0];
> 
> And this would be return &zmd->dev[1];
> 
No. See above.

We could possibly make it position independent (i.e. so that the order 
of devices wouldn't matter), but it took me long enough to decide on a 
remotely sane layout (and to get it to work ...) that I didn't attempt it.
Hence, for now, dev[0] is the zoned device and dev[1] is the regular 
device; dev[0] carries a zone offset so that its zones are arranged 
after the emulated zones from dev[1].
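
Roughly (an illustration, with N = dev[1].nr_zones and M = dev[0].nr_zones):

  zone id:  0 ............... N-1 | N .................. N+M-1
            +--------------------+---------------------------+
            | dev[1] (regular)   | dev[0] (zoned)            |
            | emulated rnd zones,| zone_offset = N; its      |
            | superblocks and    | first zone holds the      |
            | metadata           | tertiary metadata copy    |
            +--------------------+---------------------------+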

>>   }
>>   
>> @@ -280,7 +317,7 @@ unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd)
>>   
>>   const char *dmz_metadata_label(struct dmz_metadata *zmd)
>>   {
>> -	return (const char *)zmd->devname;
>> +	return (const char *)zmd->label;
>>   }
>>   
>>   bool dmz_check_dev(struct dmz_metadata *zmd)
>> @@ -687,6 +724,9 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op,
>>   	struct bio *bio;
>>   	int ret;
>>   
>> +	if (WARN_ON(!dev))
>> +		return -EIO;
>> +
>>   	if (dmz_bdev_is_dying(dev))
>>   		return -EIO;
>>   
>> @@ -711,7 +751,8 @@ static int dmz_rdwr_block(struct dmz_dev *dev, int op,
>>    */
>>   static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
>>   {
>> -	sector_t block = zmd->sb[set].block;
>> +	unsigned int sb_zid = dmz_id(zmd, zmd->sb[set].zone);
>> +	sector_t sb_block = sb_zid << zmd->zone_nr_blocks_shift;
>>   	struct dmz_mblock *mblk = zmd->sb[set].mblk;
>>   	struct dmz_super *sb = zmd->sb[set].sb;
>>   	struct dmz_dev *dev = zmd->sb[set].dev;
>> @@ -719,11 +760,17 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
>>   	int ret;
>>   
>>   	sb->magic = cpu_to_le32(DMZ_MAGIC);
>> -	sb->version = cpu_to_le32(DMZ_META_VER);
>> +
>> +	sb->version = cpu_to_le32(zmd->sb_version);
>> +	if (zmd->sb_version > 1) {
>> +		uuid_copy((uuid_t *)sb->dmz_uuid, &zmd->uuid);
>> +		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
>> +		uuid_copy((uuid_t *)sb->dev_uuid, &dev->uuid);
>> +	}
>>   
>>   	sb->gen = cpu_to_le64(sb_gen);
>>   
>> -	sb->sb_block = cpu_to_le64(block);
>> +	sb->sb_block = cpu_to_le64(sb_block);
>>   	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
>>   	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
>>   	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
>> @@ -734,7 +781,8 @@ static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
>>   	sb->crc = 0;
>>   	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
>>   
>> -	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, block, mblk->page);
>> +	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
>> +			     mblk->page);
>>   	if (ret == 0)
>>   		ret = blkdev_issue_flush(dev->bdev, GFP_NOIO, NULL);
>>   
>> @@ -915,6 +963,23 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set)
>>   	u32 crc, stored_crc;
>>   	u64 gen;
>>   
>> +	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
>> +		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
>> +			    DMZ_MAGIC, le32_to_cpu(sb->magic));
>> +		return -ENXIO;
>> +	}
>> +
>> +	zmd->sb_version = le32_to_cpu(sb->version);
>> +	if (zmd->sb_version > DMZ_META_VER) {
>> +		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
>> +			    DMZ_META_VER, zmd->sb_version);
>> +		return -EINVAL;
>> +	}
>> +	if ((zmd->sb_version < 1) && (set == 2)) {
>> +		dmz_dev_err(dev, "Tertiary superblocks are not supported");
>> +		return -EINVAL;
>> +	}
>> +
>>   	gen = le64_to_cpu(sb->gen);
>>   	stored_crc = le32_to_cpu(sb->crc);
>>   	sb->crc = 0;
>> @@ -925,18 +990,41 @@ static int dmz_check_sb(struct dmz_metadata *zmd, unsigned int set)
>>   		return -ENXIO;
>>   	}
>>   
>> -	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
>> -		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
>> -			    DMZ_MAGIC, le32_to_cpu(sb->magic));
>> -		return -ENXIO;
>> -	}
>> +	if (zmd->sb_version > 1) {
>> +		if (uuid_is_null((uuid_t *)sb->dmz_uuid)) {
>> +			dmz_dev_err(dev, "sb%d: NULL DM-Zoned uuid", set);
>> +			return -ENXIO;
>> +		} else if (uuid_is_null(&zmd->uuid)) {
>> +			uuid_copy(&zmd->uuid, (uuid_t *)sb->dmz_uuid);
>> +		} else if (!uuid_equal(&zmd->uuid, (uuid_t *)sb->dmz_uuid)) {
>> +			dmz_dev_err(dev, "sb%d: mismatching DM-Zoned uuid, "
>> +				    "is %pUl expected %pUl", set,
>> +				    (uuid_t *)sb->dmz_uuid, &zmd->uuid);
>> +			return -ENXIO;
>> +		}
>> +		if (!strlen(zmd->label))
>> +			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
> 
> We should probably ensure that label is always 0 terminated here. If the
> metadata label uses all BDEVNAME_SIZE characters, zmd->label will too since it
> is a BDEVNAME_SIZE long string. Make it BDEVNAME_SIZE + 1 long ?
> 
I would rely on the general usage here.
BDEVNAME_SIZE is a global constant, used in several places,
so I guess everyone will have the same issue.
But I'll check what the general usage pattern is.
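
If it turns out that termination isn't guaranteed, your suggestion
would amount to something like this (sketch):

	/* in struct dmz_metadata: one spare byte for NUL termination */
	char	label[BDEVNAME_SIZE + 1];

	/* in dmz_check_sb(): */
	memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
	zmd->label[BDEVNAME_SIZE] = '\0';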

>> +		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
>> +			dmz_dev_err(dev, "sb%d: mismatching DM-Zoned label, "
>> +				    "is %s expected %s",
>> +				    set, sb->dmz_label, zmd->label);
>> +			return -ENXIO;
>> +		}
>> +		if (uuid_is_null((uuid_t *)sb->dev_uuid)) {
>> +			dmz_dev_err(dev, "sb%d: NULL device uuid", set);
>> +			return -ENXIO;
>> +		}
>> +		uuid_copy(&dev->uuid, (uuid_t *)sb->dev_uuid);
>>   
>> -	if (le32_to_cpu(sb->version) != DMZ_META_VER) {
>> -		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
>> -			    DMZ_META_VER, le32_to_cpu(sb->version));
>> -		return -ENXIO;
>> +		if (set == 2) {
>> +			if (gen != 0) {
>> +				dmz_dev_err(dev, "sb%d: Invalid generation %llu",
>> +					    set, gen);
>> +				return -ENXIO;
>> +			}
>> +			return 0;
>> +		}
>>   	}
>> -
>>   	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
>>   		>> zmd->zone_nr_blocks_shift;
>>   	if (!nr_meta_zones ||
>> @@ -1185,21 +1273,37 @@ static int dmz_load_sb(struct dmz_metadata *zmd)
>>   		      "Using super block %u (gen %llu)",
>>   		      zmd->mblk_primary, zmd->sb_gen);
>>   
>> +	if ((zmd->sb_version > 1) && zmd->sb[2].zone) {
>> +		zmd->sb[2].block = dmz_start_block(zmd, zmd->sb[2].zone);
>> +		zmd->sb[2].dev = dmz_zone_to_dev(zmd, zmd->sb[2].zone);
>> +		ret = dmz_get_sb(zmd, 2);
>> +		if (ret) {
>> +			dmz_dev_err(zmd->sb[2].dev,
>> +				    "Read tertiary super block failed");
>> +			return ret;
>> +		}
>> +		ret = dmz_check_sb(zmd, 2);
>> +		if (ret == -EINVAL)
>> +			return ret;
>> +	}
>>   	return 0;
>>   }
>>   
>>   /*
>>    * Initialize a zone descriptor.
>>    */
>> -static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
>> +static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
>>   {
>>   	struct dmz_metadata *zmd = data;
>> +	int idx = num + zmd->dev[0].zone_offset;
>>   	struct dm_zone *zone = &zmd->zones[idx];
>> -	struct dmz_dev *dev = zmd->dev;
>>   
>> -	/* Ignore the eventual last runt (smaller) zone */
>>   	if (blkz->len != zmd->zone_nr_sectors) {
>> -		if (blkz->start + blkz->len == dev->capacity)
>> +		if (zmd->sb_version > 1) {
>> +			/* Ignore the eventual runt (smaller) zone */
>> +			set_bit(DMZ_OFFLINE, &zone->flags);
>> +			return 0;
>> +		} else if (blkz->start + blkz->len == dev->capacity)
>>   			return 0;
>>   		return -ENXIO;
>>   	}
>> @@ -1235,10 +1339,14 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data)
>>   		if (dmz_is_rnd(zone)) {
>>   			zmd->nr_rnd_zones++;
>>   			if (!zmd->sb[0].zone) {
>> -				/* Super block zone */
>> +				/* Primary super block zone */
>>   				zmd->sb[0].zone = zone;
>>   			}
>>   		}
>> +		if (zmd->dev[1].bdev && !zmd->sb[2].zone) {
>> +			/* Tertiary superblock zone */
>> +			zmd->sb[2].zone = zone;
>> +		}
>>   	}
>>   
>>   	return 0;
>> @@ -1259,11 +1367,10 @@ static void dmz_drop_zones(struct dmz_metadata *zmd)
>>    */
>>   static int dmz_init_zones(struct dmz_metadata *zmd)
>>   {
>> -	struct dmz_dev *dev = &zmd->dev[0];
>>   	int ret;
>>   
>>   	/* Init */
>> -	zmd->zone_nr_sectors = dev->zone_nr_sectors;
>> +	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
>>   	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
>>   	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
>>   	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
>> @@ -1274,7 +1381,14 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>>   					DMZ_BLOCK_SIZE_BITS);
>>   
>>   	/* Allocate zone array */
>> -	zmd->nr_zones = dev->nr_zones;
>> +	zmd->nr_zones = zmd->dev[0].nr_zones;
>> +	if (zmd->dev[1].bdev)
>> +		zmd->nr_zones += zmd->dev[1].nr_zones;
>> +
>> +	if (!zmd->nr_zones) {
>> +		DMERR("(%s): No zones found", zmd->devname);
>> +		return -ENXIO;
>> +	}
>>   	zmd->zones = kcalloc(zmd->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
>>   	if (!zmd->zones)
>>   		return -ENOMEM;
>> @@ -1282,13 +1396,43 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
>>   	DMINFO("(%s): Using %zu B for zone information",
>>   	       zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);
>>   
>> +	if (zmd->dev[1].bdev) {
>> +		int idx;
>> +		sector_t zone_offset = 0;
>> +
>> +		for(idx = 0; idx < zmd->dev[1].nr_zones; idx++) {
>> +			struct dm_zone *zone = &zmd->zones[idx];
>> +
>> +			INIT_LIST_HEAD(&zone->link);
>> +			atomic_set(&zone->refcount, 0);
>> +			zone->id = idx;
>> +			zone->chunk = DMZ_MAP_UNMAPPED;
>> +			set_bit(DMZ_RND, &zone->flags);
>> +			zone->wp_block = 0;
>> +			zmd->nr_rnd_zones++;
>> +			zmd->nr_useable_zones++;
>> +			if (zmd->dev[1].capacity - zone_offset <
>> +			    zmd->zone_nr_sectors) {
>> +				/* Disable runt zone */
>> +				set_bit(DMZ_OFFLINE, &zone->flags);
>> +				break;
>> +			}
>> +			zone_offset += zmd->zone_nr_sectors;
>> +			if (!zmd->sb[0].zone) {
>> +				/* Primary superblock zone */
>> +				zmd->sb[0].zone = zone;
>> +			}
>> +		}
>> +		zmd->dev[0].zone_offset = zmd->dev[1].nr_zones;
> 
> This is assuming that dev[1] always is a regular device. What if this is a zoned
> device ? E.g. do we want to prevent the case: zoned device with rnd zoned +
> zoned device with no rnd zones ? I do not think it is an interesting use case,
> but I do not see any reason for preventing it either as I do not think this will
> change anything to the code beside the init zones. Thoughts ?
> 
It might work if the zoned device doesn't contain any random write 
zones. The current model assumes that the metadata zones are located at 
the (logical) start of the zones, and the bitmap covers all zones after 
the metadata.
For V2 I had to do some sleight of hand here for runt zones and the 
tertiary metadata zone; they are marked specifically (as offline or 
metadata zone, respectively) so that dmz_alloc_zone() will ignore them 
when selecting a new zone from the bitmap.
But the important bit is that the random and sequential write zones are 
assumed to be consecutive; any device compound which can be arranged 
like that should work.
Which is why I checked for !zoned here; all zoned devices I've seen so 
far _do_ have random write zones at the start, so the chunk allocator 
wouldn't readily work with them as a second device, and I didn't want 
to go into the details of figuring out which zoned devices match and 
which don't.

Plus, such a setup really would be more like a RAID0 compound, which 
raises the question of whether it shouldn't rather be handled by RAID0 
directly ...

>> +	}
>> +
>>   	/*
>>   	 * Get zone information and initialize zone descriptors.  At the same
>>   	 * time, determine where the super block should be: first block of the
>>   	 * first randomly writable zone.
>>   	 */
>> -	ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone,
>> -				  zmd);
>> +	ret = blkdev_report_zones(zmd->dev[0].bdev, 0, BLK_ALL_ZONES,
>> +				  dmz_init_zone, zmd);
>>   	if (ret < 0) {
>>   		dmz_drop_zones(zmd);
>>   		return ret;
>> @@ -1325,6 +1469,9 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
>>   	unsigned int noio_flag;
>>   	int ret;
>>   
>> +	if (dev->flags & DMZ_BDEV_REGULAR)
>> +		return 0;
>> +
>>   	/*
>>   	 * Get zone information from disk. Since blkdev_report_zones() uses
>>   	 * GFP_KERNEL by default for memory allocations, set the per-task
>> @@ -1741,7 +1888,8 @@ struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chu
>>   		/* Allocate a random zone */
>>   		dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>>   		if (!dzone) {
>> -			if (dmz_bdev_is_dying(zmd->dev)) {
>> +			if (dmz_bdev_is_dying(&zmd->dev[0]) ||
>> +			    dmz_bdev_is_dying(&zmd->dev[1])) {
>>   				dzone = ERR_PTR(-EIO);
>>   				goto out;
>>   			}
>> @@ -1842,7 +1990,8 @@ struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
>>   	/* Allocate a random zone */
>>   	bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
>>   	if (!bzone) {
>> -		if (dmz_bdev_is_dying(zmd->dev)) {
>> +		if (dmz_bdev_is_dying(&zmd->dev[0]) ||
>> +		    dmz_bdev_is_dying(&zmd->dev[1])) {
>>   			bzone = ERR_PTR(-EIO);
>>   			goto out;
>>   		}
>> @@ -2480,18 +2629,34 @@ void dmz_print_dev(struct dmz_metadata *zmd, int num)
>>   {
>>   	struct dmz_dev *dev = &zmd->dev[num];
>>   
>> -	dmz_dev_info(dev, "Host-%s zoned block device",
>> -		     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
>> -		     "aware" : "managed");
>> -	dmz_dev_info(dev, "  %llu 512-byte logical sectors",
>> -		     (u64)dev->capacity);
>> -	dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
>> -		     dev->nr_zones, (u64)zmd->zone_nr_sectors);
>> +	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
>> +		dmz_dev_info(dev, "Regular block device");
>> +	else
>> +		dmz_dev_info(dev, "Host-%s zoned block device",
>> +			     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
>> +			     "aware" : "managed");
>> +	if (zmd->sb_version > 1) {
>> +		sector_t sector_offset =
>> +			dev->zone_offset << zmd->zone_nr_sectors_shift;
>> +
>> +		dmz_dev_info(dev, "  uuid %pUl", &dev->uuid);
>> +		dmz_dev_info(dev, "  %llu 512-byte logical sectors (offset %llu)",
>> +			     (u64)dev->capacity, (u64)sector_offset);
>> +		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors (offset %llu)",
>> +			     dev->nr_zones, (u64)zmd->zone_nr_sectors,
>> +			     (u64)dev->zone_offset);
>> +	} else {
>> +		dmz_dev_info(dev, "  %llu 512-byte logical sectors",
>> +			     (u64)dev->capacity);
>> +		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
>> +			     dev->nr_zones, (u64)zmd->zone_nr_sectors);
>> +	}
>>   }
>>   /*
>>    * Initialize the zoned metadata.
>>    */
>> -int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata,
>> +int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
>> +		     struct dmz_metadata **metadata,
>>   		     const char *devname)
>>   {
>>   	struct dmz_metadata *zmd;
>> @@ -2544,7 +2709,13 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata,
>>   			goto err;
>>   		set_bit(DMZ_META, &zone->flags);
>>   	}
>> -
>> +	if (zmd->sb[2].zone) {
>> +		zid = dmz_id(zmd, zmd->sb[2].zone);
>> +		zone = dmz_get(zmd, zid);
>> +		if (!zone)
>> +			goto err;
>> +		set_bit(DMZ_META, &zone->flags);
>> +	}
>>   	/* Load mapping table */
>>   	ret = dmz_load_mapping(zmd);
>>   	if (ret)
>> @@ -2569,8 +2740,14 @@ int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata,
>>   		goto err;
>>   	}
>>   
>> -	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", DMZ_META_VER);
>> +	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
>> +	if (zmd->sb_version > 1) {
>> +		dmz_zmd_info(zmd, "DM UUID %pUl", &zmd->uuid);
>> +		dmz_zmd_info(zmd, "DM Label %s", zmd->label);
>> +	}
>>   	dmz_print_dev(zmd, 0);
>> +	if (zmd->dev[1].bdev)
>> +		dmz_print_dev(zmd, 1);
>>   
>>   	dmz_zmd_info(zmd, "  %u zones of %llu 512-byte logical sectors",
>>   		     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
>> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
>> index ccf90608f434..4cfb34b38659 100644
>> --- a/drivers/md/dm-zoned-target.c
>> +++ b/drivers/md/dm-zoned-target.c
>> @@ -38,7 +38,7 @@ struct dm_chunk_work {
>>    * Target descriptor.
>>    */
>>   struct dmz_target {
>> -	struct dm_dev		*ddev;
>> +	struct dm_dev		*ddev[2];
> 
> Form the code above, ddev[1] is the regular device and ddev[0] the zoned device.
> I would really prefer to have that reversed as the ddev array index would
> increase together with target sector/zone id instead of being reversed. That
> would be far less confusing I think.
> 
This was done to keep compatibility with the original version, which 
had the zoned device as the first argument.

But I'll see how large the code churn would be when re-arranging the 
devices.

>>   
>>   	unsigned long		flags;
>>   
>> @@ -223,6 +223,9 @@ static int dmz_handle_read(struct dmz_target *dmz, struct dmz_dev *dev,
>>   		}
>>   
>>   		if (nr_blocks) {
>> +			if (rzone != zone)
>> +				dev = dmz_zone_to_dev(zmd, rzone);
>> +
>>   			/* Valid blocks found: read them */
>>   			nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
>>   			ret = dmz_submit_bio(dmz, dev, rzone, bio,
>> @@ -298,6 +301,7 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
>>   		return -EROFS;
>>   
>>   	/* Submit write */
>> +	dev = dmz_zone_to_dev(zmd, bzone);
>>   	ret = dmz_submit_bio(dmz, dev, bzone, bio, chunk_block, nr_blocks);
>>   	if (ret)
>>   		return ret;
>> @@ -591,6 +595,10 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
>>    */
>>   bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
>>   {
>> +	/* Device not configured, no error */
>> +	if (!dmz_dev->bdev)
>> +		return false;
>> +
>>   	if (dmz_dev->flags & DMZ_BDEV_DYING)
>>   		return true;
>>   
>> @@ -698,60 +706,47 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
>>   /*
>>    * Get zoned device information.
>>    */
>> -static int dmz_get_zoned_device(struct dm_target *ti, char *path)
>> +static int dmz_get_zoned_device(struct dm_target *ti, char *path,
>> +				struct dmz_dev *dev, int num)
>>   {
>>   	struct dmz_target *dmz = ti->private;
>>   	struct request_queue *q;
>> -	struct dmz_dev *dev;
>> -	sector_t aligned_capacity;
>>   	int ret;
>>   
>>   	/* Get the target device */
>> -	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
>> +	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table),
>> +			    &dmz->ddev[num]);
>>   	if (ret) {
>>   		ti->error = "Get target device failed";
>> -		dmz->ddev = NULL;
>> +		dmz->ddev[num] = NULL;
>>   		return ret;
>>   	}
>>   
>> -	dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
>> -	if (!dev) {
>> -		ret = -ENOMEM;
>> -		goto err;
>> -	}
>> -
>> -	dev->bdev = dmz->ddev->bdev;
>> +	dev->bdev = dmz->ddev[num]->bdev;
>>   	(void)bdevname(dev->bdev, dev->name);
>>   
>> -	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
>> -		ti->error = "Not a zoned block device";
>> -		ret = -EINVAL;
>> -		goto err;
>> -	}
>> +	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
>> +		dev->flags = DMZ_BDEV_REGULAR;
> 
> Do we want to disallow regular block dev for num == 1 (second device) ?
> Or we actually do not care about the order in which devices are obtained from
> the ctr arguments ?
> 
For now, we do.
I'll need to do a code audit to ensure we can handle an arbitrary order 
of devices.

But possibly in a follow-up patchset; it took me long enough to get 
this one figured out :-(


Thanks for your review!

Cheers,

Hannes
-- 
Dr. Hannes Reinecke            Teamlead Storage & Networking
hare at suse.de                               +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer




