[dm-devel] [PATCH v2 10/11] dm: introduce zone append emulation

Damien Le Moal Damien.LeMoal at wdc.com
Thu May 20 06:25:16 UTC 2021


On 2021/05/20 15:10, Hannes Reinecke wrote:
[...]
>> +/*
>> + * First phase of BIO mapping for targets with zone append emulation:
>> + * check all BIO that change a zone writer pointer and change zone
>> + * append operations into regular write operations.
>> + */
>> +static bool dm_zone_map_bio_begin(struct mapped_device *md,
>> +				  struct bio *orig_bio, struct bio *clone)
>> +{
>> +	sector_t zone_sectors = blk_queue_zone_sectors(md->queue);
>> +	unsigned int zno = bio_zone_no(orig_bio);
>> +	unsigned long flags;
>> +	bool good_io = false;
>> +
>> +	spin_lock_irqsave(&md->zwp_offset_lock, flags);
>> +
>> +	/*
>> +	 * If the target zone is in an error state, recover by inspecting the
>> +	 * zone to get its current write pointer position. Note that since the
>> +	 * target zone is already locked, a BIO issuing context should never
>> +	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
>> +	 */
>> +	if (md->zwp_offset[zno] == DM_ZONE_INVALID_WP_OFST) {
>> +		unsigned int wp_offset;
>> +		int ret;
>> +
>> +		md->zwp_offset[zno] = DM_ZONE_UPDATING_WP_OFST;
>> +
>> +		spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
>> +		ret = dm_update_zone_wp_offset(md, zno, &wp_offset);
>> +		spin_lock_irqsave(&md->zwp_offset_lock, flags);
>> +
>> +		if (ret) {
>> +			md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST;
>> +			goto out;
>> +		}
>> +		md->zwp_offset[zno] = wp_offset;
>> +	} else if (md->zwp_offset[zno] == DM_ZONE_UPDATING_WP_OFST) {
>> +		DMWARN_LIMIT("Invalid DM_ZONE_UPDATING_WP_OFST state");
>> +		goto out;
>> +	}
>> +
>> +	switch (bio_op(orig_bio)) {
>> +	case REQ_OP_WRITE_ZEROES:
>> +	case REQ_OP_WRITE_SAME:
>> +	case REQ_OP_WRITE:
>> +		break;
>> +	case REQ_OP_ZONE_RESET:
>> +	case REQ_OP_ZONE_FINISH:
>> +		goto good;
>> +	case REQ_OP_ZONE_APPEND:
>> +		/*
>> +		 * Change zone append operations into a non-mergeable regular
>> +		 * writes directed at the current write pointer position of the
>> +		 * target zone.
>> +		 */
>> +		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
>> +			(orig_bio->bi_opf & (~REQ_OP_MASK));
>> +		clone->bi_iter.bi_sector =
>> +			orig_bio->bi_iter.bi_sector + md->zwp_offset[zno];
>> +		break;
>> +	default:
>> +		DMWARN_LIMIT("Invalid BIO operation");
>> +		goto out;
>> +	}
>> +
>> +	/* Cannot write to a full zone */
>> +	if (md->zwp_offset[zno] >= zone_sectors)
>> +		goto out;
>> +
>> +	/* Writes must be aligned to the zone write pointer */
>> +	if ((clone->bi_iter.bi_sector & (zone_sectors - 1)) != md->zwp_offset[zno])
>> +		goto out;
>> +
>> +good:
>> +	good_io = true;
>> +
>> +out:
>> +	spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
> 
> I'm not happy with the spinlock. Can't the same effect be achieved with 
> some clever READ_ONCE()/WRITE_ONCE/cmpexch magic?
> Especially as you have a separate 'zone lock' mechanism ...

hmmm... Let me see. Given that what the bio completion is relatively simple, it
may be possible. With more coffee, I amy be able to come up with something clever.

> 
>> +
>> +	return good_io;
>> +}
>> +
>> +/*
>> + * Second phase of BIO mapping for targets with zone append emulation:
>> + * update the zone write pointer offset array to account for the additional
>> + * data written to a zone. Note that at this point, the remapped clone BIO
>> + * may already have completed, so we do not touch it.
>> + */
>> +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
>> +					struct bio *orig_bio,
>> +					unsigned int nr_sectors)
>> +{
>> +	unsigned int zno = bio_zone_no(orig_bio);
>> +	blk_status_t sts = BLK_STS_OK;
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&md->zwp_offset_lock, flags);
>> +
>> +	/* Update the zone wp offset */
>> +	switch (bio_op(orig_bio)) {
>> +	case REQ_OP_ZONE_RESET:
>> +		md->zwp_offset[zno] = 0;
>> +		break;
>> +	case REQ_OP_ZONE_FINISH:
>> +		md->zwp_offset[zno] = blk_queue_zone_sectors(md->queue);
>> +		break;
>> +	case REQ_OP_WRITE_ZEROES:
>> +	case REQ_OP_WRITE_SAME:
>> +	case REQ_OP_WRITE:
>> +		md->zwp_offset[zno] += nr_sectors;
>> +		break;
>> +	case REQ_OP_ZONE_APPEND:
>> +		/*
>> +		 * Check that the target did not truncate the write operation
>> +		 * emulating a zone append.
>> +		 */
>> +		if (nr_sectors != bio_sectors(orig_bio)) {
>> +			DMWARN_LIMIT("Truncated write for zone append");
>> +			sts = BLK_STS_IOERR;
>> +			break;
>> +		}
>> +		md->zwp_offset[zno] += nr_sectors;
>> +		break;
>> +	default:
>> +		DMWARN_LIMIT("Invalid BIO operation");
>> +		sts = BLK_STS_IOERR;
>> +		break;
>> +	}
>> +
>> +	spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
> 
> You don't need the spinlock here; using WRITE_ONCE() should be sufficient.

If other references to zwp_offset use READ_ONCE(), no ?

[...]
>> +void dm_zone_endio(struct dm_io *io, struct bio *clone)
>> +{
>> +	struct mapped_device *md = io->md;
>> +	struct request_queue *q = md->queue;
>> +	struct bio *orig_bio = io->orig_bio;
>> +	unsigned long flags;
>> +	unsigned int zno;
>> +
>> +	/*
>> +	 * For targets that do not emulate zone append, we only need to
>> +	 * handle native zone-append bios.
>> +	 */
>> +	if (!dm_emulate_zone_append(md)) {
>> +		/*
>> +		 * Get the offset within the zone of the written sector
>> +		 * and add that to the original bio sector position.
>> +		 */
>> +		if (clone->bi_status == BLK_STS_OK &&
>> +		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
>> +			sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;
>> +
>> +			orig_bio->bi_iter.bi_sector +=
>> +				clone->bi_iter.bi_sector & mask;
>> +		}
>> +
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * For targets that do emulate zone append, if the clone BIO does not
>> +	 * own the target zone write lock, we have nothing to do.
>> +	 */
>> +	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
>> +		return;
>> +
>> +	zno = bio_zone_no(orig_bio);
>> +
>> +	spin_lock_irqsave(&md->zwp_offset_lock, flags);
>> +	if (clone->bi_status != BLK_STS_OK) {
>> +		/*
>> +		 * BIOs that modify a zone write pointer may leave the zone
>> +		 * in an unknown state in case of failure (e.g. the write
>> +		 * pointer was only partially advanced). In this case, set
>> +		 * the target zone write pointer as invalid unless it is
>> +		 * already being updated.
>> +		 */
>> +		if (md->zwp_offset[zno] != DM_ZONE_UPDATING_WP_OFST)
>> +			md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST;
>> +	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
>> +		/*
>> +		 * Get the written sector for zone append operation that were
>> +		 * emulated using regular write operations.
>> +		 */
>> +		if (WARN_ON_ONCE(md->zwp_offset[zno] < bio_sectors(orig_bio)))
>> +			md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST;
>> +		else
>> +			orig_bio->bi_iter.bi_sector +=
>> +				md->zwp_offset[zno] - bio_sectors(orig_bio);
>> +	}
>> +	spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
>> +
> 
> Similar comments to the spinlock here.

OK.

Thanks for the review.


-- 
Damien Le Moal
Western Digital Research






More information about the dm-devel mailing list