[dm-devel] [PATCH v2 10/11] dm: introduce zone append emulation

Damien Le Moal Damien.LeMoal at wdc.com
Thu May 20 06:57:12 UTC 2021


On 2021/05/20 15:47, Hannes Reinecke wrote:
> On 5/20/21 8:25 AM, Damien Le Moal wrote:
>> On 2021/05/20 15:10, Hannes Reinecke wrote:
>> [...]
>>>> +/*
>>>> + * First phase of BIO mapping for targets with zone append emulation:
>>>> + * check all BIO that change a zone writer pointer and change zone
>>>> + * append operations into regular write operations.
>>>> + */
>>>> +static bool dm_zone_map_bio_begin(struct mapped_device *md,
>>>> +				  struct bio *orig_bio, struct bio *clone)
>>>> +{
>>>> +	sector_t zone_sectors = blk_queue_zone_sectors(md->queue);
>>>> +	unsigned int zno = bio_zone_no(orig_bio);
>>>> +	unsigned long flags;
>>>> +	bool good_io = false;
>>>> +
>>>> +	spin_lock_irqsave(&md->zwp_offset_lock, flags);
>>>> +
>>>> +	/*
>>>> +	 * If the target zone is in an error state, recover by inspecting the
>>>> +	 * zone to get its current write pointer position. Note that since the
>>>> +	 * target zone is already locked, a BIO issuing context should never
>>>> +	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
>>>> +	 */
>>>> +	if (md->zwp_offset[zno] == DM_ZONE_INVALID_WP_OFST) {
>>>> +		unsigned int wp_offset;
>>>> +		int ret;
>>>> +
>>>> +		md->zwp_offset[zno] = DM_ZONE_UPDATING_WP_OFST;
>>>> +
>>>> +		spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
>>>> +		ret = dm_update_zone_wp_offset(md, zno, &wp_offset);
>>>> +		spin_lock_irqsave(&md->zwp_offset_lock, flags);
>>>> +
>>>> +		if (ret) {
>>>> +			md->zwp_offset[zno] = DM_ZONE_INVALID_WP_OFST;
>>>> +			goto out;
>>>> +		}
>>>> +		md->zwp_offset[zno] = wp_offset;
>>>> +	} else if (md->zwp_offset[zno] == DM_ZONE_UPDATING_WP_OFST) {
>>>> +		DMWARN_LIMIT("Invalid DM_ZONE_UPDATING_WP_OFST state");
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	switch (bio_op(orig_bio)) {
>>>> +	case REQ_OP_WRITE_ZEROES:
>>>> +	case REQ_OP_WRITE_SAME:
>>>> +	case REQ_OP_WRITE:
>>>> +		break;
>>>> +	case REQ_OP_ZONE_RESET:
>>>> +	case REQ_OP_ZONE_FINISH:
>>>> +		goto good;
>>>> +	case REQ_OP_ZONE_APPEND:
>>>> +		/*
>>>> +		 * Change zone append operations into a non-mergeable regular
>>>> +		 * writes directed at the current write pointer position of the
>>>> +		 * target zone.
>>>> +		 */
>>>> +		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
>>>> +			(orig_bio->bi_opf & (~REQ_OP_MASK));
>>>> +		clone->bi_iter.bi_sector =
>>>> +			orig_bio->bi_iter.bi_sector + md->zwp_offset[zno];
>>>> +		break;
>>>> +	default:
>>>> +		DMWARN_LIMIT("Invalid BIO operation");
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	/* Cannot write to a full zone */
>>>> +	if (md->zwp_offset[zno] >= zone_sectors)
>>>> +		goto out;
>>>> +
>>>> +	/* Writes must be aligned to the zone write pointer */
>>>> +	if ((clone->bi_iter.bi_sector & (zone_sectors - 1)) != md->zwp_offset[zno])
>>>> +		goto out;
>>>> +
>>>> +good:
>>>> +	good_io = true;
>>>> +
>>>> +out:
>>>> +	spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
>>>
>>> I'm not happy with the spinlock. Can't the same effect be achieved with
>>> some clever READ_ONCE()/WRITE_ONCE/cmpexch magic?
>>> Especially as you have a separate 'zone lock' mechanism ...
>>
>> hmmm... Let me see. Given that what the bio completion is relatively simple, it
>> may be possible. With more coffee, I amy be able to come up with something clever.
>>
> More coffee is always a good idea :-)
> I would look at killing the intermediate state UPDATING_WP_OFST and only 
> update the pointer on endio (or if it failed).
> That way we would need to update the pointer only once if we have a 
> final state, and don't need to do the double update you are doing now.

Good point. That should work. Definitely more coffee needed :)

> 
>>>
>>>> +
>>>> +	return good_io;
>>>> +}
>>>> +
>>>> +/*
>>>> + * Second phase of BIO mapping for targets with zone append emulation:
>>>> + * update the zone write pointer offset array to account for the additional
>>>> + * data written to a zone. Note that at this point, the remapped clone BIO
>>>> + * may already have completed, so we do not touch it.
>>>> + */
>>>> +static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
>>>> +					struct bio *orig_bio,
>>>> +					unsigned int nr_sectors)
>>>> +{
>>>> +	unsigned int zno = bio_zone_no(orig_bio);
>>>> +	blk_status_t sts = BLK_STS_OK;
>>>> +	unsigned long flags;
>>>> +
>>>> +	spin_lock_irqsave(&md->zwp_offset_lock, flags);
>>>> +
>>>> +	/* Update the zone wp offset */
>>>> +	switch (bio_op(orig_bio)) {
>>>> +	case REQ_OP_ZONE_RESET:
>>>> +		md->zwp_offset[zno] = 0;
>>>> +		break;
>>>> +	case REQ_OP_ZONE_FINISH:
>>>> +		md->zwp_offset[zno] = blk_queue_zone_sectors(md->queue);
>>>> +		break;
>>>> +	case REQ_OP_WRITE_ZEROES:
>>>> +	case REQ_OP_WRITE_SAME:
>>>> +	case REQ_OP_WRITE:
>>>> +		md->zwp_offset[zno] += nr_sectors;
>>>> +		break;
>>>> +	case REQ_OP_ZONE_APPEND:
>>>> +		/*
>>>> +		 * Check that the target did not truncate the write operation
>>>> +		 * emulating a zone append.
>>>> +		 */
>>>> +		if (nr_sectors != bio_sectors(orig_bio)) {
>>>> +			DMWARN_LIMIT("Truncated write for zone append");
>>>> +			sts = BLK_STS_IOERR;
>>>> +			break;
>>>> +		}
>>>> +		md->zwp_offset[zno] += nr_sectors;
>>>> +		break;
>>>> +	default:
>>>> +		DMWARN_LIMIT("Invalid BIO operation");
>>>> +		sts = BLK_STS_IOERR;
>>>> +		break;
>>>> +	}
>>>> +
>>>> +	spin_unlock_irqrestore(&md->zwp_offset_lock, flags);
>>>
>>> You don't need the spinlock here; using WRITE_ONCE() should be sufficient.
>>
>> If other references to zwp_offset use READ_ONCE(), no ?
>>
> Why, but of course.
> If you touch one you have to touch all :-)
> 
> Cheers,
> 
> Hannes
> 


-- 
Damien Le Moal
Western Digital Research






More information about the dm-devel mailing list