[dm-devel] [PATCH 3/7] block: copy offload support infrastructure

SelvaKumar S selvakuma.s1 at samsung.com
Tue Aug 17 10:14:19 UTC 2021


From: Nitesh Shetty <nj.shetty at samsung.com>

Introduce REQ_OP_COPY, a no-merge copy offload operation. Create
bio with control information as payload and submit to the device.
Larger copy operation may be divided if necessary by looking at device
limits. REQ_OP_COPY(19) is a write op and takes zone_write_lock when
submitted to zoned device.
Native copy offload is not supported for stacked devices.

Signed-off-by: Nitesh Shetty <nj.shetty at samsung.com>
Signed-off-by: SelvaKumar S <selvakuma.s1 at samsung.com>
---
 block/blk-core.c          |  84 ++++++++++++-
 block/blk-lib.c           | 252 ++++++++++++++++++++++++++++++++++++++
 block/blk-zoned.c         |   1 +
 block/bounce.c            |   1 +
 include/linux/bio.h       |   1 +
 include/linux/blk_types.h |  20 +++
 include/linux/blkdev.h    |  13 ++
 include/uapi/linux/fs.h   |  12 ++
 8 files changed, 378 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d2722ecd4d9b..541b1561b4af 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -704,6 +704,17 @@ static noinline int should_fail_bio(struct bio *bio)
 }
 ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
 
+static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
+		sector_t nr_sectors, sector_t max_sect)
+{
+	if (nr_sectors && max_sect &&
+	    (nr_sectors > max_sect || start > max_sect - nr_sectors)) {
+		handle_bad_sector(bio, max_sect);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
  * Check whether this bio extends beyond the end of the device or partition.
  * This may well happen - the kernel calls bread() without checking the size of
@@ -723,6 +734,61 @@ static inline int bio_check_eod(struct bio *bio)
 	return 0;
 }
 
+/*
+ * check for eod limits and remap ranges if needed
+ */
+static int blk_check_copy(struct bio *bio)
+{
+	struct blk_copy_payload *payload = bio_data(bio);
+	sector_t dst_max_sect, dst_start_sect, copy_size = 0;
+	sector_t src_max_sect, src_start_sect;
+	struct block_device *bd_part;
+	int i, ret = -EIO;
+
+	rcu_read_lock();
+
+	bd_part = bio->bi_bdev;
+	if (unlikely(!bd_part))
+		goto err;
+
+	dst_max_sect =  bdev_nr_sectors(bd_part);
+	dst_start_sect = bd_part->bd_start_sect;
+
+	src_max_sect = bdev_nr_sectors(payload->src_bdev);
+	src_start_sect = payload->src_bdev->bd_start_sect;
+
+	if (unlikely(should_fail_request(bd_part, bio->bi_iter.bi_size)))
+		goto err;
+
+	if (unlikely(bio_check_ro(bio)))
+		goto err;
+
+	rcu_read_unlock();
+
+	for (i = 0; i < payload->copy_nr_ranges; i++) {
+		ret = bio_check_copy_eod(bio, payload->range[i].src,
+				payload->range[i].len, src_max_sect);
+		if (unlikely(ret))
+			goto out;
+
+		payload->range[i].src += src_start_sect;
+		copy_size += payload->range[i].len;
+	}
+
+	/* check if copy length crosses eod */
+	ret = bio_check_copy_eod(bio, bio->bi_iter.bi_sector,
+				copy_size, dst_max_sect);
+	if (unlikely(ret))
+		goto out;
+
+	bio->bi_iter.bi_sector += dst_start_sect;
+	return 0;
+err:
+	rcu_read_unlock();
+out:
+	return ret;
+}
+
 /*
  * Remap block n of partition p to block n+start(p) of the disk.
  */
@@ -799,13 +865,15 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 
 	if (should_fail_bio(bio))
 		goto end_io;
-	if (unlikely(bio_check_ro(bio)))
-		goto end_io;
-	if (!bio_flagged(bio, BIO_REMAPPED)) {
-		if (unlikely(bio_check_eod(bio)))
-			goto end_io;
-		if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+	if (likely(!op_is_copy(bio->bi_opf))) {
+		if (unlikely(bio_check_ro(bio)))
 			goto end_io;
+		if (!bio_flagged(bio, BIO_REMAPPED)) {
+			if (unlikely(bio_check_eod(bio)))
+				goto end_io;
+			if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+				goto end_io;
+		}
 	}
 
 	/*
@@ -829,6 +897,10 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
 		if (!blk_queue_discard(q))
 			goto not_supported;
 		break;
+	case REQ_OP_COPY:
+		if (unlikely(blk_check_copy(bio)))
+			goto end_io;
+		break;
 	case REQ_OP_SECURE_ERASE:
 		if (!blk_queue_secure_erase(q))
 			goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 9f09beadcbe3..7fee0ae95c44 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -151,6 +151,258 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
+/*
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through cio_bio_end_io.  IO
+ * errors are propagated through cio->io_error.
+ */
+static int cio_await_completion(struct cio *cio)
+{
+	int ret = 0;
+
+	while (atomic_read(&cio->refcount)) {
+		cio->waiter = current;
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		blk_io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		cio->waiter = NULL;
+		ret = cio->io_err;
+	}
+	kvfree(cio);
+
+	return ret;
+}
+
+/*
+ * The BIO completion handler simply decrements refcount.
+ * Also wake up process, if this is the last bio to be completed.
+ *
+ * During I/O bi_private points at the cio.
+ */
+static void cio_bio_end_io(struct bio *bio)
+{
+	struct cio *cio = bio->bi_private;
+
+	if (bio->bi_status)
+		cio->io_err = bio->bi_status;
+	kvfree(page_address(bio_first_bvec_all(bio)->bv_page) +
+			bio_first_bvec_all(bio)->bv_offset);
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&cio->refcount) && cio->waiter)
+		wake_up_process(cio->waiter);
+}
+
+int blk_copy_offload_submit_bio(struct block_device *bdev,
+		struct blk_copy_payload *payload, int payload_size,
+		struct cio *cio, gfp_t gfp_mask)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct bio *bio;
+
+	bio = bio_map_kern(q, payload, payload_size, gfp_mask);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio_set_dev(bio, bdev);
+	bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;
+	bio->bi_iter.bi_sector = payload->dest;
+	bio->bi_end_io = cio_bio_end_io;
+	bio->bi_private = cio;
+	atomic_inc(&cio->refcount);
+	submit_bio(bio);
+
+	return 0;
+}
+
+/* Go through all the enrties inside user provided payload, and determine the
+ * maximum number of entries in a payload, based on device's scc-limits.
+ */
+static inline int blk_max_payload_entries(int nr_srcs, struct range_entry *rlist,
+		int max_nr_srcs, sector_t max_copy_range_sectors, sector_t max_copy_len)
+{
+	sector_t range_len, copy_len = 0, remaining = 0;
+	int ri = 0, pi = 1, max_pi = 0;
+
+	for (ri = 0; ri < nr_srcs; ri++) {
+		for (remaining = rlist[ri].len; remaining > 0; remaining -= range_len) {
+			range_len = min3(remaining, max_copy_range_sectors,
+								max_copy_len - copy_len);
+			pi++;
+			copy_len += range_len;
+
+			if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+				max_pi = max(max_pi, pi);
+				pi = 1;
+				copy_len = 0;
+			}
+		}
+	}
+
+	return max(max_pi, pi);
+}
+
+/*
+ * blk_copy_offload_scc	- Use device's native copy offload feature
+ * Go through user provide payload, prepare new payload based on device's copy offload limits.
+ */
+int blk_copy_offload_scc(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *rlist, struct block_device *dest_bdev,
+		sector_t dest, gfp_t gfp_mask)
+{
+	struct request_queue *q = bdev_get_queue(dest_bdev);
+	struct cio *cio = NULL;
+	struct blk_copy_payload *payload;
+	sector_t range_len, copy_len = 0, remaining = 0;
+	sector_t src_blk, cdest = dest;
+	sector_t max_copy_range_sectors, max_copy_len;
+	int ri = 0, pi = 0, ret = 0, payload_size, max_pi, max_nr_srcs;
+
+	cio = kzalloc(sizeof(struct cio), GFP_KERNEL);
+	if (!cio)
+		return -ENOMEM;
+	atomic_set(&cio->refcount, 0);
+
+	max_nr_srcs = q->limits.max_copy_nr_ranges;
+	max_copy_range_sectors = q->limits.max_copy_range_sectors;
+	max_copy_len = q->limits.max_copy_sectors;
+
+	max_pi = blk_max_payload_entries(nr_srcs, rlist, max_nr_srcs,
+					max_copy_range_sectors, max_copy_len);
+	payload_size = struct_size(payload, range, max_pi);
+
+	payload = kvmalloc(payload_size, gfp_mask);
+	if (!payload) {
+		ret = -ENOMEM;
+		goto free_cio;
+	}
+	payload->src_bdev = src_bdev;
+
+	for (ri = 0; ri < nr_srcs; ri++) {
+		for (remaining = rlist[ri].len, src_blk = rlist[ri].src; remaining > 0;
+						remaining -= range_len, src_blk += range_len) {
+
+			range_len = min3(remaining, max_copy_range_sectors,
+								max_copy_len - copy_len);
+			payload->range[pi].len = range_len;
+			payload->range[pi].src = src_blk;
+			pi++;
+			copy_len += range_len;
+
+			/* Submit current payload, if crossing device copy limits */
+			if ((pi == max_nr_srcs) || (copy_len == max_copy_len)) {
+				payload->dest = cdest;
+				payload->copy_nr_ranges = pi;
+				ret = blk_copy_offload_submit_bio(dest_bdev, payload,
+								payload_size, cio, gfp_mask);
+				if (ret)
+					goto free_payload;
+
+				/* reset index, length and allocate new payload */
+				pi = 0;
+				cdest += copy_len;
+				copy_len = 0;
+				payload = kvmalloc(payload_size, gfp_mask);
+				if (!payload) {
+					ret = -ENOMEM;
+					goto free_cio;
+				}
+				payload->src_bdev = src_bdev;
+			}
+		}
+	}
+
+	if (pi) {
+		payload->dest = cdest;
+		payload->copy_nr_ranges = pi;
+		ret = blk_copy_offload_submit_bio(dest_bdev, payload, payload_size, cio, gfp_mask);
+		if (ret)
+			goto free_payload;
+	}
+
+	/* Wait for completion of all IO's*/
+	ret = cio_await_completion(cio);
+
+	return ret;
+
+free_payload:
+	kvfree(payload);
+free_cio:
+	cio_await_completion(cio);
+	return ret;
+}
+
+static inline sector_t blk_copy_len(struct range_entry *rlist, int nr_srcs)
+{
+	int i;
+	sector_t len = 0;
+
+	for (i = 0; i < nr_srcs; i++) {
+		if (rlist[i].len)
+			len += rlist[i].len;
+		else
+			return 0;
+	}
+
+	return len;
+}
+
+static inline bool blk_check_offload_scc(struct request_queue *src_q,
+		struct request_queue *dest_q)
+{
+	if (src_q == dest_q && src_q->limits.copy_offload == BLK_COPY_OFFLOAD_SCC)
+		return true;
+
+	return false;
+}
+
+/*
+ * blkdev_issue_copy - queue a copy
+ * @src_bdev:	source block device
+ * @nr_srcs:	number of source ranges to copy
+ * @src_rlist:	array of source ranges
+ * @dest_bdev:	destination block device
+ * @dest:	destination in sector
+ * @gfp_mask:   memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_COPY_* flags to control behaviour
+ *
+ * Description:
+ *	Copy source ranges from source block device to destination block device.
+ *	length of a source range cannot be zero.
+ */
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *src_rlist, struct block_device *dest_bdev,
+		sector_t dest, gfp_t gfp_mask, int flags)
+{
+	struct request_queue *src_q = bdev_get_queue(src_bdev);
+	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
+	sector_t copy_len;
+	int ret = -EINVAL;
+
+	if (!src_q || !dest_q)
+		return -ENXIO;
+
+	if (!nr_srcs)
+		return -EINVAL;
+
+	if (nr_srcs >= MAX_COPY_NR_RANGE)
+		return -EINVAL;
+
+	copy_len = blk_copy_len(src_rlist, nr_srcs);
+	if (!copy_len && copy_len >= MAX_COPY_TOTAL_LENGTH)
+		return -EINVAL;
+
+	if (bdev_read_only(dest_bdev))
+		return -EPERM;
+
+	if (blk_check_offload_scc(src_q, dest_q))
+		ret = blk_copy_offload_scc(src_bdev, nr_srcs, src_rlist, dest_bdev, dest, gfp_mask);
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
+
 /**
  * __blkdev_issue_write_same - generate number of bios with same page
  * @bdev:	target blockdev
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 86fce751bb17..7643fc868521 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -67,6 +67,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
 	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 	case REQ_OP_WRITE:
+	case REQ_OP_COPY:
 		return blk_rq_zone_is_seq(rq);
 	default:
 		return false;
diff --git a/block/bounce.c b/block/bounce.c
index 05fc7148489d..d9b05aaf6e56 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -176,6 +176,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
 	switch (bio_op(bio)) {
+	case REQ_OP_COPY:
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3d67d0fbc868..068fa2e8896a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -73,6 +73,7 @@ static inline bool bio_has_data(struct bio *bio)
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
+	       bio_op(bio) == REQ_OP_COPY ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
 	       bio_op(bio) == REQ_OP_WRITE_SAME ||
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 9e392daa1d7f..1ab77176cb46 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -347,6 +347,8 @@ enum req_opf {
 	REQ_OP_ZONE_RESET	= 15,
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= 17,
+	/* copy ranges within device */
+	REQ_OP_COPY		= 19,
 
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= 34,
@@ -470,6 +472,11 @@ static inline bool op_is_discard(unsigned int op)
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }
 
+static inline bool op_is_copy(unsigned int op)
+{
+	return (op & REQ_OP_MASK) == REQ_OP_COPY;
+}
+
 /*
  * Check if a bio or request operation is a zone management operation, with
  * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
@@ -529,4 +536,17 @@ struct blk_rq_stat {
 	u64 batch;
 };
 
+struct cio {
+	atomic_t refcount;
+	blk_status_t io_err;
+	struct task_struct *waiter;     /* waiting task (NULL if none) */
+};
+
+struct blk_copy_payload {
+	struct block_device	*src_bdev;
+	sector_t		dest;
+	int			copy_nr_ranges;
+	struct range_entry	range[];
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fd4cfaadda5b..38369dff6a36 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -52,6 +52,12 @@ struct blk_keyslot_manager;
 /* Doing classic polling */
 #define BLK_MQ_POLL_CLASSIC -1
 
+/* Define copy offload options */
+enum blk_copy {
+	BLK_COPY_OFFLOAD_EMULATE = 0,
+	BLK_COPY_OFFLOAD_SCC,
+};
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -1051,6 +1057,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 		return min(q->limits.max_discard_sectors,
 			   UINT_MAX >> SECTOR_SHIFT);
 
+	if (unlikely(op == REQ_OP_COPY))
+		return q->limits.max_copy_sectors;
+
 	if (unlikely(op == REQ_OP_WRITE_SAME))
 		return q->limits.max_write_same_sectors;
 
@@ -1326,6 +1335,10 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop);
 
+int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
+		struct range_entry *src_rlist, struct block_device *dest_bdev,
+		sector_t dest, gfp_t gfp_mask, int flags);
+
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
 #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index bdf7b404b3e7..7a97b588d892 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -64,6 +64,18 @@ struct fstrim_range {
 	__u64 minlen;
 };
 
+/* Maximum no of entries supported */
+#define MAX_COPY_NR_RANGE	(1 << 12)
+
+/* maximum total copy length */
+#define MAX_COPY_TOTAL_LENGTH	(1 << 21)
+
+/* Source range entry for copy */
+struct range_entry {
+	__u64 src;
+	__u64 len;
+};
+
 /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
 #define FILE_DEDUPE_RANGE_SAME		0
 #define FILE_DEDUPE_RANGE_DIFFERS	1
-- 
2.25.1





More information about the dm-devel mailing list