[dm-devel] [PATCH 10/15] dm-zoned: per-device reclaim
Damien Le Moal
Damien.LeMoal at wdc.com
Thu May 28 03:19:02 UTC 2020
On Wed, 2020-05-27 at 08:22 +0200, Hannes Reinecke wrote:
> Instead of having one reclaim workqueue for the entire set we should
> be allocating a reclaim workqueue per device; that will reduce
> contention and should boost performance for a multi-device setup.
>
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
> drivers/md/dm-zoned-reclaim.c | 66 +++++++++++++++++++++++++++----------------
> drivers/md/dm-zoned-target.c | 39 +++++++++++++++----------
> drivers/md/dm-zoned.h | 38 +++++++++++++------------
> 3 files changed, 86 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c
> index e9e3b730e258..09843645248a 100644
> --- a/drivers/md/dm-zoned-reclaim.c
> +++ b/drivers/md/dm-zoned-reclaim.c
> @@ -21,6 +21,8 @@ struct dmz_reclaim {
> struct dm_kcopyd_throttle kc_throttle;
> int kc_err;
>
> + int dev_idx;
> +
> unsigned long flags;
>
> /* Last target access time */
> @@ -198,8 +200,8 @@ static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
> struct dmz_metadata *zmd = zrc->metadata;
> int ret;
>
> - DMDEBUG("(%s): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
> - dmz_metadata_label(zmd),
> + DMDEBUG("(%s/%u): Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> dzone->chunk, bzone->id, dmz_weight(bzone),
> dzone->id, dmz_weight(dzone));
>
> @@ -237,8 +239,8 @@ static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
> struct dmz_metadata *zmd = zrc->metadata;
> int ret = 0;
>
> - DMDEBUG("(%s): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
> - dmz_metadata_label(zmd),
> + DMDEBUG("(%s/%u): Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> chunk, dzone->id, dmz_weight(dzone),
> bzone->id, dmz_weight(bzone));
>
> @@ -295,8 +297,8 @@ static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
> if (!szone)
> return -ENOSPC;
>
> - DMDEBUG("(%s): Chunk %u, move %s zone %u (weight %u) to %s zone %u",
> - dmz_metadata_label(zmd), chunk,
> + DMDEBUG("(%s/%u): Chunk %u, move %s zone %u (weight %u) to %s zone %u",
> + dmz_metadata_label(zmd), zrc->dev_idx, chunk,
> dmz_is_cache(dzone) ? "cache" : "rnd",
> dzone->id, dmz_weight(dzone),
> dmz_is_rnd(szone) ? "rnd" : "seq", szone->id);
> @@ -369,8 +371,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
> /* Get a data zone */
> dzone = dmz_get_zone_for_reclaim(zmd, dmz_target_idle(zrc));
> if (!dzone) {
> - DMDEBUG("(%s): No zone found to reclaim",
> - dmz_metadata_label(zmd));
> + DMDEBUG("(%s/%u): No zone found to reclaim",
> + dmz_metadata_label(zmd), zrc->dev_idx);
> return -EBUSY;
> }
>
> @@ -417,24 +419,26 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc)
> out:
> if (ret) {
> if (ret == -EINTR)
> - DMDEBUG("(%s): reclaim zone %u interrupted",
> - dmz_metadata_label(zmd), rzone->id);
> + DMDEBUG("(%s/%u): reclaim zone %u interrupted",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> + rzone->id);
> else
> - DMDEBUG("(%s): Failed to reclaim zone %u, err %d",
> - dmz_metadata_label(zmd), rzone->id, ret);
> + DMDEBUG("(%s/%u): Failed to reclaim zone %u, err %d",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> + rzone->id, ret);
> dmz_unlock_zone_reclaim(dzone);
> return ret;
> }
>
> ret = dmz_flush_metadata(zrc->metadata);
> if (ret) {
> - DMDEBUG("(%s): Metadata flush for zone %u failed, err %d",
> - dmz_metadata_label(zmd), rzone->id, ret);
> + DMDEBUG("(%s/%u): Metadata flush for zone %u failed, err %d",
> + dmz_metadata_label(zmd), zrc->dev_idx, rzone->id, ret);
> return ret;
> }
>
> - DMDEBUG("(%s): Reclaimed zone %u in %u ms",
> - dmz_metadata_label(zmd),
> + DMDEBUG("(%s/%u): Reclaimed zone %u in %u ms",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> rzone->id, jiffies_to_msecs(jiffies - start));
> return 0;
> }
> @@ -461,10 +465,20 @@ static unsigned int dmz_reclaim_percentage(struct dmz_reclaim *zrc)
> */
> static bool dmz_should_reclaim(struct dmz_reclaim *zrc, unsigned int p_unmap)
> {
> - unsigned int nr_reclaim = dmz_nr_rnd_zones(zrc->metadata);
> + unsigned int nr_reclaim;
> +
> + nr_reclaim = dmz_nr_rnd_zones(zrc->metadata);
>
> - if (dmz_nr_cache_zones(zrc->metadata))
> + if (dmz_nr_cache_zones(zrc->metadata)) {
> + /*
> + * The first device in a multi-device
> + * setup only contains cache zones, so
> + * never start reclaim there.
> + */
> + if (zrc->dev_idx == 0)
> + return false;
> nr_reclaim += dmz_nr_cache_zones(zrc->metadata);
> + }
>
> /* Reclaim when idle */
> if (dmz_target_idle(zrc) && nr_reclaim)
> @@ -488,7 +502,7 @@ static void dmz_reclaim_work(struct work_struct *work)
> {
> struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
> struct dmz_metadata *zmd = zrc->metadata;
> - unsigned int p_unmap;
> + unsigned int p_unmap, nr_unmap_rnd = 0, nr_rnd = 0;
> int ret;
>
> if (dmz_dev_is_dying(zmd))
> @@ -514,8 +528,11 @@ static void dmz_reclaim_work(struct work_struct *work)
> zrc->kc_throttle.throttle = min(75U, 100U - p_unmap / 2);
> }
>
> - DMDEBUG("(%s): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)",
> - dmz_metadata_label(zmd),
> + nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
> + nr_rnd = dmz_nr_rnd_zones(zmd);
> +
> + DMDEBUG("(%s/%u): Reclaim (%u): %s, %u%% free zones (%u/%u cache %u/%u random)",
> + dmz_metadata_label(zmd), zrc->dev_idx,
> zrc->kc_throttle.throttle,
> (dmz_target_idle(zrc) ? "Idle" : "Busy"),
> p_unmap, dmz_nr_unmap_cache_zones(zmd),
> @@ -536,7 +553,7 @@ static void dmz_reclaim_work(struct work_struct *work)
> * Initialize reclaim.
> */
> int dmz_ctr_reclaim(struct dmz_metadata *zmd,
> - struct dmz_reclaim **reclaim)
> + struct dmz_reclaim **reclaim, int idx)
> {
> struct dmz_reclaim *zrc;
> int ret;
> @@ -547,6 +564,7 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd,
>
> zrc->metadata = zmd;
> zrc->atime = jiffies;
> + zrc->dev_idx = idx;
>
> /* Reclaim kcopyd client */
> zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
> @@ -558,8 +576,8 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd,
>
> /* Reclaim work */
> INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
> - zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
> - dmz_metadata_label(zmd));
> + zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s_%d", WQ_MEM_RECLAIM,
> + dmz_metadata_label(zmd), idx);
Nit: We do not need the wq for idx == 0 && nrdevs > 1.
> if (!zrc->wq) {
> ret = -ENOMEM;
> goto err;
> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> index fec1b6a9c6f1..fc1df9714f63 100644
> --- a/drivers/md/dm-zoned-target.c
> +++ b/drivers/md/dm-zoned-target.c
> @@ -51,9 +51,6 @@ struct dmz_target {
> /* For metadata handling */
> struct dmz_metadata *metadata;
>
> - /* For reclaim */
> - struct dmz_reclaim *reclaim;
> -
> /* For chunk work */
> struct radix_tree_root chunk_rxtree;
> struct workqueue_struct *chunk_wq;
> @@ -405,14 +402,15 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
> dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
> struct dmz_metadata *zmd = dmz->metadata;
> struct dm_zone *zone;
> - int ret;
> + int i, ret;
>
> /*
> * Write may trigger a zone allocation. So make sure the
> * allocation can succeed.
> */
> if (bio_op(bio) == REQ_OP_WRITE)
> - dmz_schedule_reclaim(dmz->reclaim);
> + for (i = 0; i < dmz->nr_ddevs; i++)
> + dmz_schedule_reclaim(dmz->dev[i].reclaim);
>
> dmz_lock_metadata(zmd);
>
> @@ -432,6 +430,7 @@ static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
> if (zone) {
> dmz_activate_zone(zone);
> bioctx->zone = zone;
> + dmz_reclaim_bio_acc(zone->dev->reclaim);
> }
>
> switch (bio_op(bio)) {
> @@ -578,7 +577,6 @@ static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
>
> bio_list_add(&cw->bio_list, bio);
>
> - dmz_reclaim_bio_acc(dmz->reclaim);
> if (queue_work(dmz->chunk_wq, &cw->work))
> dmz_get_chunk_work(cw);
> out:
> @@ -823,7 +821,7 @@ static int dmz_fixup_devices(struct dm_target *ti)
> static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> {
> struct dmz_target *dmz;
> - int ret;
> + int ret, i;
>
> /* Check arguments */
> if (argc < 1 || argc > 2) {
> @@ -925,10 +923,12 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
>
> /* Initialize reclaim */
> - ret = dmz_ctr_reclaim(dmz->metadata, &dmz->reclaim);
> - if (ret) {
> - ti->error = "Zone reclaim initialization failed";
> - goto err_fwq;
> + for (i = 0; i < dmz->nr_ddevs; i++) {
We probably could start the loop from 1 here when nr_ddevs > 1 to not
initialize anything for the cache device since reclaim is not run from
it.
> + ret = dmz_ctr_reclaim(dmz->metadata, &dmz->dev[i].reclaim, i);
> + if (ret) {
> + ti->error = "Zone reclaim initialization failed";
> + goto err_fwq;
> + }
> }
>
> DMINFO("(%s): Target device: %llu 512-byte logical sectors (%llu blocks)",
> @@ -961,11 +961,13 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> static void dmz_dtr(struct dm_target *ti)
> {
> struct dmz_target *dmz = ti->private;
> + int i;
>
> flush_workqueue(dmz->chunk_wq);
> destroy_workqueue(dmz->chunk_wq);
>
> - dmz_dtr_reclaim(dmz->reclaim);
> + for (i = 0; i < dmz->nr_ddevs; i++)
> + dmz_dtr_reclaim(dmz->dev[i].reclaim);
>
> cancel_delayed_work_sync(&dmz->flush_work);
> destroy_workqueue(dmz->flush_wq);
> @@ -1034,9 +1036,11 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
> static void dmz_suspend(struct dm_target *ti)
> {
> struct dmz_target *dmz = ti->private;
> + int i;
>
> flush_workqueue(dmz->chunk_wq);
> - dmz_suspend_reclaim(dmz->reclaim);
> + for (i = 0; i < dmz->nr_ddevs; i++)
> + dmz_suspend_reclaim(dmz->dev[i].reclaim);
> cancel_delayed_work_sync(&dmz->flush_work);
> }
>
> @@ -1046,9 +1050,11 @@ static void dmz_suspend(struct dm_target *ti)
> static void dmz_resume(struct dm_target *ti)
> {
> struct dmz_target *dmz = ti->private;
> + int i;
>
> queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
> - dmz_resume_reclaim(dmz->reclaim);
> + for (i = 0; i < dmz->nr_ddevs; i++)
> + dmz_resume_reclaim(dmz->dev[i].reclaim);
> }
>
> static int dmz_iterate_devices(struct dm_target *ti,
> @@ -1109,7 +1115,10 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv,
> int r = -EINVAL;
>
> if (!strcasecmp(argv[0], "reclaim")) {
> - dmz_schedule_reclaim(dmz->reclaim);
> + int i;
> +
> + for (i = 0; i < dmz->nr_ddevs; i++)
> + dmz_schedule_reclaim(dmz->dev[i].reclaim);
> r = 0;
> } else
> DMERR("unrecognized message %s", argv[0]);
> diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
> index 983f5b5e9fa0..0cc3459f78ce 100644
> --- a/drivers/md/dm-zoned.h
> +++ b/drivers/md/dm-zoned.h
> @@ -54,6 +54,7 @@ struct dmz_reclaim;
> struct dmz_dev {
> struct block_device *bdev;
> struct dmz_metadata *metadata;
> + struct dmz_reclaim *reclaim;
>
> char name[BDEVNAME_SIZE];
> uuid_t uuid;
> @@ -229,23 +230,6 @@ static inline void dmz_activate_zone(struct dm_zone *zone)
> atomic_inc(&zone->refcount);
> }
>
> -/*
> - * Deactivate a zone. This decrement the zone reference counter
> - * indicating that all BIOs to the zone have completed when the count is 0.
> - */
> -static inline void dmz_deactivate_zone(struct dm_zone *zone)
> -{
> - atomic_dec(&zone->refcount);
> -}
> -
> -/*
> - * Test if a zone is active, that is, has a refcount > 0.
> - */
> -static inline bool dmz_is_active(struct dm_zone *zone)
> -{
> - return atomic_read(&zone->refcount);
> -}
> -
> int dmz_lock_zone_reclaim(struct dm_zone *zone);
> void dmz_unlock_zone_reclaim(struct dm_zone *zone);
> struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd, bool idle);
> @@ -272,7 +256,7 @@ int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
> /*
> * Functions defined in dm-zoned-reclaim.c
> */
> -int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc);
> +int dmz_ctr_reclaim(struct dmz_metadata *zmd, struct dmz_reclaim **zrc, int idx);
> void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
> void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
> void dmz_resume_reclaim(struct dmz_reclaim *zrc);
> @@ -285,4 +269,22 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
> bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev);
> bool dmz_check_bdev(struct dmz_dev *dmz_dev);
>
> +/*
> + * Deactivate a zone. This decrement the zone reference counter
> + * indicating that all BIOs to the zone have completed when the count is 0.
> + */
> +static inline void dmz_deactivate_zone(struct dm_zone *zone)
> +{
> + dmz_reclaim_bio_acc(zone->dev->reclaim);
> + atomic_dec(&zone->refcount);
> +}
> +
> +/*
> + * Test if a zone is active, that is, has a refcount > 0.
> + */
> +static inline bool dmz_is_active(struct dm_zone *zone)
> +{
> + return atomic_read(&zone->refcount);
> +}
> +
> #endif /* DM_ZONED_H */
Apart from the nit above, looks good.
Reviewed-by: Damien Le Moal <damien.lemoal at wdc.com>
--
Damien Le Moal
Western Digital Research
More information about the dm-devel
mailing list