[dm-devel] [PATCH] dm zoned: update atime for new buffer zones

Wed Jul 15 09:20:59 UTC 2020

On Wed, 2020-07-15 at 11:09 +0200, Hannes Reinecke wrote:
> On 7/15/20 10:49 AM, Damien Le Moal wrote:
> > On 2020/07/15 17:18, Hannes Reinecke wrote:
> > > When a new buffer zone is allocated in dmz_handle_buffered_write()
> > > we should update the 'atime' to inform reclaim that this zone has
> > > been accessed.
> > > Otherwise we end up with the pathological case where the first write
> > > allocates a new buffer zone, but the next write will start reclaim
> > > before processing the bio. As the atime is not set reclaim declares
> > > the system idle and reclaims the zone. Then the write will be processed
> > > and re-allocate the very same zone again; this repeats for every
> > > consecutive write, making for a _very_ slow mkfs.
> > > 
> > > Signed-off-by: Hannes Reinecke <hare at suse.de>
> > > ---
> > >  drivers/md/dm-zoned-target.c | 3 +++
> > >  1 file changed, 3 insertions(+)
> > > 
> > > diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
> > > index cf915009c306..b32d37bef14f 100644
> > > --- a/drivers/md/dm-zoned-target.c
> > > +++ b/drivers/md/dm-zoned-target.c
> > > @@ -297,6 +297,9 @@ static int dmz_handle_buffered_write(struct dmz_target *dmz,
> > >  	if (dmz_is_readonly(bzone))
> > >  		return -EROFS;
> > >  
> > > +	/* Tell reclaim we're doing some work here */
> > > +	dmz_reclaim_bio_acc(bzone->dev->reclaim);
> > > +
> > >  	/* Submit write */
> > >  	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
> > >  	if (ret)
> > 
> > This is without a cache device, right ? Otherwise, since the cache device has no
> > reclaim, it would not make much sense.
> > 
> > In fact, I think that the atime timestamp being attached to each device reclaim
> > structure is a problem. We do not need that since we always trigger reclaim for
> > all drives. We only want to see if the target is busy or not, so atime should be
> > attached to struct dmz_metadata.
> > 
> > That will simplify things since we will not need to care about which zone/which
> > device is being accessed to track activity. We can just have:
> > 
> > 	dmz_reclaim_bio_acc(dmz->metadata);
> > 
> > Thoughts ?
> > 
> Well, I might be off the mark with this patch, but I did run into the
> the mentioned pathological behaviour; there was exactly _one_ zone
> cached, all I/O was going into that zone, and reclaim (seemed) to be
> busy with that very zone.
> The latter is actually conjecture, as I did _not_ get any messages from
> the reclaim on that device.
> I've seen idle messages from reclaim on the other devices, but reclaim
> from one device was suspiciously silent.
> And I/O went through, but _dead_ slow. All writes, mind (that was during
> mkfs time), so I gathered it might be due to the atime accounting not
> being done correctly.

What is your drive configuration and FS on the target ?

What about this:

diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-
metadata.c
index b298fefb022e..4196ec5469bf 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -202,6 +202,9 @@ struct dmz_metadata {
        struct list_head        reserved_seq_zones_list;
 
        wait_queue_head_t       free_wq;
+
+       /* Last target access time */
+       unsigned long           atime;
 };
 
 #define dmz_zmd_info(zmd, format, args...)     \
@@ -354,6 +357,19 @@ bool dmz_dev_is_dying(struct dmz_metadata *zmd)
        return false;
 }
 
+/*
+* For BIO accounting to track the target idle time.
+*/
+void dmz_bio_acc(struct dmz_metadata *zmd)
+{
+       zmd->atime = jiffies;
+}
+
+unsigned long dmz_atime(struct dmz_metadata *zmd)
+{
+       return zmd->atime;
+}
+
 /*
  * Lock/unlock mapping table.
  * The map lock also protects all the zone lists.
@@ -2888,6 +2904,7 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int
num_dev,
        strcpy(zmd->devname, devname);
        zmd->dev = dev;
        zmd->nr_devs = num_dev;
+       zmd->atime = jiffies;
        zmd->mblk_rbtree = RB_ROOT;
        init_rwsem(&zmd->mblk_sem);
        mutex_init(&zmd->mblk_flush_lock);
diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-
reclaim.c
index 9c0ecc9568a4..f70281a1ea8b 100644
--- a/drivers/md/dm-zoned-reclaim.c
+++ b/drivers/md/dm-zoned-reclaim.c
@@ -24,9 +24,6 @@ struct dmz_reclaim {
        int                     dev_idx;
 
        unsigned long           flags;
-
-       /* Last target access time */
-       unsigned long           atime;
 };
 
 /*
@@ -355,7 +352,7 @@ static void dmz_reclaim_empty(struct dmz_reclaim
*zrc, struct dm_zone *dzone)
  */
 static inline int dmz_target_idle(struct dmz_reclaim *zrc)
 {
-       return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
+       return time_is_before_jiffies(dmz_atime(zrc->metadata) +
DMZ_IDLE_PERIOD);
 }
 
 /*
@@ -561,7 +558,6 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd,
                return -ENOMEM;
 
        zrc->metadata = zmd;
-       zrc->atime = jiffies;
        zrc->dev_idx = idx;
 
        /* Reclaim kcopyd client */
@@ -620,14 +616,6 @@ void dmz_resume_reclaim(struct dmz_reclaim *zrc)
        queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
 }
 
-/*
- * BIO accounting.
- */
-void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
-{
-       zrc->atime = jiffies;
-}
-
 /*
  * Start reclaim if necessary.
  */
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-
target.c
index 42aa5139df7c..d4785bc8341b 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -404,6 +404,9 @@ static void dmz_handle_bio(struct dmz_target *dmz,
struct dm_chunk_work *cw,
 
        dmz_lock_metadata(zmd);
 
+       /* For reclaim, to track idle time */
+       dmz_bio_acc(zmd);
+
        /*
         * Get the data zone mapping the chunk. There may be no
         * mapping for read and discard. If a mapping is obtained,
@@ -420,7 +423,6 @@ static void dmz_handle_bio(struct dmz_target *dmz,
struct dm_chunk_work *cw,
        if (zone) {
                dmz_activate_zone(zone);
                bioctx->zone = zone;
-               dmz_reclaim_bio_acc(zone->dev->reclaim);
        }
 
        switch (bio_op(bio)) {
diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h
index 22f11440b423..30eaeb2ac9df 100644
--- a/drivers/md/dm-zoned.h
+++ b/drivers/md/dm-zoned.h
@@ -211,6 +211,9 @@ unsigned int dmz_nr_chunks(struct dmz_metadata
*zmd);
 bool dmz_check_dev(struct dmz_metadata *zmd);
 bool dmz_dev_is_dying(struct dmz_metadata *zmd);
 
+void dmz_bio_acc(struct dmz_metadata *zmd);
+unsigned long dmz_atime(struct dmz_metadata *zmd);
+
 #define DMZ_ALLOC_RND          0x01
 #define DMZ_ALLOC_CACHE                0x02
 #define DMZ_ALLOC_SEQ          0x04
@@ -274,7 +277,6 @@ int dmz_ctr_reclaim(struct dmz_metadata *zmd,
struct dmz_reclaim **zrc, int idx)
 void dmz_dtr_reclaim(struct dmz_reclaim *zrc);
 void dmz_suspend_reclaim(struct dmz_reclaim *zrc);
 void dmz_resume_reclaim(struct dmz_reclaim *zrc);
-void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc);
 void dmz_schedule_reclaim(struct dmz_reclaim *zrc);
 
 /*
@@ -289,7 +291,7 @@ bool dmz_check_bdev(struct dmz_dev *dmz_dev);
  */
 static inline void dmz_deactivate_zone(struct dm_zone *zone)
 {
-       dmz_reclaim_bio_acc(zone->dev->reclaim);
+       dmz_bio_acc(zone->dev->metadata);
        atomic_dec(&zone->refcount);
 }

Does that solve the problem ? (completely untested)

> 
> Cheers,
> 
> Hannes

-- 
Damien Le Moal
Western Digital Research