[dm-devel] [PATCH 10/12] Limit bio_endio recursion

Ming Lei tom.leiming at gmail.com
Thu Apr 7 05:44:43 UTC 2016


On Thu, 7 Apr 2016 11:54:49 +0800
Ming Lei <tom.leiming at gmail.com> wrote:

> On Mon, Apr 4, 2016 at 1:06 PM, Shaun Tancheff <shaun at tancheff.com> wrote:
> > Recursive endio calls can exceed a 16k stack. Tested with a
> > 32k stack and observed:
> >
> >             Depth    Size   Location    (293 entries)
> >             -----    ----   --------
> >       0)    21520      16   __module_text_address+0x12/0x60
> >       1)    21504       8   __module_address+0x5/0x140
> >       2)    21496      24   __module_text_address+0x12/0x60
> >       3)    21472      16   is_module_text_address+0xe/0x20
> >       4)    21456       8   __kernel_text_address+0x50/0x80
> >       5)    21448     136   print_context_stack+0x5a/0xf0
> >       6)    21312     144   dump_trace+0x14c/0x300
> >       7)    21168       8   save_stack_trace+0x2f/0x50
> >       8)    21160      88   set_track+0x64/0x130
> >       9)    21072      96   free_debug_processing+0x200/0x290
> >      10)    20976     176   __slab_free+0x164/0x290
> >      11)    20800      48   kmem_cache_free+0x1b0/0x1e0
> >      12)    20752      16   mempool_free_slab+0x17/0x20
> >      13)    20736      48   mempool_free+0x2f/0x90
> >      14)    20688      16   bvec_free+0x36/0x40
> >      15)    20672      32   bio_free+0x3b/0x60
> >      16)    20640      16   bio_put+0x23/0x30
> >      17)    20624      64   end_bio_extent_writepage+0xcf/0xe0
> >      18)    20560      48   bio_endio+0x57/0x90
> >      19)    20512      48   btrfs_end_bio+0xa8/0x160
> >      20)    20464      48   bio_endio+0x57/0x90
> >      21)    20416     112   dec_pending+0x121/0x270
> >      22)    20304      64   clone_endio+0x7a/0x100
> >      23)    20240      48   bio_endio+0x57/0x90
> >     ...
> >     277)     1264      64   clone_endio+0x7a/0x100
> >     278)     1200      48   bio_endio+0x57/0x90
> >     279)     1152     112   dec_pending+0x121/0x270
> >     280)     1040      64   clone_endio+0x7a/0x100
> >     281)      976      48   bio_endio+0x57/0x90
> >     282)      928      80   blk_update_request+0x8f/0x340
> >     283)      848      80   scsi_end_request+0x33/0x1c0
> >     284)      768     112   scsi_io_completion+0xb5/0x620
> >     285)      656      48   scsi_finish_command+0xcf/0x120
> >     286)      608      48   scsi_softirq_done+0x126/0x150
> >     287)      560      24   blk_done_softirq+0x78/0x90
> >     288)      536     136   __do_softirq+0xfd/0x280
> >     289)      400      16   run_ksoftirqd+0x28/0x50
> >     290)      384      64   smpboot_thread_fn+0x105/0x160
> >     291)      320     144   kthread+0xc9/0xe0
> >     292)      176     176   ret_from_fork+0x3f/0x70
> >
> > Based on an earlier patch by Mikulas Patocka <mpatocka at redhat.com>.
> > https://lkml.org/lkml/2008/6/24/18
> 
> That link looks empty; the following thread had the discussion too:
> 
> http://linux-kernel.2935.n7.nabble.com/PATCH-1-2-Avoid-bio-endio-recursion-td306043.html
> 
> >
> > Signed-off-by: Shaun Tancheff <shaun.tancheff at seagate.com>
> > ---
> >  block/bio.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 54 insertions(+), 3 deletions(-)
> >
> > diff --git a/block/bio.c b/block/bio.c
> > index d42e69c..4ac19f6 100644
> > --- a/block/bio.c
> > +++ b/block/bio.c
> > @@ -1733,6 +1733,59 @@ static inline bool bio_remaining_done(struct bio *bio)
> >         return false;
> >  }
> >
> > +static DEFINE_PER_CPU(struct bio **, bio_end_queue) = { NULL };
> 
> The idea looks very nice, but this patch can't be applied to the current
> block-next branch.
> 
> It looks like it might be simpler to implement the approach with a percpu
> bio_list.

Cc Mikulas & Christoph

How about the following implementation with bio_list? It looks more readable
and simpler.

I just ran a recent test case of bcache over raid1 (virtio-blk, virtio-blk),
and it does work, :-)

---

diff --git a/block/bio.c b/block/bio.c
index f124a0a..b97dfe8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -68,6 +68,8 @@ static DEFINE_MUTEX(bio_slab_lock);
 static struct bio_slab *bio_slabs;
 static unsigned int bio_slab_nr, bio_slab_max;
 
+static DEFINE_PER_CPU(struct bio_list *, bio_end_list) = { NULL };
+
 static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
 	unsigned int sz = sizeof(struct bio) + extra_size;
@@ -1737,6 +1739,46 @@ static inline bool bio_remaining_done(struct bio *bio)
 	return false;
 }
 
+/* local irqs must be disabled while manipulating the percpu bio_list */
+static void unwind_bio_endio(struct bio *bio)
+{
+	struct bio_list bl_in_stack;
+	struct bio_list *bl;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	bl = this_cpu_read(bio_end_list);
+	if (bl) {
+		/* nested call: queue the bio for the outermost caller */
+		bio_list_add(bl, bio);
+		goto out;
+	}
+
+	/*
+	 * Outermost call: publish an on-stack list so that nested
+	 * bio_endio() calls queue their bios there instead of
+	 * recursing, then drain the list with irqs enabled.
+	 */
+	bl = &bl_in_stack;
+	bio_list_init(bl);
+	this_cpu_write(bio_end_list, bl);
+
+	while (bio) {
+		local_irq_restore(flags);
+
+		if (bio->bi_end_io)
+			bio->bi_end_io(bio);
+
+		local_irq_save(flags);
+		bio = bio_list_pop(bl);
+	}
+
+	this_cpu_write(bio_end_list, NULL);
+ out:
+	local_irq_restore(flags);
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1765,8 +1807,7 @@ again:
 		goto again;
 	}
 
-	if (bio->bi_end_io)
-		bio->bi_end_io(bio);
+	unwind_bio_endio(bio);
 }
 EXPORT_SYMBOL(bio_endio);
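
For anyone who wants to experiment with the pattern outside the kernel,
below is a minimal single-threaded user-space toy of the same unwinding
idea. All names are made up for illustration, and a plain flag stands in
for the percpu pointer plus irq disabling:

#include <stdio.h>

struct toy_bio {
	struct toy_bio *bi_next;	/* singly linked pending list */
	struct toy_bio *parent;		/* completing this completes parent */
	int depth;
};

static struct toy_bio *pending;
static struct toy_bio **pending_tail = &pending;
static int unwinding;			/* stands in for the percpu pointer */

static void toy_endio(struct toy_bio *bio);

static void toy_complete_one(struct toy_bio *bio)
{
	printf("completed bio at depth %d\n", bio->depth);
	if (bio->parent)
		toy_endio(bio->parent);	/* would recurse without unwinding */
}

static void toy_endio(struct toy_bio *bio)
{
	if (unwinding) {
		/* nested call: queue it for the outermost loop below */
		bio->bi_next = NULL;
		*pending_tail = bio;
		pending_tail = &bio->bi_next;
		return;
	}

	unwinding = 1;
	while (bio) {
		toy_complete_one(bio);
		bio = pending;		/* pop the next queued completion */
		if (bio) {
			pending = bio->bi_next;
			if (!pending)
				pending_tail = &pending;
		}
	}
	unwinding = 0;
}

int main(void)
{
	struct toy_bio bios[10] = { { 0 } };
	int i;

	for (i = 0; i < 10; i++)
		bios[i].depth = i;
	for (i = 1; i < 10; i++)
		bios[i].parent = &bios[i - 1];

	/* completing the deepest bio walks the whole chain iteratively */
	toy_endio(&bios[9]);
	return 0;
}

Completing bios[9] prints depths 9 down to 0 with constant stack usage;
without the queueing branch, toy_endio() and toy_complete_one() would
recurse ten frames deep.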
 


> 
> > +
> > +static struct bio *unwind_bio_endio(struct bio *bio)
> > +{
> > +       struct bio ***bio_end_queue_ptr;
> > +       struct bio *bio_queue;
> > +       struct bio *chain_bio = NULL;
> > +       int error = bio->bi_error;
> > +       unsigned long flags;
> > +
> > +       local_irq_save(flags);
> 
> If we could reuse current->bio_list for this purpose, disabling irqs could
> have been avoided, but it looks difficult, maybe impossible, to do it that
> way. It is also not realistic to introduce a new per-task variable.
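
BTW, the submission side already relies on exactly this kind of unwinding
via current->bio_list. Roughly, simplified from generic_make_request() in
block/blk-core.c of this era (checks, error handling and return-value
plumbing omitted):

blk_qc_t generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (current->bio_list) {
		/* nested submission: queue it for the outermost caller */
		bio_list_add(current->bio_list, bio);
		return BLK_QC_T_NONE;
	}

	/* outermost submission: publish an on-stack list, then drain it */
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		q->make_request_fn(q, bio);
		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL;

	return BLK_QC_T_NONE;
}

That trick works on the submission side because submission always runs in
process context, where current is stable. Completion may run from hard or
soft irq context on top of whatever task happens to be running, which is
why the completion side needs a percpu list plus irq disabling instead.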
> 
> > +       bio_end_queue_ptr = this_cpu_ptr(&bio_end_queue);
> > +
> > +       if (*bio_end_queue_ptr) {
> > +               **bio_end_queue_ptr = bio;
> > +               *bio_end_queue_ptr = &bio->bi_next;
> > +               bio->bi_next = NULL;
> > +       } else {
> > +               bio_queue = NULL;
> > +               *bio_end_queue_ptr = &bio_queue;
> 
> I suggest adding a comment noting that bio_queue is the bottom-most stack
> variable; otherwise it looks a bit tricky to understand.
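
Something like the following would do (wording is only a suggestion):

	/*
	 * bio_queue is the bottom-most stack variable: it lives in the
	 * stack frame of the outermost unwind_bio_endio() call on this
	 * CPU, and the percpu bio_end_queue pointer points into that
	 * frame so that nested calls can append to the queue.
	 */
	struct bio *bio_queue;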
> 
> > +
> > +next_bio:
> > +               if (bio->bi_end_io == bio_chain_endio) {
> > +                       struct bio *parent = bio->bi_private;
> > +
> > +                       bio_put(bio);
> > +                       chain_bio = parent;
> > +                       goto out;
> > +               }
> > +
> > +               if (bio->bi_end_io) {
> > +                       if (!bio->bi_error)
> > +                               bio->bi_error = error;
> > +                       bio->bi_end_io(bio);
> > +               }
> > +
> > +               if (bio_queue) {
> > +                       bio = bio_queue;
> > +                       bio_queue = bio->bi_next;
> > +                       if (!bio_queue)
> > +                               *bio_end_queue_ptr = &bio_queue;
> > +                       goto next_bio;
> > +               }
> > +               *bio_end_queue_ptr = NULL;
> > +       }
> > +
> > +out:
> > +
> > +       local_irq_restore(flags);
> > +
> > +       return chain_bio;
> > +}
> > +
> >  /**
> >   * bio_endio - end I/O on a bio
> >   * @bio:       bio
> > @@ -1762,9 +1815,7 @@ void bio_endio(struct bio *bio)
> >                         bio_put(bio);
> >                         bio = parent;
> >                 } else {
> > -                       if (bio->bi_end_io)
> > -                               bio->bi_end_io(bio);
> > -                       bio = NULL;
> > +                       bio = unwind_bio_endio(bio);
> >                 }
> >         }
> >  }
> > --
> > 1.9.1
> >