<div></div><div style="color:#000; font-size: 14px;font-family: arial;"><div><p class="p1">From 164a390febb03ec91355f2132d2430f1bae7d6c2 Mon Sep 17 00:00:00 2001</p><p class="p1">From: Minfei Huang <huangminfei@ucloud.cn></p><p class="p1">Date: Sat, 28 Jun 2014 08:26:20 +0800</p><p class="p1">Subject: [PATCH] dm-io: Fix a race condition in the wake up code for sync_io</p><p class="p2"><br></p><p class="p1">There's a race condition between the atomic_dec_and_test(&io->count)</p><p class="p1">in dec_count() and the waking of the sync_io() thread.  If the thread</p><p class="p1">is spuriously woken immediately after the decrement it may exit,</p><p class="p1">making the on-stack io struct invalid, yet the dec_count could</p><p class="p1">still be using it.</p><p class="p2"><br></p><p class="p1">There are smaller fixes than the one here (eg, just take the io object</p><p class="p1">off the stack).  But I feel this code could use a clean up.</p><p class="p2"><br></p><p class="p1">- simplify dec_count().</p><p class="p2"><br></p><p class="p1">  - It always calls a callback fn now.</p><p class="p1">  - It always frees the io object back to the pool.</p><p class="p2"><br></p><p class="p1">- sync_io()</p><p class="p2"><br></p><p class="p1">  - Take the io object off the stack and allocate it from the pool the</p><p class="p1">    same as async_io.</p><p class="p1">  - Use a completion object rather than an explicit io_schedule()</p><p class="p1">    loop.  
The callback triggers the completion.</p><p class="p2"><br></p><p class="p1">Signed-off-by: Minfei Huang <huangminfei@ucloud.cn></p><p class="p1">---</p><p class="p1"> drivers/md/dm-io.c |   22 +++++++++-------------</p><p class="p1"> 1 files changed, 9 insertions(+), 13 deletions(-)</p><p class="p2"><br></p><p class="p1">diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c</p><p class="p1">index 3842ac7..05583da 100644</p><p class="p1">--- a/drivers/md/dm-io.c</p><p class="p1">+++ b/drivers/md/dm-io.c</p><p class="p1">@@ -10,6 +10,7 @@</p><p class="p1"> #include <linux/device-mapper.h></p><p class="p2"> </p><p class="p1"> #include <linux/bio.h></p><p class="p1">+#include <linux/completion.h></p><p class="p1"> #include <linux/mempool.h></p><p class="p1"> #include <linux/module.h></p><p class="p1"> #include <linux/sched.h></p><p class="p1">@@ -32,7 +33,7 @@ struct dm_io_client {</p><p class="p1"> struct io {</p><p class="p1"> <span class="Apple-tab-span">       </span>unsigned long error_bits;</p><p class="p1"> <span class="Apple-tab-span">       </span>atomic_t count;</p><p class="p1">-<span class="Apple-tab-span">      </span>struct task_struct *sleeper;</p><p class="p1">+<span class="Apple-tab-span"> </span>struct completion *wait;</p><p class="p1"> <span class="Apple-tab-span">        </span>struct dm_io_client *client;</p><p class="p1"> <span class="Apple-tab-span">    </span>io_notify_fn callback;</p><p class="p1"> <span class="Apple-tab-span">  </span>void *context;</p><p class="p1">@@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error)</p><p class="p1"> <span class="Apple-tab-span">  </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>invalidate_kernel_vmap_range(io->vma_invalidate_address,</p><p class="p1"> <span class="Apple-tab-span">     </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span><span 
class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>     io->vma_invalidate_size);</p><p class="p2"> </p><p class="p1">-<span class="Apple-tab-span">    </span><span class="Apple-tab-span">      </span>if (io->sleeper)</p><p class="p1">-<span class="Apple-tab-span">  </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>wake_up_process(io->sleeper);</p><p class="p1">+<span class="Apple-tab-span">     </span><span class="Apple-tab-span">      </span>if (io->wait)</p><p class="p1">+<span class="Apple-tab-span">     </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>complete(io->wait);</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span">    </span><span class="Apple-tab-span">      </span>else {</p><p class="p1"> <span class="Apple-tab-span">  </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>unsigned long r = io->error_bits;</p><p class="p1">@@ -387,6 +388,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,</p><p class="p1"> <span class="Apple-tab-span">       </span> */</p><p class="p1"> <span class="Apple-tab-span">     </span>volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];</p><p class="p1"> <span class="Apple-tab-span">      </span>struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));</p><p class="p1">+<span class="Apple-tab-span">    </span>DECLARE_COMPLETION_ONSTACK(wait);</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span"> </span>if (num_regions > 1 && (rw & RW_MASK) != WRITE) {</p><p class="p1"> <span class="Apple-tab-span">        </span><span class="Apple-tab-span">      </span>WARN_ON(1);</p><p class="p1">@@ -395,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span">  </span>io->error_bits = 0;</p><p class="p1"> <span 
class="Apple-tab-span">  </span>atomic_set(&io->count, 1); /* see dispatch_io() */</p><p class="p1">-<span class="Apple-tab-span">    </span>io->sleeper = current;</p><p class="p1">+<span class="Apple-tab-span">    </span>io->wait = &wait;</p><p class="p1"> <span class="Apple-tab-span">        </span>io->client = client;</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span">   </span>io->vma_invalidate_address = dp->vma_invalidate_address;</p><p class="p1">@@ -403,15 +405,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span">      </span>dispatch_io(rw, num_regions, where, dp, io, 1);</p><p class="p2"> </p><p class="p1">-<span class="Apple-tab-span">        </span>while (1) {</p><p class="p1">-<span class="Apple-tab-span">  </span><span class="Apple-tab-span">      </span>set_current_state(TASK_UNINTERRUPTIBLE);</p><p class="p1">-</p><p class="p1">-<span class="Apple-tab-span">    </span><span class="Apple-tab-span">      </span>if (!atomic_read(&io->count))</p><p class="p1">-<span class="Apple-tab-span"> </span><span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>break;</p><p class="p1">-</p><p class="p1">-<span class="Apple-tab-span">      </span><span class="Apple-tab-span">      </span>io_schedule();</p><p class="p1">+<span class="Apple-tab-span">       </span>while (atomic_read(&io->count) != 0) {</p><p class="p1">+<span class="Apple-tab-span">        </span><span class="Apple-tab-span">      </span>wait_for_completion_io_timeout(&wait, 5);</p><p class="p1"> <span class="Apple-tab-span">   </span>}</p><p class="p1">-<span class="Apple-tab-span">    </span>set_current_state(TASK_RUNNING);</p><p class="p2"> </p><p class="p1"> <span class="Apple-tab-span">  </span>if (error_bits)</p><p class="p1"> <span class="Apple-tab-span"> </span><span class="Apple-tab-span">      </span>*error_bits = io->error_bits;</p><p class="p1">@@ -434,7 
+430,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,</p><p class="p1"> <span class="Apple-tab-span">  </span>io = mempool_alloc(client->pool, GFP_NOIO);</p><p class="p1"> <span class="Apple-tab-span">  </span>io->error_bits = 0;</p><p class="p1"> <span class="Apple-tab-span">  </span>atomic_set(&io->count, 1); /* see dispatch_io() */</p><p class="p1">-<span class="Apple-tab-span">    </span>io->sleeper = NULL;</p><p class="p1">+<span class="Apple-tab-span">       </span>io->wait = NULL;</p><p class="p1"> <span class="Apple-tab-span">     </span>io->client = client;</p><p class="p1"> <span class="Apple-tab-span"> </span>io->callback = fn;</p><p class="p1"> <span class="Apple-tab-span">   </span>io->context = context;</p><p class="p1">-- </p><p class="p1">1.7.1</p></div><div>----------------------------------</div><div><br></div><div>We should consider the following case: if the sync io has already finished before we execute "<span style="line-height: 1.7;">wait_for_completion_io",</span></div><div>the thread will hang.</div><div><br></div><div>So we should add a check of io->count to decide whether to execute the function <span style="line-height: 1.7;">wait_for_completion_io.</span></div><div><br></div></div><blockquote id="isReplyContent" style="padding-left:1ex; margin: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid"><br> <br><br>On Fri, 27 Jun 2014, Mikulas Patocka wrote:<br><br>> <br>> <br>> On Fri, 27 Jun 2014, Minfei Huang wrote:<br>> <br>> > BUG: unable to handle kernel NULL pointer dereference at 0000000000000046<br>> > IP: [<ffffffffa0009cef>] dec_count+0x5f/0x80 [dm_mod]<br>> > PGD 0<br>> > Oops: 0000 [#1] SMP<br>> > last sysfs file: /sys/devices/pci0000:00/0000:00:02.2/0000:02:00.0/host0/scsi_host/host0/proc_name<br>> > <br>> > Pid: 2708, comm: kcopyd Tainted: G        W  --------------- H  2.6.32-279.19.5.el6.ucloud.x86_64 #1 Dell Inc. 
PowerEdge R720xd/0DCWD1<br>> > RIP: 0010:[<ffffffffa0009cef>]  [<ffffffffa0009cef>] dec_count+0x5f/0x80 [dm_mod]<br>> > RSP: 0018:ffff880100603c30  EFLAGS: 00010246<br>> > RAX: 0000000000000046 RBX: ffff8817968a5c30 RCX: 0000000000000000<br>> > RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8817968a5c00<br>> > RBP: ffff880100603c50 R08: 0000000000000000 R09: 0000000000000000<br>> > R10: ffff880caa594cc0 R11: 0000000000000000 R12: ffff8817968a5c80<br>> > R13: ffffffff81013963 R14: 0000000000001000 R15: 0000000000000000<br>> > FS:  0000000000000000(0000) GS:ffff880100600000(0000) knlGS:0000000000000000<br>> > CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b<br>> > CR2: 0000000000000046 CR3: 000000020c309000 CR4: 00000000001426e0<br>> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000<br>> > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400<br>> > Process kcopyd (pid: 2708, threadinfo ffff88180cd26000, task ffff881841c9aa80)<br>> > Stack:<br>> >  ffff880100603c40 ffff880aa8b32300 0000000000000000 ffff8817968a5c00<br>> > <d> ffff880100603c80 ffffffffa000a12a 0000000000000000 ffff880aa8b32300<br>> > <d> 0000000000000000 ffff880caa594cc0 ffff880100603c90 ffffffff811bcf6d<br>> > Call Trace:<br>> >  <IRQ><br>> >  [<ffffffffa000a12a>] endio+0x4a/0x70 [dm_mod]<br>> >  [<ffffffff811bcf6d>] bio_endio+0x1d/0x40<br>> >  [<ffffffff81260beb>] req_bio_endio+0x9b/0xe0<br>> >  [<ffffffff81263114>] blk_update_request+0x104/0x500<br>> >  [<ffffffff81263331>] ? 
blk_update_request+0x321/0x500<br>> >  [<ffffffff81263537>] blk_update_bidi_request+0x27/0xa0<br>> >  [<ffffffff8126419f>] blk_end_bidi_request+0x2f/0x80<br>> >  [<ffffffff81264240>] blk_end_request+0x10/0x20<br>> >  [<ffffffff81375c6f>] scsi_io_completion+0xaf/0x6c0<br>> >  [<ffffffff8136cb92>] scsi_finish_command+0xc2/0x130<br>> >  [<ffffffff813763e5>] scsi_softirq_done+0x145/0x170<br>> >  [<ffffffff812698ed>] blk_done_softirq+0x8d/0xa0<br>> >  [<ffffffff81074c5f>] __do_softirq+0xdf/0x210<br>> >  [<ffffffff8100c2cc>] call_softirq+0x1c/0x30<br>> >  [<ffffffff8100df9d>] do_softirq+0xad/0xe0<br>> >  [<ffffffff81074995>] irq_exit+0x95/0xa0<br>> >  [<ffffffff81510515>] do_IRQ+0x75/0xf0<br>> >  [<ffffffff8100ba53>] ret_from_intr+0x0/0x16<br>> > <br>> > The value of rdi register(0xffff8817968a5c00) is the io pointer,<br>> > If the sync io, the address of io point must be alloc from stack.<br>> > SO<br>> > crash> struct thread_info ffff8817968a4000<br>> > struct thread_info {<br>> >   task = 0xffff88180cd9a580,<br>> >   exec_domain = 0xffffffff81a98ac0,<br>> >  ...<br>> > }<br>> > <br>> > crash> struct task_struct 0xffff88180cd9a580<br>> > struct task_struct {<br>> >   state = 2,<br>> >   stack = 0xffff8817968a4000,<br>> >  ...<br>> > }<br>> > <br>> > It shows value exactly when use the value of io address.<br>> > <br>> > The io address in callback function will become the danging point,<br>> > cause by the thread of sync io wakes up by other threads<br>> > and return to relieve the io address,<br>> > <br>> > Signed-off-by: Minfei Huang <<a href="mailto:huangminfei@ucloud.cn">huangminfei@ucloud.cn</a>><br>> > ---<br>> >  drivers/md/dm-io.c |   19 +++++++++++++++----<br>> >  1 files changed, 15 insertions(+), 4 deletions(-)<br>> > <br>> > diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c<br>> > index 3842ac7..f992913 100644<br>> > --- a/drivers/md/dm-io.c<br>> > +++ b/drivers/md/dm-io.c<br>> > @@ -38,6 +38,7 @@ struct io {<br>> >      void *context;<br>> >      void 
*vma_invalidate_address;<br>> >      unsigned long vma_invalidate_size;<br>> > +    atomic_t wakeup;<br>> >  } __attribute__((aligned(DM_IO_MAX_REGIONS)));<br>> >  <br>> >  static struct kmem_cache *_dm_io_cache;<br>> > @@ -121,10 +122,16 @@ static void dec_count(struct io *io, unsigned int region, int error)<br>> >              invalidate_kernel_vmap_range(io->vma_invalidate_address,<br>> >                               io->vma_invalidate_size);<br>> >  <br>> > -        if (io->sleeper)<br>> > -            wake_up_process(io->sleeper);<br>> > +        if (io->sleeper) {<br>> > +            struct task_struct *sleeper = io->sleeper;<br>> >  <br>> > -        else {<br>> > +            atomic_set(&io->wakeup, 1);<br>> <br>> The problem here is that the processor may reorder the read of io->sleeper <br>> with atomic_set(&io->wakeup, 1); (performing atomic_set first and "sleeper <br>> = io->sleeper" afterwards) exposing the same race condition.<br>> <br>> You need to use memory barriers to avoid reordering, but I think the <br>> solution with the completion is better (the completion takes care of <br>> barriers automatically).<br>> <br>> Mikulas<br><br>Also - there is another race - after atomic_set(&io->wakeup, 1), the <br>target process may terminate, so wake_up_process(sleeper) operates on <br>non-existing process. You need to declare a special wait queue or use <br>wait_on_atomic_t+wake_up_atomic_t (that uses a pre-initialized hash of <br>wait queues) to avoid that race.<br><br>But as I said - the completion approach is better. 
It doesn't suffer from <br>these problems.<br><br>Mikulas<br><br>> > +/*<br>> > + * The thread may be waked up by other threads,<br>> > + * if then the sync io point will become the dangling pointer<br>> > + */<br>> > +            wake_up_process(sleeper);<br>> > +        } else {<br>> >              unsigned long r = io->error_bits;<br>> >              io_notify_fn fn = io->callback;<br>> >              void *context = io->context;<br>> > @@ -401,12 +408,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,<br>> >      io->vma_invalidate_address = dp->vma_invalidate_address;<br>> >      io->vma_invalidate_size = dp->vma_invalidate_size;<br>> >  <br>> > +    atomic_set(&io->wakeup, 0);<br>> > +<br>> >      dispatch_io(rw, num_regions, where, dp, io, 1);<br>> >  <br>> >      while (1) {<br>> >          set_current_state(TASK_UNINTERRUPTIBLE);<br>> >  <br>> > -        if (!atomic_read(&io->count))<br>> > +        if (atomic_read(&io->wakeup))<br>> >              break;<br>> >  <br>> >          io_schedule();<br>> > @@ -442,6 +451,8 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,<br>> >      io->vma_invalidate_address = dp->vma_invalidate_address;<br>> >      io->vma_invalidate_size = dp->vma_invalidate_size;<br>> >  <br>> > +    atomic_set(&io->wakeup, 0);<br>> > +<br>> >      dispatch_io(rw, num_regions, where, dp, io, 0);<br>> >      return 0;<br>> >  }<br>> > -- <br>> > 1.7.1<br>> > <br>> > <br>> > --<br>> > dm-devel mailing list<br>> > <a href="mailto:dm-devel@redhat.com">dm-devel@redhat.com</a><br>> > https://www.redhat.com/mailman/listinfo/dm-devel<br>> > <br>> <br></blockquote>