[dm-devel] [PATCH] iscsi: do not wait for IOs in dm shrinker
Mikulas Patocka
mpatocka at redhat.com
Mon Mar 23 15:58:49 UTC 2020
Hi
There are other cases where the kernel could issue or wait on I/O in a
shrinker (for example in inode shrinker). Instead of fixing all these
cases, you should fix the process giscsid to not issue an I/O when it is
not safe.
For example, the lvm and dmeventd processes lock themselves in memory
using mlock, to make sure that they won't generate I/Os when some device
mapper targets are suspended - so you should use a similar thing in iscsi.
Another possibility is to set the flag PF_MEMALLOC_NOIO for the giscsid
process - that will imply that all allocations done by this process have
the GFP_NOIO flag set.
If you still think that no shrinkers should issue I/O at any time, you should
change the shrinker API (i.e., don't pass the gfp mask to shrinkers) and
get this patch through the memory management maintainers.
Mikulas
On Wed, 4 Mar 2020, Gabriel Krisman Bertazi wrote:
> From: Tahsin Erdogan <tahsin at google.com>
>
> If something goes wrong with an iscsi session, the problem is reported
> to giscsid via a netlink message. Then, giscsid tries to add a new device
> and destroy the old one. During old device destruction, the pending ios
> get completed with an error. Without destroying the device the io
> operations are stuck forever.
>
> If dm shrinker is invoked with __GFP_IO, shrinker gets blocked waiting for
> the pending ios to complete. So, if the giscsid repair path ends up
> doing a memory allocation with __GFP_IO enabled, it could end up in a
> deadlock because the iscsi io cannot be completed until giscsid can do its
> job and giscsid cannot do its job until the io completes.
>
> Even worse, the deadlock can also occur even if giscsid avoids __GFP_IO
> in all paths. For instance, if giscsid tries to grab a mutex held by
> another thread and that thread invokes the shrinker we again may enter a
> deadlock. Here is a scenario stitched from multiple bugs that
> demonstrates how the deadlock can occur:
>
> iSCSI-write
> holding: rx_queue_mutex
> waiting: uevent_sock_mutex
>
> kobject_uevent_env+0x1bd/0x419
> kobject_uevent+0xb/0xd
> device_add+0x48a/0x678
> scsi_add_host_with_dma+0xc5/0x22d
> iscsi_host_add+0x53/0x55
> iscsi_sw_tcp_session_create+0xa6/0x129
> iscsi_if_rx+0x100/0x1247
> netlink_unicast+0x213/0x4f0
> netlink_sendmsg+0x230/0x3c0
>
> iscsi_fail iscsi_conn_failure
> waiting: rx_queue_mutex
>
> schedule_preempt_disabled+0x325/0x734
> __mutex_lock_slowpath+0x18b/0x230
> mutex_lock+0x22/0x40
> iscsi_conn_failure+0x42/0x149
> worker_thread+0x24a/0xbc0
>
> EventManager_
> holding: uevent_sock_mutex
> waiting: dm_bufio_client->lock
>
> dm_bufio_lock+0xe/0x10
> shrink+0x34/0xf7
> shrink_slab+0x177/0x5d0
> do_try_to_free_pages+0x129/0x470
> try_to_free_mem_cgroup_pages+0x14f/0x210
> memcg_kmem_newpage_charge+0xa6d/0x13b0
> __alloc_pages_nodemask+0x4a3/0x1a70
> fallback_alloc+0x1b2/0x36c
> __kmalloc_node_track_caller+0xb9/0x10d0
> __alloc_skb+0x83/0x2f0
> kobject_uevent_env+0x26b/0x419
> dm_kobject_uevent+0x70/0x79
> dev_suspend+0x1a9/0x1e7
> ctl_ioctl+0x3e9/0x411
> dm_ctl_ioctl+0x13/0x17
> do_vfs_ioctl+0xb3/0x460
> SyS_ioctl+0x5e/0x90
>
> MemcgReclaimerD
> holding: dm_bufio_client->lock
> waiting: stuck io to finish (needs iscsi_fail thread to progress)
>
> schedule at ffffffffbd603618
> io_schedule at ffffffffbd603ba4
> do_io_schedule at ffffffffbdaf0d94
> __wait_on_bit at ffffffffbd6008a6
> out_of_line_wait_on_bit at ffffffffbd600960
> wait_on_bit.constprop.10 at ffffffffbdaf0f17
> __make_buffer_clean at ffffffffbdaf18ba
> __cleanup_old_buffer at ffffffffbdaf192f
> shrink at ffffffffbdaf19fd
> do_shrink_slab at ffffffffbd6ec000
> shrink_slab at ffffffffbd6ec24a
> do_try_to_free_pages at ffffffffbd6eda09
> try_to_free_mem_cgroup_pages at ffffffffbd6ede7e
> mem_cgroup_resize_limit at ffffffffbd7024c0
> mem_cgroup_write at ffffffffbd703149
> cgroup_file_write at ffffffffbd6d9c6e
> sys_write at ffffffffbd6662ea
> system_call_fastpath at ffffffffbdbc34a2
>
> Co-developed-by: Khazhismel Kumykov <khazhy at google.com>
> Signed-off-by: Khazhismel Kumykov <khazhy at google.com>
> Signed-off-by: Tahsin Erdogan <tahsin at google.com>
> Co-developed-by: Gabriel Krisman Bertazi <krisman at collabora.com>
> Signed-off-by: Gabriel Krisman Bertazi <krisman at collabora.com>
> ---
> drivers/md/dm-bufio.c | 31 +++++++++++++------------------
> 1 file changed, 13 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
> index 2d519c223562..4c4f80e894b6 100644
> --- a/drivers/md/dm-bufio.c
> +++ b/drivers/md/dm-bufio.c
> @@ -1516,18 +1516,16 @@ static void drop_buffers(struct dm_bufio_client *c)
> * We may not be able to evict this buffer if IO pending or the client
> * is still using it. Caller is expected to know buffer is too old.
> *
> - * And if GFP_NOFS is used, we must not do any I/O because we hold
> - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
> - * rerouted to different bufio client.
> + * We must not do any I/O because we hold dm_bufio_clients_lock and we
> + * would risk deadlock if the I/O gets rerouted to different bufio
> + * client.
> */
> -static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
> +static bool __try_evict_buffer(struct dm_buffer *b)
> {
> - if (!(gfp & __GFP_FS)) {
> - if (test_bit(B_READING, &b->state) ||
> - test_bit(B_WRITING, &b->state) ||
> - test_bit(B_DIRTY, &b->state))
> - return false;
> - }
> + if (test_bit(B_READING, &b->state) ||
> + test_bit(B_WRITING, &b->state) ||
> + test_bit(B_DIRTY, &b->state))
> + return false;
>
> if (b->hold_count)
> return false;
> @@ -1549,8 +1547,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c)
> return retain_bytes;
> }
>
> -static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
> - gfp_t gfp_mask)
> +static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan)
> {
> int l;
> struct dm_buffer *b, *tmp;
> @@ -1561,7 +1558,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
>
> for (l = 0; l < LIST_SIZE; l++) {
> list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
> - if (__try_evict_buffer(b, gfp_mask))
> + if (__try_evict_buffer(b))
> freed++;
> if (!--nr_to_scan || ((count - freed) <= retain_target))
> return freed;
> @@ -1578,12 +1575,10 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> unsigned long freed;
>
> c = container_of(shrink, struct dm_bufio_client, shrinker);
> - if (sc->gfp_mask & __GFP_FS)
> - dm_bufio_lock(c);
> - else if (!dm_bufio_trylock(c))
> + if (!dm_bufio_trylock(c))
> return SHRINK_STOP;
>
> - freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
> + freed = __scan(c, sc->nr_to_scan);
> dm_bufio_unlock(c);
> return freed;
> }
> @@ -1811,7 +1806,7 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
> if (!older_than(b, age_hz))
> break;
>
> - if (__try_evict_buffer(b, 0))
> + if (__try_evict_buffer(b))
> count--;
>
> cond_resched();
> --
> 2.25.0
>
>
> --
> dm-devel mailing list
> dm-devel at redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
>
More information about the dm-devel
mailing list