[Linux-cachefs] [PATCH] cachefiles deadlock with 3.10 + fscache-20130702

Shantanu Goel sgoel01 at yahoo.com
Wed Jul 9 15:28:43 UTC 2014


Hi,

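We are running Linux 3.10 + fscache-20130702 + commit bae6b235905ab9dc6659395f7802c1d36fb63f15 from dhowells' git tree and hit the following hang on several hosts (stack trace below).  The trace shows the fscache write worker entering direct memory reclaim from inside cachefiles_write_page() and then blocking in __fscache_wait_on_page_write().  Attached is a patch we have been using for a few weeks now and it seems to fix it.  Have you seen this problem?  Is there an alternative fix we could use?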


Thanks,
Shantanu

    Jun 26 11:26:00 kworker/u24:7   D ffff88053300700e     0 11163      2 0x00000080   5 FAIR    0     0-11
    Jun 26 11:26:00 Workqueue: fscache_operation fscache_op_work_func [fscache]
    Jun 26 11:26:00  ffff8800141410f8 0000000000000002 ffff880014140fd8 ffff88001fa9f0c0
    Jun 26 11:26:00  0000000000012cc0 ffff880014141fd8 ffff880014140010 0000000000004000
    Jun 26 11:26:00  ffff880014141fd8 0000000000012cc0 ffff8805fdda10c0 ffff88001fa9f0c0
    Jun 26 11:26:00 Call Trace:
    Jun 26 11:26:00  [<ffffffff81073cfc>] ? arch_vtime_task_switch+0x6c/0x90
    Jun 26 11:26:00  [<ffffffff81070aa5>] ? finish_task_switch+0xa5/0xf0
    Jun 26 11:26:00  [<ffffffff81428862>] ? __schedule+0x442/0xa00
    Jun 26 11:26:00  [<ffffffff81428ee9>] schedule+0x29/0x70
    Jun 26 11:26:00  [<ffffffffa04e13e5>] __fscache_wait_on_page_write+0x75/0xb0 [fscache]
    Jun 26 11:26:00  [<ffffffff81062c60>] ? wake_up_bit+0x40/0x40
    Jun 26 11:26:00  [<ffffffffa053fed5>] ? nfs_commit_clear_lock+0x25/0x30 [nfs]
    Jun 26 11:26:00  [<ffffffffa04e15b5>] __fscache_maybe_release_page+0x55/0x1a0 [fscache]
    Jun 26 11:26:00  [<ffffffffa05442e6>] nfs_fscache_release_page+0x76/0xd0 [nfs]
    Jun 26 11:26:00  [<ffffffffa0533245>] nfs_release_page+0x55/0xa0 [nfs]
    Jun 26 11:26:00  [<ffffffff810f5372>] try_to_release_page+0x32/0x60
    Jun 26 11:26:00  [<ffffffff811076a3>] shrink_page_list+0x603/0x970
    Jun 26 11:26:00  [<ffffffff811064b9>] ? isolate_lru_pages+0xd9/0x1c0
    Jun 26 11:26:00  [<ffffffff81107fef>] shrink_inactive_list+0x18f/0x490
    Jun 26 11:26:00  [<ffffffff8107308e>] ? try_to_wake_up+0x20e/0x2b0
    Jun 26 11:26:00  [<ffffffff811088a9>] shrink_lruvec+0x269/0x470
    Jun 26 11:26:00  [<ffffffff81108b5e>] shrink_zone+0xae/0x270
    Jun 26 11:26:00  [<ffffffff8110a0e3>] do_try_to_free_pages+0xe3/0x560
    Jun 26 11:26:00  [<ffffffff810fa08f>] ? zone_watermark_ok+0x1f/0x30
    Jun 26 11:26:00  [<ffffffff8110a75e>] try_to_free_pages+0xce/0x150
    Jun 26 11:26:00  [<ffffffff810fede1>] __alloc_pages_nodemask+0x591/0x910
    Jun 26 11:26:00  [<ffffffff81036150>] ? flush_tlb_mm_range+0x240/0x240
    Jun 26 11:26:01  [<ffffffff81139a9a>] alloc_pages_current+0xba/0x160
    Jun 26 11:26:01  [<ffffffff810f6617>] __page_cache_alloc+0xa7/0xc0
    Jun 26 11:26:01  [<ffffffff810f686c>] grab_cache_page_write_begin+0x7c/0xe0
    Jun 26 11:26:01  [<ffffffffa0060ee9>] ext4_da_write_begin+0x149/0x2d0 [ext4]
    Jun 26 11:26:01  [<ffffffff810f552e>] generic_file_buffered_write+0x10e/0x280
    Jun 26 11:26:01  [<ffffffff81181c2b>] ? __mark_inode_dirty+0x19b/0x280
    Jun 26 11:26:01  [<ffffffff810f766f>] __generic_file_aio_write+0x1af/0x3c0
    Jun 26 11:26:01  [<ffffffff810f78e5>] generic_file_aio_write+0x65/0xd0
    Jun 26 11:26:01  [<ffffffffa00560c2>] ext4_file_write+0x62/0x430 [ext4]
    Jun 26 11:26:01  [<ffffffff811b6fa3>] ? dquot_initialize+0x13/0x20
    Jun 26 11:26:01  [<ffffffff811b6ff3>] ? dquot_file_open+0x43/0x50
    Jun 26 11:26:01  [<ffffffffa00555fe>] ? ext4_file_open+0x7e/0x250 [ext4]
    Jun 26 11:26:01  [<ffffffff8107308e>] ? try_to_wake_up+0x20e/0x2b0
    Jun 26 11:26:01  [<ffffffff811586ef>] do_sync_write+0x7f/0xb0
    Jun 26 11:26:01  [<ffffffffa0517deb>] cachefiles_write_page+0x13b/0x320 [cachefiles]
    Jun 26 11:26:01  [<ffffffffa04e06c9>] fscache_write_op+0x149/0x220 [fscache]
    Jun 26 11:26:01  [<ffffffffa04dee5e>] fscache_op_work_func+0x2e/0x90 [fscache]
    Jun 26 11:26:01  [<ffffffff8105b949>] process_one_work+0x169/0x4b0
    Jun 26 11:26:01  [<ffffffff8105bdb1>] worker_thread+0x121/0x3f0
    Jun 26 11:26:01  [<ffffffff81429bbe>] ? _raw_spin_unlock_irqrestore+0xe/0x10
    Jun 26 11:26:01  [<ffffffff8105bc90>] ? process_one_work+0x4b0/0x4b0
    Jun 26 11:26:01  [<ffffffff8106256e>] kthread+0xce/0xe0



diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee..3ed6412 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -945,6 +945,19 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
        } else {
                ret = -EIO;
                if (file->f_op->write) {
+                       struct address_space *mapping = file->f_mapping;
+                       gfp_t gfp_mask = mapping_gfp_mask(mapping);
+                       gfp_t gfp_mask_orig = gfp_mask;
+
+                       /*
+                        * Clear __GFP_FS to avoid potential deadlock
+                        * during memory reclaim.
+                        */
+                       if (gfp_mask & __GFP_FS) {
+                               gfp_mask &= ~__GFP_FS;
+                               mapping_set_gfp_mask(mapping, gfp_mask);
+                       }
+
                        pos = (loff_t) page->index << PAGE_SHIFT;
 
                        /* we mustn't write more data than we have, so we have
@@ -972,6 +985,9 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
                        file_end_write(file);
                        if (ret != len)
                                ret = -EIO;
+
+                       if (gfp_mask != gfp_mask_orig)
+                               mapping_set_gfp_mask(mapping, gfp_mask_orig);
                }
                fput(file);
        }




More information about the Linux-cachefs mailing list