[Cluster-devel] [fsck.gfs2 PATCH 25/30] fsck.gfs2: Use bitmaps instead of linked list for inodes w/nlink == 1
Bob Peterson
rpeterso at redhat.com
Fri Apr 15 13:38:19 UTC 2016
This patch adds two new in-core bitmaps. Their purpose is to replace
the most common case where inodes have an nlink count equal to 1.
If we've got petabytes of inodes, this will save us a lot of memory.
Signed-off-by: Bob Peterson <rpeterso at redhat.com>
---
gfs2/fsck/link.c | 110 +++++++++++++++++++++++++++++++++++++--------------
gfs2/fsck/link.h | 4 ++
gfs2/fsck/main.c | 3 ++
gfs2/fsck/metawalk.c | 2 +
gfs2/fsck/pass1.c | 40 +++++++++++++++++--
gfs2/fsck/pass1b.c | 3 ++
gfs2/fsck/pass2.c | 8 ++++
gfs2/fsck/pass4.c | 80 +++++++++++++++++++++++++------------
gfs2/fsck/util.h | 23 +++++++++++
9 files changed, 214 insertions(+), 59 deletions(-)
diff --git a/gfs2/fsck/link.c b/gfs2/fsck/link.c
index 7e8acb7..0243d85 100644
--- a/gfs2/fsck/link.c
+++ b/gfs2/fsck/link.c
@@ -15,6 +15,26 @@
#include "link.h"
#include "util.h"
+struct gfs2_bmap nlink1map = { 0 }; /* map of dinodes with nlink == 1 */
+struct gfs2_bmap clink1map = { 0 }; /* map of dinodes w/counted links == 1 */
+
+int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark)
+{
+ static unsigned char *byte;
+ static uint64_t b;
+
+ if (!bmap)
+ return 0;
+ if (bblock > bmap->size)
+ return -1;
+
+ byte = bmap->map + BLOCKMAP_SIZE1(bblock);
+ b = BLOCKMAP_BYTE_OFFSET1(bblock);
+ *byte &= ~(BLOCKMAP_MASK1 << b);
+ *byte |= (mark & BLOCKMAP_MASK1) << b;
+ return 0;
+}
+
int set_di_nlink(struct gfs2_inode *ip)
{
struct inode_info *ii;
@@ -32,6 +52,10 @@ int set_di_nlink(struct gfs2_inode *ip)
di->di_nlink = ip->i_di.di_nlink;
return 0;
}
+ if (ip->i_di.di_nlink == 1) {
+ link1_set(&nlink1map, ip->i_di.di_num.no_addr, 1);
+ return 0;
+ }
/*log_debug( _("Setting link count to %u for %" PRIu64
" (0x%" PRIx64 ")\n"), count, inode_no, inode_no);*/
/* If the list has entries, look for one that matches inode_no */
@@ -45,12 +69,21 @@ int set_di_nlink(struct gfs2_inode *ip)
return 0;
}
+/* I'm making whyincr a macro rather than function so that the debug output
+ * matches older versions. */
+#define whyincr(no_addr, why, referenced_from, counted_links) \
+ log_debug(_("Dir (0x%llx) incremented counted links to %u " \
+ "for (0x%llx) via %s\n"), \
+ (unsigned long long)referenced_from, counted_links, \
+ (unsigned long long)no_addr, why);
+
int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
const char *why)
{
struct inode_info *ii = NULL;
uint64_t referenced_from = ip ? ip->i_di.di_num.no_addr : 0;
struct dir_info *di;
+ struct gfs2_inode *link_ip;
di = dirtree_find(no.no_addr);
if (di) {
@@ -58,11 +91,7 @@ int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
return 1;
di->counted_links++;
- log_debug( _("Dir (0x%llx) incremented counted links to %u "
- "for (0x%llx) via %s\n"),
- (unsigned long long)referenced_from,
- di->counted_links,
- (unsigned long long)no.no_addr, why);
+ whyincr(no.no_addr, why, referenced_from, di->counted_links);
return 0;
}
ii = inodetree_find(no.no_addr);
@@ -72,27 +101,52 @@ int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
return 1;
ii->counted_links++;
- log_debug( _("Dir (0x%llx) incremented counted "
- "links to %u for (0x%llx) via %s\n"),
- (unsigned long long)referenced_from,
- ii->counted_links, (unsigned long long)no.no_addr,
- why);
+ whyincr(no.no_addr, why, referenced_from, ii->counted_links);
return 0;
}
- log_debug( _("Ref: (0x%llx) No match found when incrementing "
- "link for (0x%llx)!\n"),
- (unsigned long long)referenced_from,
- (unsigned long long)no.no_addr);
- /* If no match was found, add a new entry and set its
- * counted links to 1 */
+ if (link1_type(&clink1map, no.no_addr) != 1) {
+ link1_set(&clink1map, no.no_addr, 1);
+ whyincr(no.no_addr, why, referenced_from, 1);
+ return 0;
+ }
+
+ link_ip = fsck_load_inode(ip->i_sbd, no.no_addr);
+ /* Check formal ino against dinode before adding to inode tree. */
+ if (no.no_formal_ino != ip->i_di.di_num.no_formal_ino) {
+ fsck_inode_put(&link_ip);
+ return 1;
+ }
+ /* Move it from the link1 maps to a real inode tree entry */
+ link1_set(&nlink1map, no.no_addr, 0);
+ link1_set(&clink1map, no.no_addr, 0);
+
+ /* If no match was found, it must be a hard link. In theory, it can't
+ be a duplicate because those were resolved in pass1b. Add a new
+ inodetree entry and set its counted links to 2 */
ii = inodetree_insert(no);
- if (ii)
- ii->counted_links = 1;
- else
+ if (!ii) {
+ log_debug( _("Ref: (0x%llx) Error incrementing link for "
+ "(0x%llx)!\n"),
+ (unsigned long long)referenced_from,
+ (unsigned long long)no.no_addr);
+ fsck_inode_put(&link_ip);
return -1;
+ }
+ ii->di_num = link_ip->i_di.di_num;
+ fsck_inode_put(&link_ip);
+ ii->di_nlink = 1; /* Must be 1 or it wouldn't have gotten into the
+ nlink1map */
+ ii->counted_links = 2;
+ whyincr(no.no_addr, why, referenced_from, ii->counted_links);
return 0;
}
+#define whydecr(no_addr, why, referenced_from, counted_links) \
+ log_debug(_("Dir (0x%llx) decremented counted links to %u " \
+ "for (0x%llx) via %s\n"), \
+ (unsigned long long)referenced_from, counted_links, \
+ (unsigned long long)no_addr, why);
+
int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
const char *why)
{
@@ -109,11 +163,7 @@ int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
return 0;
}
di->counted_links--;
- log_debug( _("Dir (0x%llx) decremented counted links to %u "
- "for (0x%llx) via %s\n"),
- (unsigned long long)referenced_from,
- di->counted_links, (unsigned long long)inode_no,
- why);
+ whydecr(inode_no, why, referenced_from, di->counted_links);
return 0;
}
@@ -129,13 +179,15 @@ int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
return 0;
}
ii->counted_links--;
- log_debug( _("Dir (0x%llx) decremented counted "
- "links to %u for (0x%llx) via %s\n"),
- (unsigned long long)referenced_from,
- ii->counted_links, (unsigned long long)inode_no,
- why);
+ whydecr(inode_no, why, referenced_from, ii->counted_links);
return 0;
}
+ if (link1_type(&clink1map, inode_no) == 1) { /* 1 -> 0 */
+ link1_set(&clink1map, inode_no, 0);
+ whydecr(inode_no, why, referenced_from, 0);
+ return 0;
+ }
+
log_debug( _("No match found when decrementing link for (0x%llx)!\n"),
(unsigned long long)inode_no);
return -1;
diff --git a/gfs2/fsck/link.h b/gfs2/fsck/link.h
index 76a195a..14534e5 100644
--- a/gfs2/fsck/link.h
+++ b/gfs2/fsck/link.h
@@ -1,6 +1,10 @@
#ifndef _LINK_H
#define _LINK_H
+extern struct gfs2_bmap nlink1map; /* map of dinodes with nlink == 1 */
+extern struct gfs2_bmap clink1map; /* map of dinodes w/counted links == 1 */
+
+int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark);
int set_di_nlink(struct gfs2_inode *ip);
int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
const char *why);
diff --git a/gfs2/fsck/main.c b/gfs2/fsck/main.c
index 91ebe28..c5967bc 100644
--- a/gfs2/fsck/main.c
+++ b/gfs2/fsck/main.c
@@ -20,6 +20,7 @@
#include "copyright.cf"
#include "libgfs2.h"
#include "fsck.h"
+#include "link.h"
#include "osi_list.h"
#include "metawalk.h"
#include "util.h"
@@ -352,6 +353,8 @@ int main(int argc, char **argv)
if (!opts.no && errors_corrected)
log_notice( _("Writing changes to disk\n"));
fsync(sdp->device_fd);
+ link1_destroy(&nlink1map);
+ link1_destroy(&clink1map);
destroy(sdp);
if (sb_fixed)
log_warn(_("Superblock was reset. Use tunegfs2 to manually "
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index b032029..dee8d5a 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -14,6 +14,7 @@
#include <logging.h>
#include "libgfs2.h"
+#include "link.h"
#include "osi_tree.h"
#include "fsck.h"
#include "util.h"
@@ -88,6 +89,7 @@ int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk, int error_on_dinode,
ii = inodetree_find(blk);
if (ii)
inodetree_delete(ii);
+ link1_set(&nlink1map, blk, 0);
}
rgd->rg.rg_free++;
if (old_bitmap_state == GFS2_BLKST_DINODE)
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index f4eea49..ee20b9e 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -2018,6 +2018,20 @@ static int gfs2_blockmap_create(struct gfs2_bmap *bmap, uint64_t size)
return 0;
}
+
+static int link1_create(struct gfs2_bmap *bmap, uint64_t size)
+{
+ bmap->size = size;
+
+ /* Have to add 1 to BLOCKMAP_SIZE since it's 0-based and mallocs
+ * must be 1-based */
+ bmap->mapsize = BLOCKMAP_SIZE1(size) + 1;
+
+ if (!(bmap->map = calloc(bmap->mapsize, sizeof(char))))
+ return -ENOMEM;
+ return 0;
+}
+
static struct gfs2_bmap *gfs2_bmap_create(struct gfs2_sbd *sdp, uint64_t size,
uint64_t *addl_mem_needed)
{
@@ -2054,6 +2068,14 @@ static void *gfs2_bmap_destroy(struct gfs2_sbd *sdp, struct gfs2_bmap *il)
return il;
}
+static void enomem(uint64_t addl_mem_needed)
+{
+ log_crit( _("This system doesn't have enough memory and swap space to fsck this file system.\n"));
+ log_crit( _("Additional memory needed is approximately: %lluMB\n"),
+ (unsigned long long)(addl_mem_needed / 1048576ULL));
+ log_crit( _("Please increase your swap space by that amount and run gfs2_fsck again.\n"));
+}
+
/**
* pass1 - walk through inodes and check inode state
*
@@ -2079,10 +2101,20 @@ int pass1(struct gfs2_sbd *sdp)
bl = gfs2_bmap_create(sdp, last_fs_block+1, &addl_mem_needed);
if (!bl) {
- log_crit( _("This system doesn't have enough memory and swap space to fsck this file system.\n"));
- log_crit( _("Additional memory needed is approximately: %lluMB\n"),
- (unsigned long long)(addl_mem_needed / 1048576ULL));
- log_crit( _("Please increase your swap space by that amount and run gfs2_fsck again.\n"));
+ enomem(addl_mem_needed);
+ return FSCK_ERROR;
+ }
+ addl_mem_needed = link1_create(&nlink1map, last_fs_block+1);
+ if (addl_mem_needed) {
+ enomem(addl_mem_needed);
+ gfs2_bmap_destroy(sdp, bl);
+ return FSCK_ERROR;
+ }
+ addl_mem_needed = link1_create(&clink1map, last_fs_block+1);
+ if (addl_mem_needed) {
+ enomem(addl_mem_needed);
+ link1_destroy(&nlink1map);
+ gfs2_bmap_destroy(sdp, bl);
return FSCK_ERROR;
}
osi_list_init(&gfs1_rindex_blks.list);
diff --git a/gfs2/fsck/pass1b.c b/gfs2/fsck/pass1b.c
index db477ff..d6ab285 100644
--- a/gfs2/fsck/pass1b.c
+++ b/gfs2/fsck/pass1b.c
@@ -10,6 +10,7 @@
#include <logging.h>
#include "libgfs2.h"
+#include "link.h"
#include "fsck.h"
#include "osi_list.h"
#include "util.h"
@@ -353,6 +354,8 @@ static void resolve_dup_references(struct gfs2_sbd *sdp, struct duptree *dt,
di = dirtree_find(ip->i_di.di_num.no_addr);
if (di)
dirtree_delete(di);
+ link1_set(&nlink1map, ip->i_di.di_num.no_addr,
+ 0);
fsck_bitmap_set(ip, ip->i_di.di_num.no_addr,
_("duplicate referencing bad"),
GFS2_BLKST_UNLINKED);
diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c
index 3a8646f..d4f50d2 100644
--- a/gfs2/fsck/pass2.c
+++ b/gfs2/fsck/pass2.c
@@ -195,6 +195,8 @@ static int bad_formal_ino(struct gfs2_inode *ip, struct gfs2_dirent *dent,
di = dirtree_find(entry.no_addr);
if (di)
inum = di->dinode;
+ else if (link1_type(&clink1map, entry.no_addr) == 1)
+ inum = entry;
}
log_err( _("Directory entry '%s' pointing to block %llu (0x%llx) in "
"directory %llu (0x%llx) has the wrong 'formal' inode "
@@ -613,6 +615,12 @@ static int basic_dentry_checks(struct gfs2_inode *ip, struct gfs2_dirent *dent,
di = dirtree_find(entry->no_addr);
if (di)
inum = di->dinode;
+ else if (link1_type(&nlink1map, entry->no_addr) == 1) {
+ /* Since we don't have ii or di, the only way to
+ validate formal_ino is to read in the inode, which
+ would kill performance. So skip it for now. */
+ return 0;
+ }
}
if (inum.no_formal_ino != entry->no_formal_ino) {
log_err( _("Directory entry '%s' pointing to block %llu "
diff --git a/gfs2/fsck/pass4.c b/gfs2/fsck/pass4.c
index c65d321..16339cc 100644
--- a/gfs2/fsck/pass4.c
+++ b/gfs2/fsck/pass4.c
@@ -9,6 +9,7 @@
#include <logging.h>
#include "libgfs2.h"
#include "fsck.h"
+#include "link.h"
#include "lost_n_found.h"
#include "inode_hash.h"
#include "metawalk.h"
@@ -153,11 +154,30 @@ static void handle_inconsist(struct gfs2_sbd *sdp, uint64_t no_addr,
}
}
+static int adjust_lf_links(int lf_addition)
+{
+ struct dir_info *lf_di;
+
+ if (lf_dip == NULL)
+ return 0;
+
+ if (!lf_addition)
+ return 0;
+
+ if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
+ log_crit(_("Unable to find lost+found inode in "
+ "inode_hash!!\n"));
+ return -1;
+ } else {
+ fix_link_count(lf_di->counted_links, lf_dip);
+ }
+ return 0;
+}
+
static int scan_inode_list(struct gfs2_sbd *sdp)
{
struct osi_node *tmp, *next = NULL;
struct inode_info *ii;
- struct dir_info *lf_di;
int lf_addition = 0;
/* FIXME: should probably factor this out into a generic
@@ -187,25 +207,13 @@ static int scan_inode_list(struct gfs2_sbd *sdp)
(unsigned long long)ii->di_num.no_addr, ii->di_nlink);
} /* osi_list_foreach(tmp, list) */
- if (lf_dip == NULL)
- return 0;
-
- if (lf_addition) {
- if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
- log_crit( _("Unable to find lost+found inode in inode_hash!!\n"));
- return -1;
- } else {
- fix_link_count(lf_di->counted_links, lf_dip);
- }
- }
-
- return 0;
+ return adjust_lf_links(lf_addition);
}
static int scan_dir_list(struct gfs2_sbd *sdp)
{
struct osi_node *tmp, *next = NULL;
- struct dir_info *di, *lf_di;
+ struct dir_info *di;
int lf_addition = 0;
/* FIXME: should probably factor this out into a generic
@@ -232,19 +240,33 @@ static int scan_dir_list(struct gfs2_sbd *sdp)
(unsigned long long)di->dinode.no_addr, di->di_nlink);
} /* osi_list_foreach(tmp, list) */
- if (lf_dip == NULL)
- return 0;
+ return adjust_lf_links(lf_addition);
+}
- if (lf_addition) {
- if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
- log_crit( _("Unable to find lost+found inode in inode_hash!!\n"));
- return -1;
- } else {
- fix_link_count(lf_di->counted_links, lf_dip);
+static int scan_nlink1_list(struct gfs2_sbd *sdp)
+{
+ uint64_t blk;
+ uint32_t counted_links;
+ int lf_addition = 0;
+
+ for (blk = 0; blk < last_fs_block; blk++) {
+ if (skip_this_pass || fsck_abort)
+ return 0;
+ if (link1_type(&nlink1map, blk) == 0)
+ continue;
+
+ if (link1_type(&clink1map, blk) == 0) {
+ /* In other cases, counted_links is a pointer to a
+ real count that gets incremented when it's added
+ to lost+found. In this case, however, there's not a
+ real count, so we fake it out to be 1. */
+ counted_links = 1;
+ if (handle_unlinked(sdp, blk, &counted_links,
+ &lf_addition))
+ continue;
}
}
-
- return 0;
+ return adjust_lf_links(lf_addition);
}
/**
@@ -261,15 +283,21 @@ int pass4(struct gfs2_sbd *sdp)
if (lf_dip)
log_debug( _("At beginning of pass4, lost+found entries is %u\n"),
lf_dip->i_di.di_entries);
- log_info( _("Checking inode reference counts.\n"));
+ log_info( _("Checking inode reference counts: multi-links.\n"));
if (scan_inode_list(sdp)) {
stack;
return FSCK_ERROR;
}
+ log_info( _("Checking inode reference counts: directories.\n"));
if (scan_dir_list(sdp)) {
stack;
return FSCK_ERROR;
}
+ log_info( _("Checking inode reference counts: normal links.\n"));
+ if (scan_nlink1_list(sdp)) {
+ stack;
+ return FSCK_ERROR;
+ }
if (lf_dip)
log_debug( _("At end of pass4, lost+found entries is %u\n"),
diff --git a/gfs2/fsck/util.h b/gfs2/fsck/util.h
index aa01552..4545594 100644
--- a/gfs2/fsck/util.h
+++ b/gfs2/fsck/util.h
@@ -23,9 +23,12 @@ extern void dup_listent_delete(struct duptree *dt, struct inode_with_dups *id);
extern const char *reftypes[ref_types + 1];
+#define BLOCKMAP_SIZE1(size) ((size) >> 3)
#define BLOCKMAP_SIZE2(size) ((size) >> 2)
#define BLOCKMAP_BYTE_OFFSET2(x) ((x & 0x0000000000000003) << 1)
+#define BLOCKMAP_BYTE_OFFSET1(x) (x & 0x0000000000000007)
#define BLOCKMAP_MASK2 (0x3)
+#define BLOCKMAP_MASK1 (1)
struct fsck_pass {
const char *name;
@@ -44,6 +47,26 @@ static inline int block_type(struct gfs2_bmap *bl, uint64_t bblock)
return btype;
}
+static inline int link1_type(struct gfs2_bmap *bl, uint64_t bblock)
+{
+ static unsigned char *byte;
+ static uint64_t b;
+ static int btype;
+
+ byte = bl->map + BLOCKMAP_SIZE1(bblock);
+ b = BLOCKMAP_BYTE_OFFSET1(bblock);
+ btype = (*byte & (BLOCKMAP_MASK1 << b )) >> b;
+ return btype;
+}
+
+static inline void link1_destroy(struct gfs2_bmap *bmap)
+{
+ if (bmap->map)
+ free(bmap->map);
+ bmap->size = 0;
+ bmap->mapsize = 0;
+}
+
static inline int bitmap_type(struct gfs2_sbd *sdp, uint64_t bblock)
{
struct rgrp_tree *rgd;
--
2.5.5
More information about the Cluster-devel
mailing list