[Cluster-devel] [fsck.gfs2 PATCH 25/30] fsck.gfs2: Use bitmaps instead of linked list for inodes w/nlink == 1

Bob Peterson rpeterso at redhat.com
Fri Apr 15 13:38:19 UTC 2016


This patch adds two new in-core bitmaps. Their purpose is to replace
the most common case where inodes have an nlink count equal to 1.
If we've got petabytes of inodes, this will save us a lot of memory.

Signed-off-by: Bob Peterson <rpeterso at redhat.com>
---
 gfs2/fsck/link.c     | 110 +++++++++++++++++++++++++++++++++++++--------------
 gfs2/fsck/link.h     |   4 ++
 gfs2/fsck/main.c     |   3 ++
 gfs2/fsck/metawalk.c |   2 +
 gfs2/fsck/pass1.c    |  40 +++++++++++++++++--
 gfs2/fsck/pass1b.c   |   3 ++
 gfs2/fsck/pass2.c    |   8 ++++
 gfs2/fsck/pass4.c    |  80 +++++++++++++++++++++++++------------
 gfs2/fsck/util.h     |  23 +++++++++++
 9 files changed, 214 insertions(+), 59 deletions(-)

diff --git a/gfs2/fsck/link.c b/gfs2/fsck/link.c
index 7e8acb7..0243d85 100644
--- a/gfs2/fsck/link.c
+++ b/gfs2/fsck/link.c
@@ -15,6 +15,26 @@
 #include "link.h"
 #include "util.h"
 
+struct gfs2_bmap nlink1map = { 0 }; /* map of dinodes with nlink == 1 */
+struct gfs2_bmap clink1map = { 0 }; /* map of dinodes w/counted links == 1 */
+
+int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark)
+{
+	static unsigned char *byte;
+	static uint64_t b;
+
+	if (!bmap)
+		return 0;
+	if (bblock > bmap->size)
+		return -1;
+
+	byte = bmap->map + BLOCKMAP_SIZE1(bblock);
+	b = BLOCKMAP_BYTE_OFFSET1(bblock);
+	*byte &= ~(BLOCKMAP_MASK1 << b);
+	*byte |= (mark & BLOCKMAP_MASK1) << b;
+	return 0;
+}
+
 int set_di_nlink(struct gfs2_inode *ip)
 {
 	struct inode_info *ii;
@@ -32,6 +52,10 @@ int set_di_nlink(struct gfs2_inode *ip)
 		di->di_nlink = ip->i_di.di_nlink;
 		return 0;
 	}
+	if (ip->i_di.di_nlink == 1) {
+		link1_set(&nlink1map, ip->i_di.di_num.no_addr, 1);
+		return 0;
+	}
 	/*log_debug( _("Setting link count to %u for %" PRIu64
 	  " (0x%" PRIx64 ")\n"), count, inode_no, inode_no);*/
 	/* If the list has entries, look for one that matches inode_no */
@@ -45,12 +69,21 @@ int set_di_nlink(struct gfs2_inode *ip)
 	return 0;
 }
 
+/* I'm making whyincr a macro rather than function so that the debug output
+ * matches older versions. */
+#define whyincr(no_addr, why, referenced_from, counted_links)		\
+	log_debug(_("Dir (0x%llx) incremented counted links to %u "	\
+		    "for (0x%llx) via %s\n"),				\
+		  (unsigned long long)referenced_from, counted_links,	\
+		  (unsigned long long)no_addr, why);
+
 int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
 		    const char *why)
 {
 	struct inode_info *ii = NULL;
 	uint64_t referenced_from = ip ? ip->i_di.di_num.no_addr : 0;
 	struct dir_info *di;
+	struct gfs2_inode *link_ip;
 
 	di = dirtree_find(no.no_addr);
 	if (di) {
@@ -58,11 +91,7 @@ int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
 			return 1;
 
 		di->counted_links++;
-		log_debug( _("Dir (0x%llx) incremented counted links to %u "
-			     "for (0x%llx) via %s\n"),
-			   (unsigned long long)referenced_from,
-			   di->counted_links,
-			   (unsigned long long)no.no_addr, why);
+		whyincr(no.no_addr, why, referenced_from, di->counted_links);
 		return 0;
 	}
 	ii = inodetree_find(no.no_addr);
@@ -72,27 +101,52 @@ int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
 			return 1;
 
 		ii->counted_links++;
-		log_debug( _("Dir (0x%llx) incremented counted "
-			     "links to %u for (0x%llx) via %s\n"),
-			   (unsigned long long)referenced_from,
-			   ii->counted_links, (unsigned long long)no.no_addr,
-			   why);
+		whyincr(no.no_addr, why, referenced_from, ii->counted_links);
 		return 0;
 	}
-	log_debug( _("Ref: (0x%llx) No match found when incrementing "
-		     "link for (0x%llx)!\n"),
-		   (unsigned long long)referenced_from,
-		   (unsigned long long)no.no_addr);
-	/* If no match was found, add a new entry and set its
-	 * counted links to 1 */
+	if (link1_type(&clink1map, no.no_addr) != 1) {
+		link1_set(&clink1map, no.no_addr, 1);
+		whyincr(no.no_addr, why, referenced_from, 1);
+		return 0;
+	}
+
+	link_ip = fsck_load_inode(ip->i_sbd, no.no_addr);
+	/* Check formal ino against dinode before adding to inode tree. */
+	if (no.no_formal_ino != ip->i_di.di_num.no_formal_ino) {
+		fsck_inode_put(&link_ip);
+		return 1;
+	}
+	/* Move it from the link1 maps to a real inode tree entry */
+	link1_set(&nlink1map, no.no_addr, 0);
+	link1_set(&clink1map, no.no_addr, 0);
+
+	/* If no match was found, it must be a hard link. In theory, it can't
+	   be a duplicate because those were resolved in pass1b. Add a new
+	   inodetree entry and set its counted links to 2 */
 	ii = inodetree_insert(no);
-	if (ii)
-		ii->counted_links = 1;
-	else
+	if (!ii) {
+		log_debug( _("Ref: (0x%llx) Error incrementing link for "
+			     "(0x%llx)!\n"),
+			   (unsigned long long)referenced_from,
+			   (unsigned long long)no.no_addr);
+		fsck_inode_put(&link_ip);
 		return -1;
+	}
+	ii->di_num = link_ip->i_di.di_num;
+	fsck_inode_put(&link_ip);
+	ii->di_nlink = 1; /* Must be 1 or it wouldn't have gotten into the
+			     nlink1map */
+	ii->counted_links = 2;
+	whyincr(no.no_addr, why, referenced_from, ii->counted_links);
 	return 0;
 }
 
+#define whydecr(no_addr, why, referenced_from, counted_links)		\
+	log_debug(_("Dir (0x%llx) decremented counted links to %u "	\
+		    "for (0x%llx) via %s\n"),				\
+		  (unsigned long long)referenced_from, counted_links,	\
+		  (unsigned long long)no_addr, why);
+
 int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
 		    const char *why)
 {
@@ -109,11 +163,7 @@ int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
 			return 0;
 		}
 		di->counted_links--;
-		log_debug( _("Dir (0x%llx) decremented counted links to %u "
-			     "for (0x%llx) via %s\n"),
-			   (unsigned long long)referenced_from,
-			   di->counted_links, (unsigned long long)inode_no,
-			   why);
+		whydecr(inode_no, why, referenced_from, di->counted_links);
 		return 0;
 	}
 
@@ -129,13 +179,15 @@ int decr_link_count(uint64_t inode_no, uint64_t referenced_from, int gfs1,
 			return 0;
 		}
 		ii->counted_links--;
-		log_debug( _("Dir (0x%llx) decremented counted "
-			     "links to %u for (0x%llx) via %s\n"),
-			   (unsigned long long)referenced_from,
-			   ii->counted_links, (unsigned long long)inode_no,
-			   why);
+		whydecr(inode_no, why, referenced_from, ii->counted_links);
 		return 0;
 	}
+	if (link1_type(&clink1map, inode_no) == 1) { /* 1 -> 0 */
+		link1_set(&clink1map, inode_no, 0);
+		whydecr(inode_no, why, referenced_from, 0);
+		return 0;
+	}
+
 	log_debug( _("No match found when decrementing link for (0x%llx)!\n"),
 		   (unsigned long long)inode_no);
 	return -1;
diff --git a/gfs2/fsck/link.h b/gfs2/fsck/link.h
index 76a195a..14534e5 100644
--- a/gfs2/fsck/link.h
+++ b/gfs2/fsck/link.h
@@ -1,6 +1,10 @@
 #ifndef _LINK_H
 #define _LINK_H
 
+extern struct gfs2_bmap nlink1map; /* map of dinodes with nlink == 1 */
+extern struct gfs2_bmap clink1map; /* map of dinodes w/counted links == 1 */
+
+int link1_set(struct gfs2_bmap *bmap, uint64_t bblock, int mark);
 int set_di_nlink(struct gfs2_inode *ip);
 int incr_link_count(struct gfs2_inum no, struct gfs2_inode *ip,
 		    const char *why);
diff --git a/gfs2/fsck/main.c b/gfs2/fsck/main.c
index 91ebe28..c5967bc 100644
--- a/gfs2/fsck/main.c
+++ b/gfs2/fsck/main.c
@@ -20,6 +20,7 @@
 #include "copyright.cf"
 #include "libgfs2.h"
 #include "fsck.h"
+#include "link.h"
 #include "osi_list.h"
 #include "metawalk.h"
 #include "util.h"
@@ -352,6 +353,8 @@ int main(int argc, char **argv)
 	if (!opts.no && errors_corrected)
 		log_notice( _("Writing changes to disk\n"));
 	fsync(sdp->device_fd);
+	link1_destroy(&nlink1map);
+	link1_destroy(&clink1map);
 	destroy(sdp);
 	if (sb_fixed)
 		log_warn(_("Superblock was reset. Use tunegfs2 to manually "
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index b032029..dee8d5a 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -14,6 +14,7 @@
 
 #include <logging.h>
 #include "libgfs2.h"
+#include "link.h"
 #include "osi_tree.h"
 #include "fsck.h"
 #include "util.h"
@@ -88,6 +89,7 @@ int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk, int error_on_dinode,
 					ii = inodetree_find(blk);
 					if (ii)
 						inodetree_delete(ii);
+					link1_set(&nlink1map, blk, 0);
 				}
 				rgd->rg.rg_free++;
 				if (old_bitmap_state == GFS2_BLKST_DINODE)
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index f4eea49..ee20b9e 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -2018,6 +2018,20 @@ static int gfs2_blockmap_create(struct gfs2_bmap *bmap, uint64_t size)
 	return 0;
 }
 
+
+static int link1_create(struct gfs2_bmap *bmap, uint64_t size)
+{
+	bmap->size = size;
+
+	/* Have to add 1 to BLOCKMAP_SIZE since it's 0-based and mallocs
+	 * must be 1-based */
+	bmap->mapsize = BLOCKMAP_SIZE1(size) + 1;
+
+	if (!(bmap->map = calloc(bmap->mapsize, sizeof(char))))
+		return -ENOMEM;
+	return 0;
+}
+
 static struct gfs2_bmap *gfs2_bmap_create(struct gfs2_sbd *sdp, uint64_t size,
 					  uint64_t *addl_mem_needed)
 {
@@ -2054,6 +2068,14 @@ static void *gfs2_bmap_destroy(struct gfs2_sbd *sdp, struct gfs2_bmap *il)
 	return il;
 }
 
+static void enomem(uint64_t addl_mem_needed)
+{
+	log_crit( _("This system doesn't have enough memory and swap space to fsck this file system.\n"));
+	log_crit( _("Additional memory needed is approximately: %lluMB\n"),
+		  (unsigned long long)(addl_mem_needed / 1048576ULL));
+	log_crit( _("Please increase your swap space by that amount and run gfs2_fsck again.\n"));
+}
+
 /**
  * pass1 - walk through inodes and check inode state
  *
@@ -2079,10 +2101,20 @@ int pass1(struct gfs2_sbd *sdp)
 
 	bl = gfs2_bmap_create(sdp, last_fs_block+1, &addl_mem_needed);
 	if (!bl) {
-		log_crit( _("This system doesn't have enough memory and swap space to fsck this file system.\n"));
-		log_crit( _("Additional memory needed is approximately: %lluMB\n"),
-			 (unsigned long long)(addl_mem_needed / 1048576ULL));
-		log_crit( _("Please increase your swap space by that amount and run gfs2_fsck again.\n"));
+		enomem(addl_mem_needed);
+		return FSCK_ERROR;
+	}
+	addl_mem_needed = link1_create(&nlink1map, last_fs_block+1);
+	if (addl_mem_needed) {
+		enomem(addl_mem_needed);
+		gfs2_bmap_destroy(sdp, bl);
+		return FSCK_ERROR;
+	}
+	addl_mem_needed = link1_create(&clink1map, last_fs_block+1);
+	if (addl_mem_needed) {
+		enomem(addl_mem_needed);
+		link1_destroy(&nlink1map);
+		gfs2_bmap_destroy(sdp, bl);
 		return FSCK_ERROR;
 	}
 	osi_list_init(&gfs1_rindex_blks.list);
diff --git a/gfs2/fsck/pass1b.c b/gfs2/fsck/pass1b.c
index db477ff..d6ab285 100644
--- a/gfs2/fsck/pass1b.c
+++ b/gfs2/fsck/pass1b.c
@@ -10,6 +10,7 @@
 
 #include <logging.h>
 #include "libgfs2.h"
+#include "link.h"
 #include "fsck.h"
 #include "osi_list.h"
 #include "util.h"
@@ -353,6 +354,8 @@ static void resolve_dup_references(struct gfs2_sbd *sdp, struct duptree *dt,
 				di = dirtree_find(ip->i_di.di_num.no_addr);
 				if (di)
 					dirtree_delete(di);
+				link1_set(&nlink1map, ip->i_di.di_num.no_addr,
+					  0);
 				fsck_bitmap_set(ip, ip->i_di.di_num.no_addr,
 						_("duplicate referencing bad"),
 						GFS2_BLKST_UNLINKED);
diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c
index 3a8646f..d4f50d2 100644
--- a/gfs2/fsck/pass2.c
+++ b/gfs2/fsck/pass2.c
@@ -195,6 +195,8 @@ static int bad_formal_ino(struct gfs2_inode *ip, struct gfs2_dirent *dent,
 		di = dirtree_find(entry.no_addr);
 		if (di)
 			inum = di->dinode;
+		else if (link1_type(&clink1map, entry.no_addr) == 1)
+			inum = entry;
 	}
 	log_err( _("Directory entry '%s' pointing to block %llu (0x%llx) in "
 		   "directory %llu (0x%llx) has the wrong 'formal' inode "
@@ -613,6 +615,12 @@ static int basic_dentry_checks(struct gfs2_inode *ip, struct gfs2_dirent *dent,
 		di = dirtree_find(entry->no_addr);
 		if (di)
 			inum = di->dinode;
+		else if (link1_type(&nlink1map, entry->no_addr) == 1) {
+			/* Since we don't have ii or di, the only way to
+			   validate formal_ino is to read in the inode, which
+			   would kill performance. So skip it for now. */
+			return 0;
+		}
 	}
 	if (inum.no_formal_ino != entry->no_formal_ino) {
 		log_err( _("Directory entry '%s' pointing to block %llu "
diff --git a/gfs2/fsck/pass4.c b/gfs2/fsck/pass4.c
index c65d321..16339cc 100644
--- a/gfs2/fsck/pass4.c
+++ b/gfs2/fsck/pass4.c
@@ -9,6 +9,7 @@
 #include <logging.h>
 #include "libgfs2.h"
 #include "fsck.h"
+#include "link.h"
 #include "lost_n_found.h"
 #include "inode_hash.h"
 #include "metawalk.h"
@@ -153,11 +154,30 @@ static void handle_inconsist(struct gfs2_sbd *sdp, uint64_t no_addr,
 	}
 }
 
+static int adjust_lf_links(int lf_addition)
+{
+	struct dir_info *lf_di;
+
+	if (lf_dip == NULL)
+		return 0;
+
+	if (!lf_addition)
+		return 0;
+
+	if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
+		log_crit(_("Unable to find lost+found inode in "
+			   "inode_hash!!\n"));
+		return -1;
+	} else {
+		fix_link_count(lf_di->counted_links, lf_dip);
+	}
+	return 0;
+}
+
 static int scan_inode_list(struct gfs2_sbd *sdp)
 {
 	struct osi_node *tmp, *next = NULL;
 	struct inode_info *ii;
-	struct dir_info *lf_di;
 	int lf_addition = 0;
 
 	/* FIXME: should probably factor this out into a generic
@@ -187,25 +207,13 @@ static int scan_inode_list(struct gfs2_sbd *sdp)
 			 (unsigned long long)ii->di_num.no_addr, ii->di_nlink);
 	} /* osi_list_foreach(tmp, list) */
 
-	if (lf_dip == NULL)
-		return 0;
-
-	if (lf_addition) {
-		if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
-			log_crit( _("Unable to find lost+found inode in inode_hash!!\n"));
-			return -1;
-		} else {
-			fix_link_count(lf_di->counted_links, lf_dip);
-		}
-	}
-
-	return 0;
+	return adjust_lf_links(lf_addition);
 }
 
 static int scan_dir_list(struct gfs2_sbd *sdp)
 {
 	struct osi_node *tmp, *next = NULL;
-	struct dir_info *di, *lf_di;
+	struct dir_info *di;
 	int lf_addition = 0;
 
 	/* FIXME: should probably factor this out into a generic
@@ -232,19 +240,33 @@ static int scan_dir_list(struct gfs2_sbd *sdp)
 			 (unsigned long long)di->dinode.no_addr, di->di_nlink);
 	} /* osi_list_foreach(tmp, list) */
 
-	if (lf_dip == NULL)
-		return 0;
+	return adjust_lf_links(lf_addition);
+}
 
-	if (lf_addition) {
-		if (!(lf_di = dirtree_find(lf_dip->i_di.di_num.no_addr))) {
-			log_crit( _("Unable to find lost+found inode in inode_hash!!\n"));
-			return -1;
-		} else {
-			fix_link_count(lf_di->counted_links, lf_dip);
+static int scan_nlink1_list(struct gfs2_sbd *sdp)
+{
+	uint64_t blk;
+	uint32_t counted_links;
+	int lf_addition = 0;
+
+	for (blk = 0; blk < last_fs_block; blk++) {
+		if (skip_this_pass || fsck_abort)
+			return 0;
+		if (link1_type(&nlink1map, blk) == 0)
+			continue;
+
+		if (link1_type(&clink1map, blk) == 0) {
+			/* In other cases, counted_links is a pointer to a
+			   real count that gets incremented when it's added
+			   to lost+found. In this case, however, there's not a
+			   real count, so we fake it out to be 1. */
+			counted_links = 1;
+			if (handle_unlinked(sdp, blk, &counted_links,
+					    &lf_addition))
+				continue;
 		}
 	}
-
-	return 0;
+	return adjust_lf_links(lf_addition);
 }
 
 /**
@@ -261,15 +283,21 @@ int pass4(struct gfs2_sbd *sdp)
 	if (lf_dip)
 		log_debug( _("At beginning of pass4, lost+found entries is %u\n"),
 				  lf_dip->i_di.di_entries);
-	log_info( _("Checking inode reference counts.\n"));
+	log_info( _("Checking inode reference counts: multi-links.\n"));
 	if (scan_inode_list(sdp)) {
 		stack;
 		return FSCK_ERROR;
 	}
+	log_info( _("Checking inode reference counts: directories.\n"));
 	if (scan_dir_list(sdp)) {
 		stack;
 		return FSCK_ERROR;
 	}
+	log_info( _("Checking inode reference counts: normal links.\n"));
+	if (scan_nlink1_list(sdp)) {
+		stack;
+		return FSCK_ERROR;
+	}
 
 	if (lf_dip)
 		log_debug( _("At end of pass4, lost+found entries is %u\n"),
diff --git a/gfs2/fsck/util.h b/gfs2/fsck/util.h
index aa01552..4545594 100644
--- a/gfs2/fsck/util.h
+++ b/gfs2/fsck/util.h
@@ -23,9 +23,12 @@ extern void dup_listent_delete(struct duptree *dt, struct inode_with_dups *id);
 
 extern const char *reftypes[ref_types + 1];
 
+#define BLOCKMAP_SIZE1(size) ((size) >> 3)
 #define BLOCKMAP_SIZE2(size) ((size) >> 2)
 #define BLOCKMAP_BYTE_OFFSET2(x) ((x & 0x0000000000000003) << 1)
+#define BLOCKMAP_BYTE_OFFSET1(x) (x & 0x0000000000000007)
 #define BLOCKMAP_MASK2 (0x3)
+#define BLOCKMAP_MASK1 (1)
 
 struct fsck_pass {
 	const char *name;
@@ -44,6 +47,26 @@ static inline int block_type(struct gfs2_bmap *bl, uint64_t bblock)
 	return btype;
 }
 
+static inline int link1_type(struct gfs2_bmap *bl, uint64_t bblock)
+{
+	static unsigned char *byte;
+	static uint64_t b;
+	static int btype;
+
+	byte = bl->map + BLOCKMAP_SIZE1(bblock);
+	b = BLOCKMAP_BYTE_OFFSET1(bblock);
+	btype = (*byte & (BLOCKMAP_MASK1 << b )) >> b;
+	return btype;
+}
+
+static inline void link1_destroy(struct gfs2_bmap *bmap)
+{
+	if (bmap->map)
+		free(bmap->map);
+	bmap->size = 0;
+	bmap->mapsize = 0;
+}
+
 static inline int bitmap_type(struct gfs2_sbd *sdp, uint64_t bblock)
 {
 	struct rgrp_tree *rgd;
-- 
2.5.5




More information about the Cluster-devel mailing list