[dm-devel] [PATCH 2/4] kpartx: read devices with direct IO

Benjamin Marzinski bmarzins at redhat.com
Thu Jul 2 16:14:59 UTC 2020


On Thu, Jul 02, 2020 at 03:00:37PM +0000, Martin Wilck wrote:
> On Wed, 2020-07-01 at 17:39 -0500, Benjamin Marzinski wrote:
> > If kpartx is used on top of shared storage, and a device has its
> > partition table changed on one machine, and then kpartx is run on
> > another, it may not see the new data, because the cache still
> > contains
> > the old data, and there is nothing to tell the machine running kpartx
> > to
> > invalidate it. To solve this, kpartx should read the devices using
> > direct io.
> > 
> > One issue with how this code has been updated is that the original
> > code
> > for getblock() always read 1024 bytes. The new code reads a logical
> > sector size chunk of the device, and returns a pointer to the 512
> > byte
> > sector that the caller asked for, within that (possibly larger)
> > chunk.
> > This means that if the logical sector size is 512, then the code is
> > now
> > only reading 512 bytes.  Looking through the code for the various
> > partition types, I can't see a case where more than 512 bytes is
> > needed
> > and getblock() is used.  If anyone has a reason why this code should
> > be
> > reading 1024 bytes at minmum, I can certainly change this.  But when
> > I
> > looked, I couldn't find a case where reading 512 bytes would cause a
> > problem.
> > 
> > Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> 
> The value 1024 comes from first partx in util-linux, where Christophe 
> pulled the first kpartx from: 
> http://ftp.be.debian.org/pub/linux/utils/util-linux/v2.10/util-linux-2.10m.tar.gz 
> I, too, see no reason for reading more than a single block.
> 
> Two nits below. Here's one more, I'd have preferred two separate
> patches for these two different issues. But never mind.
> 
> Regards,
> Martin
> 
> > ---
> >  kpartx/dasd.c   |  7 ++++---
> >  kpartx/gpt.c    | 22 ++++++++++------------
> >  kpartx/kpartx.c | 46 +++++++++++++++++++++++++++++++++++-----------
> >  kpartx/kpartx.h |  2 ++
> >  4 files changed, 51 insertions(+), 26 deletions(-)
> > 
> > diff --git a/kpartx/dasd.c b/kpartx/dasd.c
> > index 14b9d3aa..f0398645 100644
> > --- a/kpartx/dasd.c
> > +++ b/kpartx/dasd.c
> > @@ -22,6 +22,7 @@
> >   * along with this program.  If not, see <
> > http://www.gnu.org/licenses/>;.
> >   */
> >  
> > +#define _GNU_SOURCE
> >  #include <stdio.h>
> >  #include <stdlib.h>
> >  #include <unistd.h>
> > @@ -117,13 +118,13 @@ read_dasd_pt(int fd, __attribute__((unused))
> > struct slice all,
> >  
> >  		sprintf(pathname, "/dev/.kpartx-node-%u-%u",
> >  			(unsigned int)major(dev), (unsigned
> > int)minor(dev));
> > -		if ((fd_dasd = open(pathname, O_RDONLY)) == -1) {
> > +		if ((fd_dasd = open(pathname, O_RDONLY | O_DIRECT)) ==
> > -1) {
> >  			/* Devicenode does not exist. Try to create one
> > */
> >  			if (mknod(pathname, 0600 | S_IFBLK, dev) == -1)
> > {
> >  				/* Couldn't create a device node */
> >  				return -1;
> >  			}
> > -			fd_dasd = open(pathname, O_RDONLY);
> > +			fd_dasd = open(pathname, O_RDONLY | O_DIRECT);
> >  			/*
> >  			 * The file will vanish when the last process
> > (we)
> >  			 * has ceased to access it.
> > @@ -175,7 +176,7 @@ read_dasd_pt(int fd, __attribute__((unused))
> > struct slice all,
> >  	 * Get volume label, extract name and type.
> >  	 */
> >  
> > -	if (!(data = (unsigned char *)malloc(blocksize)))
> > +	if (aligned_malloc((void **)&data, blocksize, NULL))
> >  		goto out;
> >  
> >  
> > diff --git a/kpartx/gpt.c b/kpartx/gpt.c
> > index 785b34ea..f7fefb70 100644
> > --- a/kpartx/gpt.c
> > +++ b/kpartx/gpt.c
> > @@ -243,8 +243,7 @@ alloc_read_gpt_entries(int fd, gpt_header * gpt)
> >  
> >  	if (!count) return NULL;
> >  
> > -	pte = (gpt_entry *)malloc(count);
> > -	if (!pte)
> > +	if (aligned_malloc((void **)&pte, get_sector_size(fd), &count))
> >  		return NULL;
> >  	memset(pte, 0, count);
> >  
> > @@ -269,12 +268,11 @@ static gpt_header *
> >  alloc_read_gpt_header(int fd, uint64_t lba)
> >  {
> >  	gpt_header *gpt;
> > -	gpt = (gpt_header *)
> > -	    malloc(sizeof (gpt_header));
> > -	if (!gpt)
> > +	size_t size = sizeof (gpt_header);
> > +	if (aligned_malloc((void **)&gpt, get_sector_size(fd), &size))
> >  		return NULL;
> > -	memset(gpt, 0, sizeof (*gpt));
> > -	if (!read_lba(fd, lba, gpt, sizeof (gpt_header))) {
> > +	memset(gpt, 0, size);
> > +	if (!read_lba(fd, lba, gpt, size)) {
> >  		free(gpt);
> >  		return NULL;
> >  	}
> > @@ -498,6 +496,7 @@ find_valid_gpt(int fd, gpt_header ** gpt,
> > gpt_entry ** ptes)
> >  	gpt_header *pgpt = NULL, *agpt = NULL;
> >  	gpt_entry *pptes = NULL, *aptes = NULL;
> >  	legacy_mbr *legacymbr = NULL;
> > +	size_t size = sizeof(legacy_mbr);
> >  	uint64_t lastlba;
> >  	if (!gpt || !ptes)
> >  		return 0;
> > @@ -526,11 +525,10 @@ find_valid_gpt(int fd, gpt_header ** gpt,
> > gpt_entry ** ptes)
> >  	}
> >  
> >  	/* This will be added to the EFI Spec. per Intel after v1.02.
> > */
> > -	legacymbr = malloc(sizeof (*legacymbr));
> > -	if (legacymbr) {
> > -		memset(legacymbr, 0, sizeof (*legacymbr));
> > -		read_lba(fd, 0, (uint8_t *) legacymbr,
> > -			 sizeof (*legacymbr));
> > +	if (aligned_malloc((void **)&legacymbr, get_sector_size(fd),
> > +			   &size) == 0) {
> > +		memset(legacymbr, 0, size);
> > +		read_lba(fd, 0, (uint8_t *) legacymbr, size);
> >  		good_pmbr = is_pmbr_valid(legacymbr);
> >  		free(legacymbr);
> >  		legacymbr=NULL;
> > diff --git a/kpartx/kpartx.c b/kpartx/kpartx.c
> > index d3620c5c..4716dd4d 100644
> > --- a/kpartx/kpartx.c
> > +++ b/kpartx/kpartx.c
> > @@ -19,6 +19,7 @@
> >   * cva, 2002-10-26
> >   */
> >  
> > +#define _GNU_SOURCE
> >  #include <stdio.h>
> >  #include <fcntl.h>
> >  #include <errno.h>
> > @@ -41,7 +42,6 @@
> >  
> >  #define SIZE(a) (sizeof(a)/sizeof((a)[0]))
> >  
> > -#define READ_SIZE	1024
> >  #define MAXTYPES	64
> >  #define MAXSLICES	256
> >  #define DM_TARGET	"linear"
> > @@ -388,7 +388,7 @@ main(int argc, char **argv){
> >  		set_delimiter(mapname, delim);
> >  	}
> >  
> > -	fd = open(device, O_RDONLY);
> > +	fd = open(device, O_RDONLY | O_DIRECT);
> >  
> >  	if (fd == -1) {
> >  		perror(device);
> > @@ -690,9 +690,9 @@ xmalloc (size_t size) {
> >   */
> >  
> >  static int
> > -sseek(int fd, unsigned int secnr) {
> > +sseek(int fd, unsigned int secnr, int secsz) {
> >  	off64_t in, out;
> > -	in = ((off64_t) secnr << 9);
> > +	in = ((off64_t) secnr * secsz);
> >  	out = 1;
> >  
> >  	if ((out = lseek64(fd, in, SEEK_SET)) != in)
> > @@ -703,6 +703,21 @@ sseek(int fd, unsigned int secnr) {
> >  	return 0;
> >  }
> >  
> > +int
> > +aligned_malloc(void **mem_p, size_t align, size_t *size_p)
> > +{
> > +	size_t pgsize = getpagesize();
> 
> Nit: I'd use a static variable here and call getpagesize() only once.
> 
> > +	size_t size = pgsize;
> > +	if (!mem_p || !align || (size_p && !*size_p))
> > +		return EINVAL;
> > +
> > +	if (size_p)
> > +		*size_p = size = ((*size_p + align - 1) / align) *
> > align;
> 
> It would be cleaner to set *size_p only after successful return from
> posix_memalign().

Sure. I can change these.

-Ben
 
> > +
> > +	return posix_memalign(mem_p, pgsize, size);
> > +}
> 
> 
> 
> 
> > +
> > +/* always in sector size blocks */
> >  static
> >  struct block {
> >  	unsigned int secnr;
> > @@ -710,30 +725,39 @@ struct block {
> >  	struct block *next;
> >  } *blockhead;
> >  
> > +/* blknr is always in 512 byte blocks */
> >  char *
> > -getblock (int fd, unsigned int secnr) {
> > +getblock (int fd, unsigned int blknr) {
> > +	unsigned int secsz = get_sector_size(fd);
> > +	unsigned int blks_per_sec = secsz / 512;
> > +	unsigned int secnr = blknr / blks_per_sec;
> > +	unsigned int blk_off = (blknr % blks_per_sec) * 512;
> >  	struct block *bp;
> >  
> >  	for (bp = blockhead; bp; bp = bp->next)
> >  
> >  		if (bp->secnr == secnr)
> > -			return bp->block;
> > +			return bp->block + blk_off;
> >  
> > -	if (sseek(fd, secnr))
> > +	if (sseek(fd, secnr, secsz))
> >  		return NULL;
> >  
> >  	bp = xmalloc(sizeof(struct block));
> >  	bp->secnr = secnr;
> >  	bp->next = blockhead;
> >  	blockhead = bp;
> > -	bp->block = (char *) xmalloc(READ_SIZE);
> > +	if (aligned_malloc((void **)&bp->block, secsz, NULL)) {
> > +		fprintf(stderr, "aligned_malloc failed\n");
> > +		exit(1);
> > +	}
> >  
> > -	if (read(fd, bp->block, READ_SIZE) != READ_SIZE) {
> > +	if (read(fd, bp->block, secsz) != secsz) {
> >  		fprintf(stderr, "read error, sector %d\n", secnr);
> > -		bp->block = NULL;
> > +		blockhead = bp->next;
> > +		return NULL;
> >  	}
> >  
> > -	return bp->block;
> > +	return bp->block + blk_off;
> >  }
> >  
> >  int
> > diff --git a/kpartx/kpartx.h b/kpartx/kpartx.h
> > index 67edeb82..727632c1 100644
> > --- a/kpartx/kpartx.h
> > +++ b/kpartx/kpartx.h
> > @@ -1,6 +1,7 @@
> >  #ifndef _KPARTX_H
> >  #define _KPARTX_H
> >  
> > +#include <stddef.h>
> >  #include <stdint.h>
> >  #include <sys/ioctl.h>
> >  
> > @@ -61,6 +62,7 @@ extern ptreader read_mac_pt;
> >  extern ptreader read_sun_pt;
> >  extern ptreader read_ps3_pt;
> >  
> > +int aligned_malloc(void **mem_p, size_t align, size_t *size_p);
> >  char *getblock(int fd, unsigned int secnr);
> >  
> >  static inline unsigned int
> 
> -- 
> Dr. Martin Wilck <mwilck at suse.com>, Tel. +49 (0)911 74053 2107
> SUSE  Software Solutions Germany GmbH
> HRB 36809, AG Nürnberg GF: Felix
> Imendörffer
> 




More information about the dm-devel mailing list