#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif

#ifndef _FILE_OFFSET_BITS
# define _FILE_OFFSET_BITS 64 ///< We must be in 64-bit file I/O mode
#endif

#ifndef _XOPEN_SOURCE
# define _XOPEN_SOURCE 500
#endif

/*
 * NOTE(review): this header list is reconstructed from the identifiers the
 * file uses (stdio, allocation, string and errno helpers, pread/pwrite,
 * open/O_DIRECT, fixed-width integers, __bswap_*, __BYTE_ORDER).
 * Confirm it against the upstream source.
 */
#include <byteswap.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* FROM dm.h */
#define SECTOR_SHIFT 9
#define SECSIZE 512

/*
 * Magic for persistent snapshots
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata
 */
#define SNAPSHOT_DISK_VERSION 1

/*
 * LVM snapshot on-disk Layout
 *
 * The disk is divided into chunks each of (chunk_size * sector_size) bytes.
 * The boxes below represent chunks. chunk_size itself is stored in the header.
 *
 *     +-------+ 0
 *     |       | ====> contain header info
 *     |       |
 *     |       |
 *     |-------| 1
 *     |       | ====> contains metadata. i.e. exceptions_per_area (say X)
 *     |       |       (lv_chunk, cow_chunk) pairs. the next X chunks have
 *     |       |       the actual data.
 *     |-------| 2
 *     |       |
 *     |       |
 *     |       |
 *     |-------| 3
 *     .       .
 *     .       .
 *     .       .
 *     |-------| X + 2
 *     |       | =====> meta data chunk again ! similar to 1.
 *     |       |
 *     |       |
 *     |-------| X + 3
 *     .       .
 *     .       .
 *     .       .
 *
 * Specifically, with an 8192 byte chunk size and 16 bytes per exception entry,
 * we have 512 exception entries per metadata chunk. The disk layout is
 * therefore:
 *     0x00000000   Header Information
 *     0x00002000   1st Metadata area: 512 exception entries
 *     0x00004000   1st exception data block
 *     0x00006000   2nd exception data block
 *     ...
 *     0x00402000   512th exception data block
 *     0x00404000   2nd Metadata area: 512 exception entries
 *     0x00406000   1st exception data block
 *     0x00408000   2nd exception data block
 *     ...
 *     0x00804000   512th exception data block
 *     0x00806000   3rd Metadata area: 512 exception entries
 *     etc
 *
 *
 * The snapshot COW table is filled from the beginning to the end. Each
 * metadata area is completely filled so the end of the file can be determined
 * by looking for a metadata area where the COW chunk offset is 0. This is
 * illegal because offset 0 in the COW file is the header and not a data block,
 * and therefore marks the end of the table.
 *
 * There is no attempt to optimise the table for faster searches using hash
 * tables or similar. These may be done if the table is loaded into memory and
 * then restructured. If the table is not loaded into memory and the on-disk
 * layout must be used, a linear search is the only solution to determine if a
 * COW block exists. This is not a problem if the COW data is small, but if
 * the LV is large and the COW proportion significant then a lot of metadata
 * must be read from disk (or cached in memory).
 * e.g. 100GB LV with 10% COW
 *      COW data                = 10GB
 *      COW data blocks         = 1,310,720 blocks
 *      COW metadata areas      = 2,560
 *      COW metadata total size = 20MB
 *
 * Hence, in this example, checking to see if a single 8KB block of data exists
 * in the COW will require up to 2,560 separate disk reads totalling 20 MB of
 * data.
 */

/* Header stored at the start of chunk 0 of the COW device. */
struct disk_header {
    uint32_t magic;

    /*
     * Is this snapshot valid. There is no way of recovering
     * an invalid snapshot.
     */
    uint32_t valid;

    /*
     * Simple, incrementing version. no backward
     * compatibility.
     */
    uint32_t version;

    /* In sectors */
    uint32_t chunk_size;

    // The rest of the header chunk is unused
};

/* One entry of a metadata area: maps a chunk of the LV to a chunk of the COW. */
struct disk_exception {
    uint64_t lv_chunk;
    uint64_t cow_chunk;
};

/*
 * Unit conversions. All three read the global chunk_size (in sectors), so
 * they are only meaningful once it has been set from the snapshot header.
 */
#define CHUNK2SECTOR(_chunk) ((_chunk) * chunk_size)
#define SECTOR2CHUNK(_sec) ((_sec) / chunk_size)
#define CHUNK2BYTES(_chunk) ((CHUNK2SECTOR(_chunk)) * (1 << SECTOR_SHIFT))

/* Size of the bounce buffer used when copying COW data back to the LV. */
#define DISK_BUF_SIZE (1024*1024)

/* The on-disk metadata is little-endian; convert to host byte order. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define le32_to_cpu(x) (x)
#define le64_to_cpu(x) (x)
#else
#define le32_to_cpu(x) __bswap_32(x)
#define le64_to_cpu(x) __bswap_64(x)
#endif

/*
 * Globals used in the file
 */
int32_t chunk_size; /* chunk size in sectors */

// ----------------------------------------------------------------------------
// Wrapper functions since it is permitted for a file read/write operation to
// return a partial result.
// ---------------------------------------------------------------------------- ssize_t safe_pread(int fildes, void *buf, size_t nbyte, off_t offset) { ssize_t res; ssize_t total; // printf("safe_pread(%i,%p,%lli,0x%016llx)\n", fildes, buf, (off_t)nbyte, offset); for(total = 0; total < nbyte; total += res) { res = pread(fildes, buf + total, nbyte - total, offset + total); if (res < 0) { fprintf(stderr,"File read error in %s line %i, error %i (%s)\n", __FILE__, __LINE__, errno, strerror(errno)); exit(1); } } return total; } ssize_t safe_pwrite(int fildes, void *buf, size_t nbyte, off_t offset) { ssize_t res; ssize_t total; // printf("safe_pwrite(%i,%p,%lli,0x%016llx)\n", fildes, buf, (off_t)nbyte, offset); for(total = 0; total < nbyte; total += res) { res = pwrite(fildes, buf + total, nbyte - total, offset + total); if (res < 0) { fprintf(stderr,"File read error in %s line %i, error %i (%s)\n", __FILE__, __LINE__, errno, strerror(errno)); exit(1); } } return total; } // ---------------------------------------------------------------------------- // Wrapper functions to allow O_DIRECT file access to be robust like general // file access. This has a performance impact if it has to rewrite the call // but makes life much easier at the higher layers. // ---------------------------------------------------------------------------- int isAligned(off_t offset) { int pagesize=getpagesize(); return !(offset % pagesize); } off_t getPageBase(off_t offset) { int pagesize=getpagesize(); return (offset/pagesize)*pagesize; } off_t getPageOffset(off_t offset) { int pagesize=getpagesize(); return offset % pagesize; } ssize_t getPageLength(off_t offset, ssize_t len) { int pagesize=getpagesize(); return ((offset+len + pagesize - 1)/pagesize)*pagesize; } ssize_t pread_direct(int fd, void *buf, size_t nbyte, off_t pos) { if (isAligned((unsigned)buf) && isAligned(nbyte) && isAligned(pos)) { // printf("SAFE PREAD DIRECT\n"); // All parameters are safe, so call directly. 
return safe_pread(fd, buf, nbyte, pos); } else { // We cannot take any chances so we must reallocate the required buffer ssize_t r; off_t readBase = getPageBase(pos); off_t readLen = getPageLength(pos, nbyte); void *alignedbuff = valloc(readLen); if (!alignedbuff) { printf("Cannot allocate buffer\n"); exit(1); } // printf("UNSAFE PREAD DIRECT(%i,%p,%lli,0x%016llx)\n", fd, buf, (off_t)nbyte, pos); r = safe_pread(fd, alignedbuff, readLen, readBase); if (r < 0) { fprintf(stderr,"File read error in %s line %i, error %i (%s)\n", __FILE__, __LINE__, errno, strerror(errno)); exit(1); } memcpy(buf, alignedbuff + getPageOffset(pos), nbyte); free(alignedbuff); return (nbyte < r)?nbyte:r; } } void restore_cow_block(int ofd, int ifd, off_t dst, off_t src, ssize_t length) { ssize_t allocSize = (DISK_BUF_SIZE < length)?DISK_BUF_SIZE:length; char *buf = valloc(allocSize); // printf("restore_cow_block(%i,%i,0x%016llx,0x%016llx, %lli)\n", // ofd, ifd, dst, src, (long long)length); off_t done, burst; for(done = 0; done < length; done += burst) { off_t remaining = length - done; burst = (remaining > allocSize)?allocSize:remaining; pread_direct(ifd, buf, burst, src + done); safe_pwrite(ofd, buf, burst, dst + done); } free(buf); } static int read_header(int ifd, struct disk_header *dh) { pread_direct(ifd, dh, sizeof(*dh), 0); // Perform endian corrections in-situ dh->valid = le32_to_cpu(dh->valid); dh->version = le32_to_cpu(dh->version); dh->chunk_size = le32_to_cpu(dh->chunk_size); return 0; } /* * convert (if reqd), * from : /dev/vg/lv * * to : /dev/mapper/vg-lv-cow */ static int convert_to_mapper(char *lvm_dev, char **mapper_dev) { char *lv; if (lvm_dev[0] != '/') { printf("convert_to_mapper: %s not an absolute path\n", lvm_dev); return -1; } /* 5 -> "/dev/" */ if (!(lv = strrchr(lvm_dev + 5, '/'))) { printf("convert_to_mapper: invalid path %s\n", lvm_dev); return -1; } /* * check if this is already a mapper device */ if (!strncmp(lvm_dev + 5, "mapper", 6)) { 
printf("convert_to_mapper: should not already be a mapper path %s\n", lvm_dev); return -1; } /* 12 -> "mapper", "-", "-cow", '\0' */ *mapper_dev = malloc(strlen(lvm_dev) + 12); strcpy(*mapper_dev, "/dev/mapper/"); /* copy vg */ strncat(*mapper_dev, lvm_dev + 5, (size_t) (lv - (lvm_dev + 5))); strcat(*mapper_dev, "-"); /* copy lv */ strcat(*mapper_dev, lv + 1); strcat(*mapper_dev, "-cow"); return 0; } static int process_mappings(int snapCowFd, int lvFd, char *buf, uint32_t exceptions_per_area, int *full) { unsigned int i; uint64_t lv_chunk, cow_chunk; struct disk_exception *de; static struct disk_exception de_prev = { 0, 0}; static uint64_t num_contig_chunks = 0; /* presume the area is full */ *full = 1; for (i = 0; i < exceptions_per_area; i++) { /* * buf contains a bunch of disk_exceptions */ de = ((struct disk_exception *) buf) + i; lv_chunk = le64_to_cpu(de->lv_chunk); cow_chunk = le64_to_cpu(de->cow_chunk); /* * If the cow_chunk is pointing at the start of * the COW device, where the first metadata area * is we know that we've hit the end of the * exceptions. Therefore the area is not full. */ if (cow_chunk == 0LL) { *full = 0; // printf("END COW BLOCK: 0x%016llx, 0x%016llx, 0x%016llx\n", // de_prev.lv_chunk, de_prev.cow_chunk, num_contig_chunks); if (num_contig_chunks) { restore_cow_block(lvFd, snapCowFd, CHUNK2BYTES(de_prev.lv_chunk), CHUNK2BYTES(de_prev.cow_chunk), CHUNK2BYTES(num_contig_chunks)); } break; } /* * check if this entry can be merged with previous entry * it can be if lv_chunk and cow_chunk both are * contiguous */ if (!num_contig_chunks) { /* * we're getting in for the first time. */ num_contig_chunks = 1; de_prev = *de; } else if ((de_prev.lv_chunk + num_contig_chunks == de->lv_chunk) && (de_prev.cow_chunk + num_contig_chunks == de->cow_chunk)) { /* * case of contiguous chunk. 
*/ num_contig_chunks++; } else { // printf("BREAK COW BLOCK: 0x%016llx, 0x%016llx, 0x%016llx\n", // de_prev.lv_chunk, de_prev.cow_chunk, num_contig_chunks); restore_cow_block(lvFd, snapCowFd, CHUNK2BYTES(de_prev.lv_chunk), CHUNK2BYTES(de_prev.cow_chunk), CHUNK2BYTES(num_contig_chunks)); num_contig_chunks = 1; de_prev = *de; } } return 0; } static int process_snapshot(int snapCowFd, int lvFd) { uint32_t area; int full = 1; char *buf; uint32_t exceptions_per_area; uint64_t chunk_offset, chunk_in_bytes, byte_offset; struct disk_header dh; /* * Read the snapshot header. */ read_header(snapCowFd, &dh); /* * Sanity checks. */ if (!dh.valid) { printf("read_snapshot_metadata: snapshot is marked" " invalid\n"); return -1; } if (dh.version != SNAPSHOT_DISK_VERSION) { printf("read_snapshot_metadata: unable to handle" " snapshot disk version %u\n", dh.version); return -1; } chunk_size = dh.chunk_size; /* * Read the metadata. */ chunk_in_bytes = (chunk_size << SECTOR_SHIFT); exceptions_per_area = chunk_in_bytes / sizeof (struct disk_exception); // Allocate an aligned buffer for each incoming disk metadata chunk buf = valloc(chunk_in_bytes); /* * Keeping reading chunks until we find a partially full area. */ for (area = 0; full; area++) { memset(buf, 0, chunk_in_bytes); /* * area is the index in the metadata area, which is laid * out every exceptions_per_area chunks on the disk * (see description at the beginning of the file) */ chunk_offset = 1 + (exceptions_per_area + 1) * area; byte_offset = chunk_offset * chunk_in_bytes; /* * read the metadata chunk */ pread_direct(snapCowFd, buf, chunk_in_bytes, byte_offset); process_mappings(snapCowFd, lvFd, buf, exceptions_per_area, &full); } return 0; } int main(int argc, char **argv) { int _f1, _f2; char *cowdevname; int snapCowFd, lvFd; if (argc != 3) { printf("Usage: %s \n", argv[0]); return -1; } _f1 = 1; _f2 = 2; // If we open the COW block device, we MUST open it in O_DIRECT mode to // stop Linux caching it. 
If instead we opened the LV snapshot rather than // the COW file, we could use normal file I/O (although it would be less // efficient as we already know the snapshot->COW mapping data). if (convert_to_mapper(argv[_f1], &cowdevname) < 0) { printf("%s: Bad snapshot name: %s\n", argv[0], argv[_f1]); return -1; } snapCowFd = open(cowdevname, O_RDONLY|O_DIRECT); if (snapCowFd < 0) { printf("%s: Unable to open %s\n", argv[0], argv[_f1]); return -1; } // The output LV can be opened in the normal way and can be cached if the // system wants to. lvFd = open(argv[_f2], O_RDWR); if (lvFd < 0) { printf("%s: Unable to open %s\n", argv[0], argv[_f2]); return -1; } return process_snapshot(snapCowFd, lvFd); }