cvs dm 1.0 --- diff/Documentation/Configure.help 2003-08-26 13:49:59.000000000 +0100 +++ source/Documentation/Configure.help 2003-08-26 13:59:04.000000000 +0100 @@ -1856,6 +1856,20 @@ want), say M here and read . The module will be called lvm-mod.o. +Device-mapper support +CONFIG_BLK_DEV_DM + Device-mapper is a low level volume manager. It works by allowing + people to specify mappings for ranges of logical sectors. Various + mapping types are available, in addition people may write their own + modules containing custom mappings if they wish. + + Higher level volume managers such as LVM2 use this driver. + + If you want to compile this as a module, say M here and read + . The module will be called dm-mod.o. + + If unsure, say N. + Multiple devices driver support (RAID and LVM) CONFIG_MD Support multiple physical spindles through a single logical device. --- diff/MAINTAINERS 2003-08-26 13:49:59.000000000 +0100 +++ source/MAINTAINERS 2003-08-26 13:59:04.000000000 +0100 @@ -554,6 +554,13 @@ W: http://www.debian.org/~dz/i8k/ S: Maintained +DEVICE MAPPER +P: Joe Thornber +M: dm@uk.sistina.com +L: linux-LVM@sistina.com +W: http://www.sistina.com/lvm +S: Maintained + DEVICE NUMBER REGISTRY P: H. Peter Anvin M: hpa@zytor.com --- diff/arch/mips64/kernel/ioctl32.c 2003-08-26 13:50:03.000000000 +0100 +++ source/arch/mips64/kernel/ioctl32.c 2003-08-26 14:18:17.000000000 +0100 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -1228,6 +1229,22 @@ IOCTL32_DEFAULT(SBPROF_ZBWAITFULL), #endif /* CONFIG_SIBYTE_TBPROF */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_TABLE_LOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_TABLE_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TABLE_STATUS), + IOCTL32_DEFAULT(DM_DEV_WAIT), + IOCTL32_DEFAULT(DM_LIST_DEVICES), + IOCTL32_DEFAULT(DM_TABLE_CLEAR), +#endif /* CONFIG_BLK_DEV_DM */ + IOCTL32_DEFAULT(MTIOCTOP), /* mtio.h ioctls */ IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans), IOCTL32_HANDLER(MTIOCPOS32, mt_ioctl_trans), --- diff/arch/parisc/kernel/ioctl32.c 2003-08-26 13:50:03.000000000 +0100 +++ source/arch/parisc/kernel/ioctl32.c 2003-08-26 13:59:04.000000000 +0100 @@ -55,6 +55,7 @@ #define max max */ #include #endif /* LVM */ +#include #include /* Ugly hack. 
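 *
 * For reference, the new device-mapper entries above need no 32-bit
 * translation handler because struct dm_ioctl is built from fixed-width
 * fields laid out the same way for 32-bit and 64-bit callers.  A minimal
 * sketch of such a caller (fd is assumed to be open on the device-mapper
 * control node):
 *
 *	struct dm_ioctl io;
 *
 *	memset(&io, 0, sizeof(io));
 *	io.version[0] = DM_VERSION_MAJOR;
 *	io.version[1] = DM_VERSION_MINOR;
 *	io.version[2] = DM_VERSION_PATCHLEVEL;
 *	io.data_size  = sizeof(io);
 *	ioctl(fd, DM_VERSION, &io);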
*/ @@ -3423,6 +3424,22 @@ COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE) COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC) COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID) --- diff/arch/ppc64/kernel/ioctl32.c 2003-08-26 13:50:04.000000000 +0100 +++ source/arch/ppc64/kernel/ioctl32.c 2003-08-26 13:59:04.000000000 +0100 @@ -66,6 +66,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4435,6 +4436,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG), COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS), COMPATIBLE_IOCTL(NBD_DISCONNECT), +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION), +COMPATIBLE_IOCTL(DM_REMOVE_ALL), +COMPATIBLE_IOCTL(DM_DEV_CREATE), +COMPATIBLE_IOCTL(DM_DEV_REMOVE), +COMPATIBLE_IOCTL(DM_TABLE_LOAD), +COMPATIBLE_IOCTL(DM_DEV_SUSPEND), +COMPATIBLE_IOCTL(DM_DEV_RENAME), +COMPATIBLE_IOCTL(DM_TABLE_DEPS), +COMPATIBLE_IOCTL(DM_DEV_STATUS), +COMPATIBLE_IOCTL(DM_TABLE_STATUS), +COMPATIBLE_IOCTL(DM_DEV_WAIT), +COMPATIBLE_IOCTL(DM_LIST_DEVICES), +COMPATIBLE_IOCTL(DM_TABLE_CLEAR), +#endif /* CONFIG_BLK_DEV_DM */ /* Remove *PRIVATE in 2.5 */ COMPATIBLE_IOCTL(SIOCDEVPRIVATE), COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1), --- diff/arch/s390x/kernel/ioctl32.c 2003-08-26 13:50:04.000000000 +0100 +++ source/arch/s390x/kernel/ioctl32.c 2003-08-26 14:13:12.000000000 +0100 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -627,6 +628,20 @@ IOCTL32_DEFAULT(SIOCGSTAMP), + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_TABLE_LOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_TABLE_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TABLE_STATUS), + IOCTL32_DEFAULT(DM_DEV_WAIT), + IOCTL32_DEFAULT(DM_LIST_DEVICES), + IOCTL32_DEFAULT(DM_TABLE_CLEAR), + IOCTL32_DEFAULT(LOOP_SET_FD), IOCTL32_DEFAULT(LOOP_CLR_FD), --- diff/arch/sparc64/kernel/ioctl32.c 2003-08-26 13:50:05.000000000 +0100 +++ source/arch/sparc64/kernel/ioctl32.c 2003-08-26 13:59:04.000000000 +0100 @@ -56,6 +56,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. 
*/ @@ -5086,6 +5087,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ /* Linux-1394 */ #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE) COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL) --- diff/arch/x86_64/ia32/ia32_ioctl.c 2003-08-26 13:50:05.000000000 +0100 +++ source/arch/x86_64/ia32/ia32_ioctl.c 2003-08-26 13:59:04.000000000 +0100 @@ -67,6 +67,7 @@ #define max max #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4047,6 +4048,22 @@ COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ #ifdef CONFIG_AUTOFS_FS COMPATIBLE_IOCTL(AUTOFS_IOC_READY) COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL) --- diff/drivers/md/Config.in 2001-09-26 16:15:05.000000000 +0100 +++ source/drivers/md/Config.in 2003-08-26 13:59:04.000000000 +0100 @@ -14,5 +14,9 @@ dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD + dep_tristate ' Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM +fi endmenu --- diff/drivers/md/Makefile 2002-01-17 10:07:52.000000000 +0000 +++ source/drivers/md/Makefile 2003-08-26 13:59:04.000000000 +0100 @@ -4,24 +4,41 @@ O_TARGET := mddev.o -export-objs := md.o xor.o -list-multi := lvm-mod.o +export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \ + dm-log.o dm-io.o dm.o + +list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \ + kcopyd.o dm-daemon.o dm-io.o +dm-mirror-mod-objs := dm-raid1.o dm-log.o # Note: link order is important. All raid personalities # and xor.o must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. 
-obj-$(CONFIG_MD_LINEAR) += linear.o -obj-$(CONFIG_MD_RAID0) += raid0.o -obj-$(CONFIG_MD_RAID1) += raid1.o -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_BLK_DEV_MD) += md.o -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +obj-$(CONFIG_MD_MULTIPATH) += multipath.o +obj-$(CONFIG_BLK_DEV_MD) += md.o + +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o + +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o +obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o include $(TOPDIR)/Rules.make lvm-mod.o: $(lvm-mod-objs) $(LD) -r -o $@ $(lvm-mod-objs) + +dm-mod.o: $(dm-mod-objs) + $(LD) -r -o $@ $(dm-mod-objs) + +dm-mirror.o: $(dm-mirror-mod-objs) + $(LD) -r -o $@ $(dm-mirror-mod-objs) + --- diff/fs/buffer.c 2003-08-26 13:50:12.000000000 +0100 +++ source/fs/buffer.c 2003-08-26 13:59:04.000000000 +0100 @@ -756,6 +756,7 @@ bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_private = private; + bh->b_journal_head = NULL; } static void end_buffer_io_async(struct buffer_head * bh, int uptodate) --- diff/fs/jbd/journal.c 2003-08-26 13:50:12.000000000 +0100 +++ source/fs/jbd/journal.c 2003-08-26 13:59:04.000000000 +0100 @@ -1802,9 +1802,9 @@ if (buffer_jbd(bh)) { /* Someone did it for us! */ - J_ASSERT_BH(bh, bh->b_private != NULL); + J_ASSERT_BH(bh, bh->b_journal_head != NULL); journal_free_journal_head(jh); - jh = bh->b_private; + jh = bh->b_journal_head; } else { /* * We actually don't need jh_splice_lock when @@ -1812,7 +1812,7 @@ */ spin_lock(&jh_splice_lock); set_bit(BH_JBD, &bh->b_state); - bh->b_private = jh; + bh->b_journal_head = jh; jh->b_bh = bh; atomic_inc(&bh->b_count); spin_unlock(&jh_splice_lock); @@ -1821,7 +1821,7 @@ } jh->b_jcount++; spin_unlock(&journal_datalist_lock); - return bh->b_private; + return bh->b_journal_head; } /* @@ -1854,7 +1854,7 @@ J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); spin_lock(&jh_splice_lock); - bh->b_private = NULL; + bh->b_journal_head = NULL; jh->b_bh = NULL; /* debug, really */ clear_bit(BH_JBD, &bh->b_state); __brelse(bh); --- diff/include/linux/fs.h 2003-08-26 13:50:14.000000000 +0100 +++ source/include/linux/fs.h 2003-08-26 14:20:27.000000000 +0100 @@ -265,7 +265,7 @@ struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_private; /* reserved for b_end_io */ - + void *b_journal_head; /* ext3 journal_heads */ unsigned long b_rsector; /* Real buffer location on disk */ wait_queue_head_t b_wait; --- diff/include/linux/jbd.h 2003-06-16 09:56:12.000000000 +0100 +++ source/include/linux/jbd.h 2003-08-26 13:59:04.000000000 +0100 @@ -311,7 +311,7 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) { - return bh->b_private; + return bh->b_journal_head; } #define HAVE_JOURNAL_CALLBACK_STATUS --- diff/include/linux/vmalloc.h 2003-08-26 13:50:14.000000000 +0100 +++ source/include/linux/vmalloc.h 2003-08-26 14:20:27.000000000 +0100 @@ -29,6 +29,7 @@ extern void vmfree_area_pages(unsigned long address, unsigned long size); extern int vmalloc_area_pages(unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot); +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size); /* * Allocate any pages --- diff/kernel/ksyms.c 2003-08-26 13:50:14.000000000 +0100 +++ source/kernel/ksyms.c 2003-08-26 13:59:04.000000000 +0100 @@ -114,6 +114,7 @@ 
EXPORT_SYMBOL(__vmalloc); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vcalloc); EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); EXPORT_SYMBOL(max_mapnr); --- diff/mm/Makefile 2002-08-05 14:57:44.000000000 +0100 +++ source/mm/Makefile 2003-08-26 13:59:04.000000000 +0100 @@ -9,12 +9,12 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o memory.o page_alloc.o +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o mempool.o obj-$(CONFIG_HIGHMEM) += highmem.o --- diff/mm/filemap.c 2003-08-26 13:50:14.000000000 +0100 +++ source/mm/filemap.c 2003-08-26 14:03:41.000000000 +0100 @@ -1742,7 +1742,8 @@ } up(&inode->i_sem); up_read(&inode->i_alloc_sem); - UPDATE_ATIME(filp->f_dentry->d_inode); + if (!S_ISBLK(inode->i_mode)) + UPDATE_ATIME(filp->f_dentry->d_inode); goto out; } } @@ -3120,8 +3121,12 @@ goto out; remove_suid(inode); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - mark_inode_dirty_sync(inode); + + /* Don't update times for block devices using O_DIRECT */ + if (!(file->f_flags & O_DIRECT) || !S_ISBLK(inode->i_mode)) { + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + } do { unsigned long index, offset; --- diff/mm/vmalloc.c 2003-08-26 13:50:14.000000000 +0100 +++ source/mm/vmalloc.c 2003-08-26 13:59:04.000000000 +0100 @@ -374,3 +374,22 @@ read_unlock(&vmlist_lock); return buf - buf_start; } + +void *vcalloc(unsigned long nmemb, unsigned long elem_size) +{ + unsigned long size; + void *addr; + + /* + * Check that we're not going to overflow. + */ + if (nmemb > (ULONG_MAX / elem_size)) + return NULL; + + size = nmemb * elem_size; + addr = vmalloc(size); + if (addr) + memset(addr, 0, size); + + return addr; +} --- diff/arch/mips64/kernel/ioctl32.c.rej 1970-01-01 01:00:00.000000000 +0100 +++ source/arch/mips64/kernel/ioctl32.c.rej 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,16 @@ +*************** +*** 33,38 **** + #include + #include + #include + + #include + #undef __KERNEL__ /* This file was born to be ugly ... */ +--- 33,39 ---- + #include + #include + #include ++ #include + + #include + #undef __KERNEL__ /* This file was born to be ugly ... 
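+
+ * A note on vcalloc(), added to mm/vmalloc.c above: it is the
+ * overflow-checked, zero-filling counterpart of vmalloc() that the
+ * device-mapper code uses where it needs large, zero-initialised
+ * arrays.  A representative caller appears later in this patch, in
+ * dm_create_persistent():
+ *
+ *	ps->callbacks = vcalloc(ps->exceptions_per_area,
+ *				sizeof(*ps->callbacks));
+ *	if (!ps->callbacks) {
+ *		r = -ENOMEM;
+ *		goto bad;
+ *	}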
*/ --- diff/arch/s390x/kernel/ioctl32.c.rej 1970-01-01 01:00:00.000000000 +0100 +++ source/arch/s390x/kernel/ioctl32.c.rej 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,45 @@ +*************** +*** 25,30 **** + #include + #include + #include + #include + #include + #include +--- 25,31 ---- + #include + #include + #include ++ #include + #include + #include + #include +*************** +*** 508,513 **** + + IOCTL32_DEFAULT(SIOCGSTAMP), + + IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32), + IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf), + IOCTL32_HANDLER(SIOCGIFFLAGS, dev_ifsioc), +--- 509,528 ---- + + IOCTL32_DEFAULT(SIOCGSTAMP), + ++ IOCTL32_DEFAULT(DM_VERSION), ++ IOCTL32_DEFAULT(DM_REMOVE_ALL), ++ IOCTL32_DEFAULT(DM_DEV_CREATE), ++ IOCTL32_DEFAULT(DM_DEV_REMOVE), ++ IOCTL32_DEFAULT(DM_TABLE_LOAD), ++ IOCTL32_DEFAULT(DM_DEV_SUSPEND), ++ IOCTL32_DEFAULT(DM_DEV_RENAME), ++ IOCTL32_DEFAULT(DM_TABLE_DEPS), ++ IOCTL32_DEFAULT(DM_DEV_STATUS), ++ IOCTL32_DEFAULT(DM_TABLE_STATUS), ++ IOCTL32_DEFAULT(DM_DEV_WAIT), ++ IOCTL32_DEFAULT(DM_LIST_DEVICES), ++ IOCTL32_DEFAULT(DM_TABLE_CLEAR), ++ + IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32), + IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf), + IOCTL32_HANDLER(SIOCGIFFLAGS, dev_ifsioc), --- diff/drivers/md/dm-daemon.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-daemon.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#include "dm.h" +#include "dm-daemon.h" + +#include +#include + +static int daemon(void *arg) +{ + struct dm_daemon *dd = (struct dm_daemon *) arg; + DECLARE_WAITQUEUE(wq, current); + + daemonize(); + reparent_to_init(); + + /* block all signals */ + spin_lock_irq(¤t->sigmask_lock); + sigfillset(¤t->blocked); + flush_signals(current); + spin_unlock_irq(¤t->sigmask_lock); + + strcpy(current->comm, dd->name); + atomic_set(&dd->please_die, 0); + + add_wait_queue(&dd->job_queue, &wq); + + down(&dd->run_lock); + up(&dd->start_lock); + + /* + * dd->fn() could do anything, very likely it will + * suspend. So we can't set the state to + * TASK_INTERRUPTIBLE before calling it. In order to + * prevent a race with a waking thread we do this little + * dance with the dd->woken variable. + */ + while (1) { + do { + set_current_state(TASK_RUNNING); + + if (atomic_read(&dd->please_die)) + goto out; + + atomic_set(&dd->woken, 0); + dd->fn(); + yield(); + + set_current_state(TASK_INTERRUPTIBLE); + } while (atomic_read(&dd->woken)); + + schedule(); + } + + out: + remove_wait_queue(&dd->job_queue, &wq); + up(&dd->run_lock); + return 0; +} + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)) +{ + pid_t pid = 0; + + /* + * Initialise the dm_daemon. + */ + dd->fn = fn; + strncpy(dd->name, name, sizeof(dd->name) - 1); + sema_init(&dd->start_lock, 1); + sema_init(&dd->run_lock, 1); + init_waitqueue_head(&dd->job_queue); + + /* + * Start the new thread. + */ + down(&dd->start_lock); + pid = kernel_thread(daemon, dd, 0); + if (pid <= 0) { + DMERR("Failed to start kcopyd thread"); + return -EAGAIN; + } + + /* + * wait for the daemon to up this mutex. 
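+	 *
+	 * The handshake: this function holds start_lock across
+	 * kernel_thread(); the new thread takes run_lock and only then
+	 * releases start_lock, so the down()/up() pair below cannot
+	 * complete until the daemon has really started.  run_lock stays
+	 * held by the daemon until it exits, which is what
+	 * dm_daemon_stop() waits on.  Typical use by a client of this
+	 * interface (the daemon and function names here are illustrative):
+	 *
+	 *	static struct dm_daemon _kcopyd;
+	 *
+	 *	dm_daemon_start(&_kcopyd, "kcopyd", do_work);
+	 *	...
+	 *	dm_daemon_wake(&_kcopyd);	has the thread call do_work()
+	 *	...
+	 *	dm_daemon_stop(&_kcopyd);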
+ */ + down(&dd->start_lock); + up(&dd->start_lock); + + return 0; +} + +void dm_daemon_stop(struct dm_daemon *dd) +{ + atomic_set(&dd->please_die, 1); + dm_daemon_wake(dd); + down(&dd->run_lock); + up(&dd->run_lock); +} + +void dm_daemon_wake(struct dm_daemon *dd) +{ + atomic_set(&dd->woken, 1); + wake_up_interruptible(&dd->job_queue); +} + +EXPORT_SYMBOL(dm_daemon_start); +EXPORT_SYMBOL(dm_daemon_stop); +EXPORT_SYMBOL(dm_daemon_wake); --- diff/drivers/md/dm-daemon.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-daemon.h 2003-08-26 14:21:40.000000000 +0100 @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DAEMON_H +#define DM_DAEMON_H + +#include +#include + +struct dm_daemon { + void (*fn)(void); + char name[16]; + atomic_t please_die; + struct semaphore start_lock; + struct semaphore run_lock; + + atomic_t woken; + wait_queue_head_t job_queue; +}; + +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void)); +void dm_daemon_stop(struct dm_daemon *dd); +void dm_daemon_wake(struct dm_daemon *dd); +int dm_daemon_running(struct dm_daemon *dd); + +#endif --- diff/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-exception-store.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,675 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm-snapshot.h" +#include "dm-io.h" +#include "kcopyd.h" + +#include +#include +#include +#include + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +struct disk_header { + uint32_t magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + uint32_t valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + uint32_t version; + + /* In sectors */ + uint32_t chunk_size; +}; + +struct disk_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback)(void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. 
+ */ +struct pstore { + struct dm_snapshot *snap; /* up pointer to my snapshot */ + int version; + int valid; + uint32_t chunk_size; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + uint32_t current_area; + + /* + * The next free chunk for an exception. + */ + uint32_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; +}; + +static inline unsigned int sectors_to_pages(unsigned int sectors) +{ + return sectors / (PAGE_SIZE / SECTOR_SIZE); +} + +static int alloc_area(struct pstore *ps) +{ + int r = -ENOMEM; + size_t i, len, nr_pages; + struct page *page, *last = NULL; + + len = ps->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + return r; + + nr_pages = sectors_to_pages(ps->chunk_size); + + /* + * We lock the pages for ps->area into memory since + * they'll be doing a lot of io. We also chain them + * together ready for dm-io. + */ + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + LockPage(page); + if (last) + last->list.next = &page->list; + last = page; + } + + return 0; +} + +static void free_area(struct pstore *ps) +{ + size_t i, nr_pages; + struct page *page; + + nr_pages = sectors_to_pages(ps->chunk_size); + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + page->list.next = NULL; + UnlockPage(page); + } + + vfree(ps->area); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) +{ + struct io_region where; + unsigned int bits; + + where.dev = ps->snap->cow->dev; + where.sector = ps->chunk_size * chunk; + where.count = ps->chunk_size; + + return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits); +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. 
+ */ +static int area_io(struct pstore *ps, uint32_t area, int rw) +{ + int r; + uint32_t chunk; + + /* convert a metadata area index to a chunk index */ + chunk = 1 + ((ps->exceptions_per_area + 1) * area); + + r = chunk_io(ps, chunk, rw); + if (r) + return r; + + ps->current_area = area; + return 0; +} + +static int zero_area(struct pstore *ps, uint32_t area) +{ + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + return area_io(ps, area, WRITE); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + + r = chunk_io(ps, 0, READ); + if (r) + return r; + + dh = (struct disk_header *) ps->area; + + if (le32_to_cpu(dh->magic) == 0) { + *new_snapshot = 1; + + } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) { + *new_snapshot = 0; + ps->valid = le32_to_cpu(dh->valid); + ps->version = le32_to_cpu(dh->version); + ps->chunk_size = le32_to_cpu(dh->chunk_size); + + } else { + DMWARN("Invalid/corrupt snapshot"); + r = -ENXIO; + } + + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + + dh = (struct disk_header *) ps->area; + dh->magic = cpu_to_le32(SNAP_MAGIC); + dh->valid = cpu_to_le32(ps->valid); + dh->version = cpu_to_le32(ps->version); + dh->chunk_size = cpu_to_le32(ps->chunk_size); + + return chunk_io(ps, 0, WRITE); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. + */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ + if (index >= ps->exceptions_per_area) + return NULL; + + return ((struct disk_exception *) ps->area) + index; +} + +static int read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + result->old_chunk = le64_to_cpu(e->old_chunk); + result->new_chunk = le64_to_cpu(e->new_chunk); + + return 0; +} + +static int write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + e->old_chunk = cpu_to_le64(de->old_chunk); + e->new_chunk = cpu_to_le64(de->new_chunk); + + return 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, int *full) +{ + int r; + unsigned int i; + struct disk_exception de; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + r = read_exception(ps, i, &de); + + if (r) + return r; + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (de.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= de.new_chunk) + ps->next_free = de.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps) +{ + uint32_t area; + int r, full = 1; + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. 
+ */ + for (area = 0; full; area++) { + r = area_io(ps, area, READ); + if (r) + return r; + + r = insert_exceptions(ps, &full); + if (r) + return r; + + area++; + } + + return 0; +} + +static inline struct pstore *get_info(struct exception_store *store) +{ + return (struct pstore *) store->context; +} + +static void persistent_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = get_info(store)->next_free * store->snap->chunk_size; + *denominator = get_dev_size(store->snap->cow->dev); +} + +static void persistent_destroy(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + dm_io_put(sectors_to_pages(ps->chunk_size)); + vfree(ps->callbacks); + free_area(ps); + kfree(ps); +} + +static int persistent_read_metadata(struct exception_store *store) +{ + int r, new_snapshot; + struct pstore *ps = get_info(store); + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + return r; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + return r; + } + + r = zero_area(ps, 0); + if (r) { + DMWARN("zero_area(0) failed"); + return r; + } + + } else { + /* + * Sanity checks. + */ + if (!ps->valid) { + DMWARN("snapshot is marked invalid"); + return -EINVAL; + } + + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + return -EINVAL; + } + + /* + * Read the metadata. + */ + r = read_exceptions(ps); + if (r) + return r; + } + + return 0; +} + +static int persistent_prepare(struct exception_store *store, + struct exception *e) +{ + struct pstore *ps = get_info(store); + uint32_t stride; + sector_t size = get_dev_size(store->snap->cow->dev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. + */ + stride = (ps->exceptions_per_area + 1); + if ((++ps->next_free % stride) == 1) + ps->next_free++; + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + int r; + unsigned int i; + struct pstore *ps = get_info(store); + struct disk_exception de; + struct commit_callback *cb; + + de.old_chunk = e->old_chunk; + de.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &de); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are no more exceptions in flight, or we have + * filled this metadata area we commit the exceptions to + * disk. + */ + if (atomic_dec_and_test(&ps->pending_count) || + (ps->current_committed == ps->exceptions_per_area)) { + r = area_io(ps, ps->current_area, WRITE); + if (r) + ps->valid = 0; + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, r == 0 ? 1 : 0); + } + + ps->callback_count = 0; + } + + /* + * Have we completely filled the current area ? 
+ */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } +} + +static void persistent_drop(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +{ + int r; + struct pstore *ps; + + r = dm_io_get(sectors_to_pages(chunk_size)); + if (r) + return r; + + /* allocate the pstore */ + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) { + r = -ENOMEM; + goto bad; + } + + ps->snap = store->snap; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->chunk_size = chunk_size; + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->next_free = 2; /* skipping the header and first area */ + ps->current_committed = 0; + + r = alloc_area(ps); + if (r) + goto bad; + + /* + * Allocate space for all the callbacks. + */ + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + + if (!ps->callbacks) { + r = -ENOMEM; + goto bad; + } + + store->destroy = persistent_destroy; + store->read_metadata = persistent_read_metadata; + store->prepare_exception = persistent_prepare; + store->commit_exception = persistent_commit; + store->drop_snapshot = persistent_drop; + store->fraction_full = persistent_fraction_full; + store->context = ps; + + return 0; + + bad: + dm_io_put(sectors_to_pages(chunk_size)); + if (ps) { + if (ps->callbacks) + vfree(ps->callbacks); + + kfree(ps); + } + return r; +} + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. 
+ *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +void transient_destroy(struct exception_store *store) +{ + kfree(store->context); +} + +int transient_read_metadata(struct exception_store *store) +{ + return 0; +} + +int transient_prepare(struct exception_store *store, struct exception *e) +{ + struct transient_c *tc = (struct transient_c *) store->context; + sector_t size = get_dev_size(store->snap->cow->dev); + + if (size < (tc->next_free + store->snap->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store->snap, tc->next_free); + tc->next_free += store->snap->chunk_size; + + return 0; +} + +void transient_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static void transient_fraction_full(struct exception_store *store, + sector_t *numerator, sector_t *denominator) +{ + *numerator = ((struct transient_c *) store->context)->next_free; + *denominator = get_dev_size(store->snap->cow->dev); +} + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize) +{ + struct transient_c *tc; + + memset(store, 0, sizeof(*store)); + store->destroy = transient_destroy; + store->read_metadata = transient_read_metadata; + store->prepare_exception = transient_prepare; + store->commit_exception = transient_commit; + store->fraction_full = transient_fraction_full; + store->snap = s; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} --- diff/drivers/md/dm-io.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-io.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#include "dm-io.h" + +#include +#include +#include +#include + +/* FIXME: can we shrink this ? */ +struct io_context { + int rw; + unsigned int error; + atomic_t count; + struct task_struct *sleeper; + io_notify_fn callback; + void *context; +}; + +/* + * We maintain a pool of buffer heads for dispatching the io. + */ +static unsigned int _num_bhs; +static mempool_t *_buffer_pool; + +/* + * io contexts are only dynamically allocated for asynchronous + * io. Since async io is likely to be the majority of io we'll + * have the same number of io contexts as buffer heads ! (FIXME: + * must reduce this). + */ +mempool_t *_io_pool; + +static void *alloc_bh(int gfp_mask, void *pool_data) +{ + struct buffer_head *bh; + + bh = kmem_cache_alloc(bh_cachep, gfp_mask); + if (bh) { + bh->b_reqnext = NULL; + init_waitqueue_head(&bh->b_wait); + INIT_LIST_HEAD(&bh->b_inode_buffers); + } + + return bh; +} + +static void *alloc_io(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct io_context), gfp_mask); +} + +static void free_io(void *element, void *pool_data) +{ + kfree(element); +} + +static unsigned int pages_to_buffers(unsigned int pages) +{ + return 4 * pages; /* too many ? 
*/ +} + +static int resize_pool(unsigned int new_bhs) +{ + int r = 0; + + if (_buffer_pool) { + if (new_bhs == 0) { + /* free off the pools */ + mempool_destroy(_buffer_pool); + mempool_destroy(_io_pool); + _buffer_pool = _io_pool = NULL; + } else { + /* resize the pools */ + r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL); + if (!r) + r = mempool_resize(_io_pool, + new_bhs, GFP_KERNEL); + } + } else { + /* create new pools */ + _buffer_pool = mempool_create(new_bhs, alloc_bh, + mempool_free_slab, bh_cachep); + if (!_buffer_pool) + r = -ENOMEM; + + _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL); + if (!_io_pool) { + mempool_destroy(_buffer_pool); + _buffer_pool = NULL; + r = -ENOMEM; + } + } + + if (!r) + _num_bhs = new_bhs; + + return r; +} + +int dm_io_get(unsigned int num_pages) +{ + return resize_pool(_num_bhs + pages_to_buffers(num_pages)); +} + +void dm_io_put(unsigned int num_pages) +{ + resize_pool(_num_bhs - pages_to_buffers(num_pages)); +} + +/*----------------------------------------------------------------- + * We need to keep track of which region a buffer is doing io + * for. In order to save a memory allocation we store this in an + * unused field of the buffer head, and provide these access + * functions. + * + * FIXME: add compile time check that an unsigned int can fit + * into a pointer. + * + *---------------------------------------------------------------*/ +static inline void bh_set_region(struct buffer_head *bh, unsigned int region) +{ + bh->b_journal_head = (void *) region; +} + +static inline int bh_get_region(struct buffer_head *bh) +{ + return (unsigned int) bh->b_journal_head; +} + +/*----------------------------------------------------------------- + * We need an io object to keep track of the number of bhs that + * have been dispatched for a particular io. + *---------------------------------------------------------------*/ +static void dec_count(struct io_context *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error); + + if (atomic_dec_and_test(&io->count)) { + if (io->sleeper) + wake_up_process(io->sleeper); + + else { + int r = io->error; + io_notify_fn fn = io->callback; + void *context = io->context; + + mempool_free(io, _io_pool); + fn(r, context); + } + } +} + +static void endio(struct buffer_head *bh, int uptodate) +{ + struct io_context *io = (struct io_context *) bh->b_private; + + if (!uptodate && io->rw != WRITE) { + /* + * We need to zero this region, otherwise people + * like kcopyd may write the arbitrary contents + * of the page. + */ + memset(bh->b_data, 0, bh->b_size); + } + + dec_count((struct io_context *) bh->b_private, + bh_get_region(bh), !uptodate); + mempool_free(bh, _buffer_pool); +} + +/* + * Primitives for alignment calculations. + */ +int fls(unsigned n) +{ + return generic_fls32(n); +} + +static inline int log2_floor(unsigned n) +{ + return ffs(n) - 1; +} + +static inline int log2_align(unsigned n) +{ + return fls(n) - 1; +} + +/* + * Returns the next block for io. 
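+ *
+ * Each buffer head covers as many device blocks as the alignment of
+ * the current block number and the page allow: log2_floor(b) gives
+ * the largest power of two dividing b, so a region starting at block
+ * 6 gets a 2-block bh (blocks 6-7), the next bh starts at the
+ * 8-aligned block 8 and may cover up to blocks_per_page blocks, and
+ * so on.  The region index rides in b_journal_head (bh_set_region()
+ * above); that field is free here because these buffer heads are
+ * private to dm-io and never handed to the journalling code, while
+ * b_private carries the io_context used by endio().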
+ */ +static int do_page(kdev_t dev, sector_t *block, sector_t end_block, + unsigned int block_size, + struct page *p, unsigned int offset, + unsigned int region, struct io_context *io) +{ + struct buffer_head *bh; + sector_t b = *block; + sector_t blocks_per_page = PAGE_SIZE / block_size; + unsigned int this_size; /* holds the size of the current io */ + unsigned int len; + + while ((offset < PAGE_SIZE) && (b != end_block)) { + bh = mempool_alloc(_buffer_pool, GFP_NOIO); + init_buffer(bh, endio, io); + bh_set_region(bh, region); + + /* + * Block size must be a power of 2 and aligned + * correctly. + */ + len = end_block - b; + this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page); + if (this_size > len) + this_size = 1 << log2_align(len); + + /* + * Add in the job offset. + */ + bh->b_blocknr = (b / this_size); + bh->b_size = block_size * this_size; + set_bh_page(bh, p, offset); + bh->b_this_page = bh; + + bh->b_dev = dev; + atomic_set(&bh->b_count, 1); + + bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) | + (1 << BH_Lock)); + + if (io->rw == WRITE) + clear_bit(BH_Dirty, &bh->b_state); + + atomic_inc(&io->count); + submit_bh(io->rw, bh); + + b += this_size; + offset += block_size * this_size; + } + + *block = b; + return (b == end_block); +} + +static void do_region(unsigned int region, struct io_region *where, + struct page *page, unsigned int offset, + struct io_context *io) +{ + unsigned int block_size = get_hardsect_size(where->dev); + unsigned int sblock_size = block_size >> 9; + sector_t block = where->sector / sblock_size; + sector_t end_block = (where->sector + where->count) / sblock_size; + + while (1) { + if (do_page(where->dev, &block, end_block, block_size, + page, offset, region, io)) + break; + + offset = 0; /* only offset the first page */ + + page = list_entry(page->list.next, struct page, list); + } +} + +static void dispatch_io(unsigned int num_regions, struct io_region *where, + struct page *pages, unsigned int offset, + struct io_context *io) +{ + int i; + + for (i = 0; i < num_regions; i++) + if (where[i].count) + do_region(i, where + i, pages, offset, io); + + /* + * Drop the extra refence that we were holding to avoid + * the io being completed too early. + */ + dec_count(io, 0, 0); +} + +/* + * Synchronous io + */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, + int rw, struct page *pages, unsigned int offset, + unsigned int *error_bits) +{ + struct io_context io; + + BUG_ON(num_regions > 1 && rw != WRITE); + + io.rw = rw; + io.error = 0; + atomic_set(&io.count, 1); /* see dispatch_io() */ + io.sleeper = current; + + dispatch_io(num_regions, where, pages, offset, &io); + run_task_queue(&tq_disk); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!atomic_read(&io.count)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + *error_bits = io.error; + return io.error ? 
-EIO : 0; +} + +/* + * Asynchronous io + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context) +{ + struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO); + + io->rw = rw; + io->error = 0; + atomic_set(&io->count, 1); /* see dispatch_io() */ + io->sleeper = NULL; + io->callback = fn; + io->context = context; + + dispatch_io(num_regions, where, pages, offset, io); + return 0; +} + +EXPORT_SYMBOL(dm_io_get); +EXPORT_SYMBOL(dm_io_put); +EXPORT_SYMBOL(dm_io_sync); +EXPORT_SYMBOL(dm_io_async); --- diff/drivers/md/dm-io.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-io.h 2003-08-26 14:21:35.000000000 +0100 @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef _DM_IO_H +#define _DM_IO_H + +#include "dm.h" + +#include + +/* Move these to bitops.h eventually */ +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */ +/* (c) 2002, D.Phillips and Sistina Software */ +/* Licensed under Version 2 of the GPL */ + +static unsigned generic_fls8(unsigned n) +{ + return n & 0xf0 ? + n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5: + n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2); +} + +static inline unsigned generic_fls16(unsigned n) +{ + return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n); +} + +static inline unsigned generic_fls32(unsigned n) +{ + return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n); +} + +/* FIXME make this configurable */ +#define DM_MAX_IO_REGIONS 8 + +struct io_region { + kdev_t dev; + sector_t sector; + sector_t count; +}; + + +/* + * 'error' is a bitset, with each bit indicating whether an error + * occurred doing io to the corresponding region. + */ +typedef void (*io_notify_fn)(unsigned int error, void *context); + + +/* + * Before anyone uses the IO interface they should call + * dm_io_get(), specifying roughly how many pages they are + * expecting to perform io on concurrently. + * + * This function may block. + */ +int dm_io_get(unsigned int num_pages); +void dm_io_put(unsigned int num_pages); + + +/* + * Synchronous IO. + * + * Please ensure that the rw flag in the next two functions is + * either READ or WRITE, ie. we don't take READA. Any + * regions with a zero count field will be ignored. + */ +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + unsigned int *error_bits); + + +/* + * Aynchronous IO. + * + * The 'where' array may be safely allocated on the stack since + * the function takes a copy. + */ +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw, + struct page *pages, unsigned int offset, + io_notify_fn fn, void *context); + +#endif --- diff/drivers/md/dm-ioctl.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,1272 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DM_DRIVER_EMAIL "dm@uk.sistina.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. 
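+ *
+ * Each device gets one hash_cell, linked into the name-keyed table
+ * and, when a uuid was supplied, into the uuid-keyed table as well.
+ * Lookups prefer the uuid if the caller gave one, as in
+ * __find_device_hash_cell() below:
+ *
+ *	return *param->uuid ? __get_uuid_cell(param->uuid)
+ *			    : __get_name_cell(param->name);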
+ *---------------------------------------------------------------*/ +struct hash_cell { + struct list_head name_list; + struct list_head uuid_list; + + char *name; + char *uuid; + struct mapped_device *md; + struct dm_table *new_map; + + /* I hate devfs */ + devfs_handle_t devfs_entry; +}; + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static devfs_handle_t _dev_dir; +void dm_hash_remove_all(void); + +/* + * Guards access to both hash tables. + */ +static DECLARE_RWSEM(_hash_lock); + +static void init_buckets(struct list_head *buckets) +{ + unsigned int i; + + for (i = 0; i < NUM_BUCKETS; i++) + INIT_LIST_HEAD(buckets + i); +} + +int dm_hash_init(void) +{ + init_buckets(_name_buckets); + init_buckets(_uuid_buckets); + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL); + return 0; +} + +void dm_hash_exit(void) +{ + dm_hash_remove_all(); + devfs_unregister(_dev_dir); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ + const unsigned int hash_mult = 2654435387U; + unsigned int h = 0; + + while (*str) + h = (h + (unsigned int) *str++) * hash_mult; + + return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _name_buckets + h) { + hc = list_entry(tmp, struct hash_cell, name_list); + if (!strcmp(hc->name, str)) + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _uuid_buckets + h) { + hc = list_entry(tmp, struct hash_cell, uuid_list); + if (!strcmp(hc->uuid, str)) + return hc; + } + + return NULL; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. + *---------------------------------------------------------------*/ +static inline char *kstrdup(const char *str) +{ + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); + if (r) + strcpy(r, str); + return r; +} + +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + INIT_LIST_HEAD(&hc->name_list); + INIT_LIST_HEAD(&hc->uuid_list); + hc->md = md; + hc->new_map = NULL; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * devfs stuff. 
+ */ +static int register_with_devfs(struct hash_cell *hc) +{ + kdev_t dev = dm_kdev(hc->md); + + hc->devfs_entry = + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER, + major(dev), minor(dev), + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, + &dm_blk_dops, NULL); + + return 0; +} + +static int unregister_with_devfs(struct hash_cell *hc) +{ + devfs_unregister(hc->devfs_entry); + return 0; +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into both hash tables. + */ + down_write(&_hash_lock); + if (__get_name_cell(name)) + goto bad; + + list_add(&cell->name_list, _name_buckets + hash_str(name)); + + if (uuid) { + if (__get_uuid_cell(uuid)) { + list_del(&cell->name_list); + goto bad; + } + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + } + register_with_devfs(cell); + dm_get(md); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +void __hash_remove(struct hash_cell *hc) +{ + /* remove from the dev hash */ + list_del(&hc->uuid_list); + list_del(&hc->name_list); + unregister_with_devfs(hc); + dm_put(hc->md); + if (hc->new_map) + dm_table_put(hc->new_map); + free_cell(hc); +} + +void dm_hash_remove_all(void) +{ + int i; + struct hash_cell *hc; + struct list_head *tmp, *n; + + down_write(&_hash_lock); + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_safe (tmp, n, _name_buckets + i) { + hc = list_entry(tmp, struct hash_cell, name_list); + __hash_remove(hc); + } + } + up_write(&_hash_lock); +} + +int dm_hash_rename(const char *old, const char *new) +{ + char *new_name, *old_name; + struct hash_cell *hc; + + /* + * duplicate new. + */ + new_name = kstrdup(new); + if (!new_name) + return -ENOMEM; + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + hc = __get_name_cell(new); + if (hc) { + DMWARN("asked to rename to an already existing name %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -EBUSY; + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(old); + if (!hc) { + DMWARN("asked to rename a non existent device %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -ENXIO; + } + + /* + * rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + hc->name = new_name; + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + /* rename the device node in devfs */ + unregister_with_devfs(hc); + register_with_devfs(hc); + + up_write(&_hash_lock); + kfree(old_name); + return 0; +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct dm_ioctl *param, size_t param_size) +{ + dm_hash_remove_all(); + param->data_size = 0; + return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. 
+ */ +#define ALIGN_MASK 7 +static inline void *align_ptr(void *ptr) +{ + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. + */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, + size_t *len) +{ + param->data_start = align_ptr(param + 1) - (void *) param; + + if (param->data_start < param_size) + *len = param_size - param->data_start; + else + *len = 0; + + return ((void *) param) + param->data_start; +} + +static int list_devices(struct dm_ioctl *param, size_t param_size) +{ + unsigned int i; + struct hash_cell *hc; + size_t len, needed = 0; + struct dm_name_list *nl, *old_nl = NULL; + + down_write(&_hash_lock); + + /* + * Loop through all the devices working out how much + * space we need. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name); + needed += ALIGN_MASK; + } + } + + /* + * Grab our output buffer. + */ + nl = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + nl->dev = 0; /* Flags no data */ + + /* + * Now loop through filling out the names. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + + nl->dev = dm_kdev(hc->md); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); + } + } + + out: + up_write(&_hash_lock); + return 0; +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMWARN("invalid device name"); + return -EINVAL; + } + + return 0; +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ + kdev_t dev = dm_kdev(md); + struct dm_table *table; + struct block_device *bdev; + + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | + DM_ACTIVE_PRESENT_FLAG); + + if (dm_suspended(md)) + param->flags |= DM_SUSPEND_FLAG; + + param->dev = kdev_t_to_nr(dev); + + if (is_read_only(dev)) + param->flags |= DM_READONLY_FLAG; + + param->event_nr = dm_get_event_nr(md); + + table = dm_get_table(md); + if (table) { + param->flags |= DM_ACTIVE_PRESENT_FLAG; + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + } else + param->target_count = 0; + + bdev = bdget(param->dev); + if (!bdev) + return -ENXIO; + param->open_count = bdev->bd_openers; + bdput(bdev); + + return 0; +} + +static int dev_create(struct dm_ioctl *param, size_t param_size) +{ + int r; + kdev_t dev = 0; + struct mapped_device *md; + + r = check_name(param->name); + if (r) + return r; + + if (param->flags & DM_PERSISTENT_DEV_FLAG) + dev = to_kdev_t(param->dev); + + r = dm_create(dev, &md); + if (r) + return r; + + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + if (r) { + dm_put(md); + return r; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(md, param); + dm_put(md); + + return r; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name. + */ +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ + return *param->uuid ? 
+ __get_uuid_cell(param->uuid) : __get_name_cell(param->name); +} + +static inline struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = __find_device_hash_cell(param); + if (hc) { + md = hc->md; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strncpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1); + else + param->uuid[0] = '\0'; + + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + dm_get(md); + } + up_read(&_hash_lock); + + return md; +} + +static int dev_remove(struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + __hash_remove(hc); + up_write(&_hash_lock); + param->data_size = 0; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str < end) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int dev_rename(struct dm_ioctl *param, size_t param_size) +{ + int r; + char *new_name = (char *) param + param->data_start; + + if (new_name < (char *) (param + 1) || + invalid_str(new_name, (void *) param + param_size)) { + DMWARN("Invalid new logical volume name supplied."); + return -EINVAL; + } + + r = check_name(new_name); + if (r) + return r; + + param->data_size = 0; + return dm_hash_rename(param->name, new_name); +} + +static int suspend(struct dm_ioctl *param) +{ + int r = 0; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (!dm_suspended(md)) + r = dm_suspend(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +static int resume(struct dm_ioctl *param) +{ + int r = 0; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *new_map; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + dm_get(md); + + new_map = hc->new_map; + hc->new_map = NULL; + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + up_write(&_hash_lock); + + /* Do we need to load a new map ? */ + if (new_map) { + /* Suspend if it isn't already suspended */ + if (!dm_suspended(md)) + dm_suspend(md); + + r = dm_swap_table(md, new_map); + if (r) { + dm_put(md); + dm_table_put(new_map); + return r; + } + + if (dm_table_get_mode(new_map) & FMODE_WRITE) + set_device_ro(dm_kdev(md), 0); + else + set_device_ro(dm_kdev(md), 1); + + dm_table_put(new_map); + } + + if (dm_suspended(md)) + r = dm_resume(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. + */ +static int dev_suspend(struct dm_ioctl *param, size_t param_size) +{ + if (param->flags & DM_SUSPEND_FLAG) + return suspend(param); + + return resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. 
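+ *
+ * A note on dev_suspend()/resume() above: a table loaded with
+ * DM_TABLE_LOAD only sits in hc->new_map until the device is resumed,
+ * so the usual activation sequence from user space is roughly
+ *
+ *	ioctl(ctl_fd, DM_DEV_CREATE, io);	new, empty device
+ *	ioctl(ctl_fd, DM_TABLE_LOAD, io);	table parked in hc->new_map
+ *	ioctl(ctl_fd, DM_DEV_SUSPEND, io);	DM_SUSPEND_FLAG clear, so
+ *						resume() swaps the table in
+ *
+ * where ctl_fd and io stand in for the caller's control-device fd and
+ * a suitably filled-in struct dm_ioctl.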
+ */ +static int dev_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + dm_put(md); + return r; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + DECLARE_WAITQUEUE(wq, current); + + md = find_device(param); + if (!md) + return -ENXIO; + + /* + * Wait for a notification event + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!dm_add_wait_queue(md, &wq, param->event_nr)) { + schedule(); + dm_remove_wait_queue(md, &wq); + } + set_current_state(TASK_RUNNING); + + /* + * The userland program is going to want to know what + * changed to trigger the event, so we may as well tell + * him and save an ioctl. + */ + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +static inline int get_mode(struct dm_ioctl *param) +{ + int mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, + struct dm_target_spec **spec, char **target_params) +{ + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); + *target_params = (char *) (*spec + 1); + + if (*spec < (last + 1)) + return -EINVAL; + + return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + int r; + unsigned int i = 0; + struct dm_target_spec *spec = (struct dm_target_spec *) param; + uint32_t next = param->data_start; + void *end = (void *) param + param_size; + char *target_params; + + if (!param->target_count) { + DMWARN("populate_table: no targets specified"); + return -EINVAL; + } + + for (i = 0; i < param->target_count; i++) { + + r = next_target(spec, next, end, &spec, &target_params); + if (r) { + DMWARN("unable to find target"); + return r; + } + + r = dm_table_add_target(table, spec->target_type, + (sector_t) spec->sector_start, + (sector_t) spec->length, + target_params); + if (r) { + DMWARN("error adding target to table"); + return r; + } + + next = spec->next; + } + + return dm_table_complete(table); +} + +static int table_load(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + struct dm_table *t; + + r = dm_table_create(&t, get_mode(param)); + if (r) + return r; + + r = populate_table(t, param, param_size); + if (r) { + dm_table_put(t); + return r; + } + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + hc->new_map = t; + param->flags |= DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +static int table_clear(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) { + dm_table_put(hc->new_map); + hc->new_map = NULL; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +/* + * Retrieves a list of devices used by a particular dm device. 
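+ *
+ * get_result_buffer() places the payload at the first 8 byte aligned
+ * offset after the struct dm_ioctl header, and retrieve_deps() below
+ * fills it in as a count followed by one device number per underlying
+ * device.  A rough, illustrative view of the buffer on return:
+ *
+ *	| struct dm_ioctl | pad | count | dev 0 | dev 1 | ... |
+ *	                        ^ data_start
+ *
+ * data_size is then set to data_start + needed, or DM_BUFFER_FULL_FLAG
+ * is raised if the caller's buffer was too small.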
+ */ +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int count = 0; + struct list_head *tmp; + size_t len, needed; + struct dm_target_deps *deps; + + deps = get_result_buffer(param, param_size, &len); + + /* + * Count the devices. + */ + list_for_each(tmp, dm_table_get_devices(table)) + count++; + + /* + * Check we have enough space. + */ + needed = sizeof(*deps) + (sizeof(*deps->dev) * count); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + return; + } + + /* + * Fill in the devices. + */ + deps->count = count; + count = 0; + list_for_each(tmp, dm_table_get_devices(table)) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + deps->dev[count++] = dd->bdev->bd_dev; + } + + param->data_size = param->data_start + needed; +} + +static int table_deps(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_deps(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + remaining = len - (outptr - outbuf); + if (remaining < sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (ti->type->status(ti, type, outptr, remaining)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + } else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + used = param->data_start + (outptr - outbuf); + + align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. 
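+ *
+ * A minimal sketch of how userland might drive this interface.  The
+ * /dev/mapper/control path and the 64k buffer size are only
+ * conventional choices, not anything defined in this file; the misc
+ * device itself is registered in dm_interface_init() below and only
+ * CAP_SYS_ADMIN may use it:
+ *
+ *	struct dm_ioctl *dmi = malloc(64 * 1024);
+ *
+ *	memset(dmi, 0, 64 * 1024);
+ *	dmi->version[0] = DM_VERSION_MAJOR;
+ *	dmi->version[1] = DM_VERSION_MINOR;
+ *	dmi->version[2] = DM_VERSION_PATCHLEVEL;
+ *	dmi->data_size = 64 * 1024;
+ *	strncpy(dmi->name, "vol0", sizeof(dmi->name));
+ *
+ *	fd = open("/dev/mapper/control", O_RDWR);
+ *	ioctl(fd, DM_DEV_CREATE, dmi);
+ *
+ * copy_params() insists data_size covers at least the struct dm_ioctl
+ * header, validate_params() insists on a name or uuid (but not both)
+ * for most commands, and any result is copied back into the same
+ * buffer.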
+ *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, remove_all}, + {DM_LIST_DEVICES_CMD, list_devices}, + + {DM_DEV_CREATE_CMD, dev_create}, + {DM_DEV_REMOVE_CMD, dev_remove}, + {DM_DEV_RENAME_CMD, dev_rename}, + {DM_DEV_SUSPEND_CMD, dev_suspend}, + {DM_DEV_STATUS_CMD, dev_status}, + {DM_DEV_WAIT_CMD, dev_wait}, + + {DM_TABLE_LOAD_CMD, table_load}, + {DM_TABLE_CLEAR_CMD, table_clear}, + {DM_TABLE_DEPS_CMD, table_deps}, + {DM_TABLE_STATUS_CMD, table_status} + }; + + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(unsigned int cmd, struct dm_ioctl *user) +{ + uint32_t version[3]; + int r = 0; + + if (copy_from_user(version, user->version, sizeof(version))) + return -EFAULT; + + if ((DM_VERSION_MAJOR != version[0]) || + (DM_VERSION_MINOR < version[1])) { + DMWARN("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. + */ + version[0] = DM_VERSION_MAJOR; + version[1] = DM_VERSION_MINOR; + version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, version, sizeof(version))) + return -EFAULT; + + return r; +} + +static void free_params(struct dm_ioctl *param) +{ + vfree(param); +} + +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param) +{ + struct dm_ioctl tmp, *dmi; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (tmp.data_size < sizeof(tmp)) + return -EINVAL; + + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); + if (!dmi) + return -ENOMEM; + + if (copy_from_user(dmi, user, tmp.data_size)) { + vfree(dmi); + return -EFAULT; + } + + *param = dmi; + return 0; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Always clear this flag */ + param->flags &= ~DM_BUFFER_FULL_FLAG; + + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD) + return 0; + + /* Unless creating, either name or uuid but not both */ + if (cmd != DM_DEV_CREATE_CMD) { + if ((!*param->uuid && !*param->name) || + (*param->uuid && *param->name)) { + DMWARN("one of name or uuid must be supplied, cmd(%u)", + cmd); + return -EINVAL; + } + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int r = 0; + unsigned int cmd; + struct dm_ioctl *param; + struct dm_ioctl *user = (struct dm_ioctl *) u; + ioctl_fn fn = NULL; + size_t param_size; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user); + if (r) + return r; + + /* + * Nothing more to do for the version command. 
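+	 * check_version() above has already copied the kernel's
+	 * {major, minor, patchlevel} triple back to userland, and only
+	 * a differing major number or a user minor newer than the
+	 * kernel's is rejected, so DM_VERSION is a pure handshake.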
+ */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd); + if (!fn) { + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * FIXME: I don't like this, we're trying to avoid low + * memory issues when a device is suspended. + */ + current->flags |= PF_MEMALLOC; + + /* + * Copy the parameters into kernel space. + */ + r = copy_params(user, ¶m); + if (r) { + current->flags &= ~PF_MEMALLOC; + return r; + } + + r = validate_params(cmd, param); + if (r) + goto out; + + param_size = param->data_size; + param->data_size = sizeof(*param); + r = fn(param, param_size); + + /* + * Copy the results back to userland. + */ + if (!r && copy_to_user(user, param, param->data_size)) + r = -EFAULT; + + out: + free_params(param); + current->flags &= ~PF_MEMALLOC; + return r; +} + +static struct file_operations _ctl_fops = { + .ioctl = ctl_ioctl, + .owner = THIS_MODULE, +}; + +static devfs_handle_t _ctl_handle; + +static struct miscdevice _dm_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DM_NAME, + .fops = &_ctl_fops +}; + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ + int r; + char rname[64]; + + r = dm_hash_init(); + if (r) + return r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + dm_hash_exit(); + return r; + } + + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3, + sizeof rname - 3); + if (r == -ENOSYS) + goto done; /* devfs not present */ + + if (r < 0) { + DMERR("devfs_generate_path failed for control device"); + goto failed; + } + + strncpy(rname + r, "../", 3); + r = devfs_mk_symlink(NULL, DM_DIR "/control", + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL); + if (r) { + DMERR("devfs_mk_symlink failed for control device"); + goto failed; + } + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle); + + done: + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; + + failed: + misc_deregister(&_dm_misc); + dm_hash_exit(); + return r; +} + +void dm_interface_exit(void) +{ + if (misc_deregister(&_dm_misc) < 0) + DMERR("misc_deregister failed for control device"); + + dm_hash_exit(); +} --- diff/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-linear.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +/* + * Linear: maps a linear range of a device. 
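+ *
+ * The constructor takes two arguments, <device path> <start sector>,
+ * so a table line as userland tools typically express it looks
+ * something like this (device and numbers purely illustrative):
+ *
+ *	0 2097152 linear /dev/hdb1 384
+ *
+ * i.e. map sectors 0-2097151 of the dm device onto /dev/hdb1 starting
+ * at sector 384.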
+ */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct linear_c *lc; + + if (argc != 2) { + ti->error = "dm-linear: Not enough arguments"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "dm-linear: Cannot allocate linear context"; + return -ENOMEM; + } + + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { + ti->error = "dm-linear: Invalid device sector"; + goto bad; + } + + if (dm_get_device(ti, argv[0], lc->start, ti->len, + dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "dm-linear: Device lookup failed"; + goto bad; + } + + ti->private = lc; + return 0; + + bad: + kfree(lc); + return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + bh->b_rdev = lc->dev->dev; + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin); + + return 1; +} + +static int linear_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + kdev_t kdev; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + kdev = to_kdev_t(lc->dev->bdev->bd_dev); + snprintf(result, maxlen, "%s " SECTOR_FORMAT, + dm_kdevname(kdev), lc->start); + break; + } + return 0; +} + +static struct target_type linear_target = { + .name = "linear", + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("linear: register failed %d", r); + + return r; +} + +void dm_linear_exit(void) +{ + int r = dm_unregister_target(&linear_target); + + if (r < 0) + DMERR("linear: unregister failed %d", r); +} --- diff/drivers/md/dm-log.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-log.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,302 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. 
+ */ + +#include +#include +#include +#include + +#include "dm-log.h" +#include "dm-io.h" + +static LIST_HEAD(_log_types); +static spinlock_t _lock = SPIN_LOCK_UNLOCKED; + +int dm_register_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count = 0; + if (type->module) + __MOD_INC_USE_COUNT(type->module); + + list_add(&type->list, &_log_types); + spin_unlock(&_lock); + + return 0; +} + +int dm_unregister_dirty_log_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + + if (type->use_count) + DMWARN("Attempt to unregister a log type that is still in use"); + else { + list_del(&type->list); + if (type->module) + __MOD_DEC_USE_COUNT(type->module); + } + + spin_unlock(&_lock); + + return 0; +} + +static struct dirty_log_type *get_type(const char *type_name) +{ + struct dirty_log_type *type; + struct list_head *tmp; + + spin_lock(&_lock); + list_for_each (tmp, &_log_types) { + type = list_entry(tmp, struct dirty_log_type, list); + if (!strcmp(type_name, type->name)) { + type->use_count++; + spin_unlock(&_lock); + return type; + } + } + + spin_unlock(&_lock); + return NULL; +} + +static void put_type(struct dirty_log_type *type) +{ + spin_lock(&_lock); + type->use_count--; + spin_unlock(&_lock); +} + +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct dirty_log_type *type; + struct dirty_log *log; + + log = kmalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return NULL; + + type = get_type(type_name); + if (!type) { + kfree(log); + return NULL; + } + + log->type = type; + if (type->ctr(log, dev_size, argc, argv)) { + kfree(log); + put_type(type); + return NULL; + } + + return log; +} + +void dm_destroy_dirty_log(struct dirty_log *log) +{ + log->type->dtr(log); + put_type(log->type); + kfree(log); +} + + +/*----------------------------------------------------------------- + * In core log, ie. trivial, non-persistent + * + * For now we'll keep this simple and just have 2 bitsets, one + * for clean/dirty, the other for sync/nosync. The sync bitset + * will be freed when everything is in sync. 
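+ *
+ * The constructor takes a single argument, the region size in
+ * sectors ("core" being selected by name via dm_create_dirty_log()).
+ * As a rough, illustrative sizing: a 2GiB mirror (4194304 sectors)
+ * with a region size of 1024 sectors gives region_count = 4096, so
+ * each of the three bitsets below is only 4096 bits (512 bytes,
+ * rounded up to a whole number of longs).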
+ * + * FIXME: problems with a 64bit sector_t + *---------------------------------------------------------------*/ +struct core_log { + sector_t region_size; + unsigned int region_count; + unsigned long *clean_bits; + unsigned long *sync_bits; + unsigned long *recovering_bits; /* FIXME: this seems excessive */ + + int sync_search; +}; + +static int core_ctr(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv) +{ + struct core_log *clog; + sector_t region_size; + unsigned int region_count; + size_t bitset_size; + + if (argc != 1) { + DMWARN("wrong number of arguments to core_log"); + return -EINVAL; + } + + if (sscanf(argv[0], SECTOR_FORMAT, ®ion_size) != 1) { + DMWARN("invalid region size string"); + return -EINVAL; + } + + region_count = dm_div_up(dev_size, region_size); + + clog = kmalloc(sizeof(*clog), GFP_KERNEL); + if (!clog) { + DMWARN("couldn't allocate core log"); + return -ENOMEM; + } + + clog->region_size = region_size; + clog->region_count = region_count; + + bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits)); + clog->clean_bits = vmalloc(bitset_size); + if (!clog->clean_bits) { + DMWARN("couldn't allocate clean bitset"); + kfree(clog); + return -ENOMEM; + } + memset(clog->clean_bits, -1, bitset_size); + + clog->sync_bits = vmalloc(bitset_size); + if (!clog->sync_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->sync_bits, 0, bitset_size); + + clog->recovering_bits = vmalloc(bitset_size); + if (!clog->recovering_bits) { + DMWARN("couldn't allocate sync bitset"); + vfree(clog->sync_bits); + vfree(clog->clean_bits); + kfree(clog); + return -ENOMEM; + } + memset(clog->recovering_bits, 0, bitset_size); + clog->sync_search = 0; + log->context = clog; + return 0; +} + +static void core_dtr(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + vfree(clog->clean_bits); + vfree(clog->sync_bits); + vfree(clog->recovering_bits); + kfree(clog); +} + +static sector_t core_get_region_size(struct dirty_log *log) +{ + struct core_log *clog = (struct core_log *) log->context; + return clog->region_size; +} + +static int core_is_clean(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + return test_bit(region, clog->clean_bits); +} + +static int core_in_sync(struct dirty_log *log, region_t region, int block) +{ + struct core_log *clog = (struct core_log *) log->context; + + return test_bit(region, clog->sync_bits) ? 
1 : 0; +} + +static int core_flush(struct dirty_log *log) +{ + /* no op */ + return 0; +} + +static void core_mark_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + clear_bit(region, clog->clean_bits); +} + +static void core_clear_region(struct dirty_log *log, region_t region) +{ + struct core_log *clog = (struct core_log *) log->context; + set_bit(region, clog->clean_bits); +} + +static int core_get_resync_work(struct dirty_log *log, region_t *region) +{ + struct core_log *clog = (struct core_log *) log->context; + + if (clog->sync_search >= clog->region_count) + return 0; + + do { + *region = find_next_zero_bit(clog->sync_bits, + clog->region_count, + clog->sync_search); + clog->sync_search = *region + 1; + + if (*region == clog->region_count) + return 0; + + } while (test_bit(*region, clog->recovering_bits)); + + set_bit(*region, clog->recovering_bits); + return 1; +} + +static void core_complete_resync_work(struct dirty_log *log, region_t region, + int success) +{ + struct core_log *clog = (struct core_log *) log->context; + + clear_bit(region, clog->recovering_bits); + if (success) + set_bit(region, clog->sync_bits); +} + +static struct dirty_log_type _core_type = { + .name = "core", + + .ctr = core_ctr, + .dtr = core_dtr, + .get_region_size = core_get_region_size, + .is_clean = core_is_clean, + .in_sync = core_in_sync, + .flush = core_flush, + .mark_region = core_mark_region, + .clear_region = core_clear_region, + .get_resync_work = core_get_resync_work, + .complete_resync_work = core_complete_resync_work +}; + +__init int dm_dirty_log_init(void) +{ + int r; + + r = dm_register_dirty_log_type(&_core_type); + if (r) + DMWARN("couldn't register core log"); + + return r; +} + +void dm_dirty_log_exit(void) +{ + dm_unregister_dirty_log_type(&_core_type); +} + +EXPORT_SYMBOL(dm_register_dirty_log_type); +EXPORT_SYMBOL(dm_unregister_dirty_log_type); +EXPORT_SYMBOL(dm_dirty_log_init); +EXPORT_SYMBOL(dm_dirty_log_exit); +EXPORT_SYMBOL(dm_create_dirty_log); +EXPORT_SYMBOL(dm_destroy_dirty_log); --- diff/drivers/md/dm-log.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-log.h 2003-08-26 14:21:42.000000000 +0100 @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2003 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_DIRTY_LOG +#define DM_DIRTY_LOG + +#include "dm.h" + +typedef sector_t region_t; + +struct dirty_log_type; + +struct dirty_log { + struct dirty_log_type *type; + void *context; +}; + +struct dirty_log_type { + struct list_head list; + const char *name; + struct module *module; + unsigned int use_count; + + int (*ctr)(struct dirty_log *log, sector_t dev_size, + unsigned int argc, char **argv); + void (*dtr)(struct dirty_log *log); + + /* + * Retrieves the smallest size of region that the log can + * deal with. + */ + sector_t (*get_region_size)(struct dirty_log *log); + + /* + * A predicate to say whether a region is clean or not. + * May block. + */ + int (*is_clean)(struct dirty_log *log, region_t region); + + /* + * Returns: 0, 1, -EWOULDBLOCK, < 0 + * + * A predicate function to check the area given by + * [sector, sector + len) is in sync. + * + * If -EWOULDBLOCK is returned the state of the region is + * unknown, typically this will result in a read being + * passed to a daemon to deal with, since a daemon is + * allowed to block. + */ + int (*in_sync)(struct dirty_log *log, region_t region, int can_block); + + /* + * Flush the current log state (eg, to disk). 
This + * function may block. + */ + int (*flush)(struct dirty_log *log); + + /* + * Mark an area as clean or dirty. These functions may + * block, though for performance reasons blocking should + * be extremely rare (eg, allocating another chunk of + * memory for some reason). + */ + void (*mark_region)(struct dirty_log *log, region_t region); + void (*clear_region)(struct dirty_log *log, region_t region); + + /* + * Returns: <0 (error), 0 (no region), 1 (region) + * + * The mirrord will need perform recovery on regions of + * the mirror that are in the NOSYNC state. This + * function asks the log to tell the caller about the + * next region that this machine should recover. + * + * Do not confuse this function with 'in_sync()', one + * tells you if an area is synchronised, the other + * assigns recovery work. + */ + int (*get_resync_work)(struct dirty_log *log, region_t *region); + + /* + * This notifies the log that the resync of an area has + * been completed. The log should then mark this region + * as CLEAN. + */ + void (*complete_resync_work)(struct dirty_log *log, + region_t region, int success); +}; + +int dm_register_dirty_log_type(struct dirty_log_type *type); +int dm_unregister_dirty_log_type(struct dirty_log_type *type); + + +/* + * Make sure you use these two functions, rather than calling + * type->constructor/destructor() directly. + */ +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size, + unsigned int argc, char **argv); +void dm_destroy_dirty_log(struct dirty_log *log); + +/* + * init/exit functions. + */ +int dm_dirty_log_init(void); +void dm_dirty_log_exit(void); + +#endif --- diff/drivers/md/dm-raid1.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-raid1.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,1297 @@ +/* + * Copyright (C) 2003 Sistina Software Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "dm-daemon.h" +#include "dm-io.h" +#include "dm-log.h" +#include "kcopyd.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct dm_daemon _kmirrord; + +/*----------------------------------------------------------------- + * buffer lists: + * + * We play with singly linked lists of buffers, but we want to be + * careful to add new buffers to the back of the list, to avoid + * buffers being starved of attention. + *---------------------------------------------------------------*/ +struct buffer_list { + struct buffer_head *head; + struct buffer_head *tail; +}; + +static inline void buffer_list_init(struct buffer_list *bl) +{ + bl->head = bl->tail = NULL; +} + +static inline void buffer_list_add(struct buffer_list *bl, + struct buffer_head *bh) +{ + bh->b_reqnext = NULL; + + if (bl->tail) { + bl->tail->b_reqnext = bh; + bl->tail = bh; + } else + bl->head = bl->tail = bh; +} + +static struct buffer_head *buffer_list_pop(struct buffer_list *bl) +{ + struct buffer_head *bh = bl->head; + + if (bh) { + bl->head = bl->head->b_reqnext; + if (!bl->head) + bl->tail = NULL; + + bh->b_reqnext = NULL; + } + + return bh; +} + +/*----------------------------------------------------------------- + * Region hash + * + * The mirror splits itself up into discrete regions. Each + * region can be in one of three states: clean, dirty, + * nosync. There is no need to put clean regions in the hash. + * + * In addition to being present in the hash table a region _may_ + * be present on one of three lists. 
+ * + * clean_regions: Regions on this list have no io pending to + * them, they are in sync, we are no longer interested in them, + * they are dull. rh_update_states() will remove them from the + * hash table. + * + * quiesced_regions: These regions have been spun down, ready + * for recovery. rh_recovery_start() will remove regions from + * this list and hand them to kmirrord, which will schedule the + * recovery io with kcopyd. + * + * recovered_regions: Regions that kcopyd has successfully + * recovered. rh_update_states() will now schedule any delayed + * io, up the recovery_count, and remove the region from the + * hash. + * + * There are 2 locks: + * A rw spin lock 'hash_lock' protects just the hash table, + * this is never held in write mode from interrupt context, + * which I believe means that we only have to disable irqs when + * doing a write lock. + * + * An ordinary spin lock 'region_lock' that protects the three + * lists in the region_hash, with the 'state', 'list' and + * 'bhs_delayed' fields of the regions. This is used from irq + * context, so all other uses will have to suspend local irqs. + *---------------------------------------------------------------*/ +struct mirror_set; +struct region_hash { + struct mirror_set *ms; + sector_t region_size; + + /* holds persistent region state */ + struct dirty_log *log; + + /* hash table */ + rwlock_t hash_lock; + mempool_t *region_pool; + unsigned int mask; + unsigned int nr_buckets; + struct list_head *buckets; + + spinlock_t region_lock; + struct semaphore recovery_count; + struct list_head clean_regions; + struct list_head quiesced_regions; + struct list_head recovered_regions; +}; + +enum { + RH_CLEAN, + RH_DIRTY, + RH_NOSYNC, + RH_RECOVERING +}; + +struct region { + struct region_hash *rh; /* FIXME: can we get rid of this ? */ + region_t key; + int state; + + struct list_head hash_list; + struct list_head list; + + atomic_t pending; + struct buffer_head *delayed_bhs; +}; + +/* + * Conversion fns + */ +static inline region_t bh_to_region(struct region_hash *rh, + struct buffer_head *bh) +{ + return bh->b_rsector / rh->region_size; +} + +static inline sector_t region_to_sector(struct region_hash *rh, region_t region) +{ + return region * rh->region_size; +} + +/* FIXME move this */ +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw); + +static void *region_alloc(int gfp_mask, void *pool_data) +{ + return kmalloc(sizeof(struct region), gfp_mask); +} + +static void region_free(void *element, void *pool_data) +{ + kfree(element); +} + +#define MIN_REGIONS 64 +#define MAX_RECOVERY 1 +static int rh_init(struct region_hash *rh, struct mirror_set *ms, + struct dirty_log *log, sector_t region_size, + region_t nr_regions) +{ + unsigned int nr_buckets, max_buckets; + size_t i; + + /* + * Calculate a suitable number of buckets for our hash + * table. 
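+	 * Roughly one bucket per 64-128 regions, never fewer than 64
+	 * buckets.  Illustrative numbers: for nr_regions = 65536,
+	 * max_buckets = 1024, the loop below stops at 1024 and the
+	 * final halving leaves 512 buckets (mask 511).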
+ */ + max_buckets = nr_regions >> 6; + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) + ; + nr_buckets >>= 1; + + rh->ms = ms; + rh->log = log; + rh->region_size = region_size; + rwlock_init(&rh->hash_lock); + rh->mask = nr_buckets - 1; + rh->nr_buckets = nr_buckets; + + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); + if (!rh->buckets) { + DMERR("unable to allocate region hash memory"); + return -ENOMEM; + } + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(rh->buckets + i); + + spin_lock_init(&rh->region_lock); + sema_init(&rh->recovery_count, 0); + INIT_LIST_HEAD(&rh->clean_regions); + INIT_LIST_HEAD(&rh->quiesced_regions); + INIT_LIST_HEAD(&rh->recovered_regions); + + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc, + region_free, NULL); + if (!rh->region_pool) { + vfree(rh->buckets); + rh->buckets = NULL; + return -ENOMEM; + } + + return 0; +} + +static void rh_exit(struct region_hash *rh) +{ + unsigned int h; + struct region *reg; + struct list_head *tmp, *tmp2; + + BUG_ON(!list_empty(&rh->quiesced_regions)); + for (h = 0; h < rh->nr_buckets; h++) { + list_for_each_safe (tmp, tmp2, rh->buckets + h) { + reg = list_entry(tmp, struct region, hash_list); + BUG_ON(atomic_read(®->pending)); + mempool_free(reg, rh->region_pool); + } + } + + if (rh->log) + dm_destroy_dirty_log(rh->log); + if (rh->region_pool) + mempool_destroy(rh->region_pool); + vfree(rh->buckets); +} + +#define RH_HASH_MULT 2654435387U + +static inline unsigned int rh_hash(struct region_hash *rh, region_t region) +{ + return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask; +} + +static struct region *__rh_lookup(struct region_hash *rh, region_t region) +{ + struct region *reg; + + list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list) + if (reg->key == region) + return reg; + + return NULL; +} + +static void __rh_insert(struct region_hash *rh, struct region *reg) +{ + unsigned int h = rh_hash(rh, reg->key); + list_add(®->hash_list, rh->buckets + h); +} + +static struct region *__rh_alloc(struct region_hash *rh, region_t region) +{ + struct region *reg, *nreg; + + read_unlock(&rh->hash_lock); + nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? + RH_CLEAN : RH_NOSYNC; + nreg->rh = rh; + nreg->key = region; + + INIT_LIST_HEAD(&nreg->list); + + atomic_set(&nreg->pending, 0); + nreg->delayed_bhs = NULL; + write_lock_irq(&rh->hash_lock); + + reg = __rh_lookup(rh, region); + if (reg) + /* we lost the race */ + mempool_free(nreg, rh->region_pool); + + else { + __rh_insert(rh, nreg); + if (nreg->state == RH_CLEAN) { + spin_lock_irq(&rh->region_lock); + list_add(&nreg->list, &rh->clean_regions); + spin_unlock_irq(&rh->region_lock); + } + reg = nreg; + } + write_unlock_irq(&rh->hash_lock); + read_lock(&rh->hash_lock); + + return reg; +} + +static inline struct region *__rh_find(struct region_hash *rh, region_t region) +{ + struct region *reg; + + reg = __rh_lookup(rh, region); + if (!reg) + reg = __rh_alloc(rh, region); + + return reg; +} + +static int rh_state(struct region_hash *rh, region_t region, int may_block) +{ + int r; + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (reg) + return reg->state; + + /* + * The region wasn't in the hash, so we fall back to the + * dirty log. + */ + r = rh->log->type->in_sync(rh->log, region, may_block); + + /* + * Any error from the dirty log (eg. 
-EWOULDBLOCK) gets + * taken as a RH_NOSYNC + */ + return r == 1 ? RH_CLEAN : RH_NOSYNC; +} + +static inline int rh_in_sync(struct region_hash *rh, + region_t region, int may_block) +{ + int state = rh_state(rh, region, may_block); + return state == RH_CLEAN || state == RH_DIRTY; +} + +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh) +{ + struct buffer_head *nbh; + + while (bh) { + nbh = bh->b_reqnext; + queue_bh(ms, bh, WRITE); + bh = nbh; + } +} + +static void rh_update_states(struct region_hash *rh) +{ + struct list_head *tmp, *tmp2; + struct region *reg; + + LIST_HEAD(clean); + LIST_HEAD(recovered); + + /* + * Quickly grab the lists. + */ + write_lock_irq(&rh->hash_lock); + spin_lock(&rh->region_lock); + if (!list_empty(&rh->clean_regions)) { + list_splice(&rh->clean_regions, &clean); + INIT_LIST_HEAD(&rh->clean_regions); + + list_for_each_entry (reg, &clean, list) { + rh->log->type->clear_region(rh->log, reg->key); + list_del(®->hash_list); + } + } + + if (!list_empty(&rh->recovered_regions)) { + list_splice(&rh->recovered_regions, &recovered); + INIT_LIST_HEAD(&rh->recovered_regions); + + list_for_each_entry (reg, &recovered, list) + list_del(®->hash_list); + } + spin_unlock(&rh->region_lock); + write_unlock_irq(&rh->hash_lock); + + /* + * All the regions on the recovered and clean lists have + * now been pulled out of the system, so no need to do + * any more locking. + */ + list_for_each_safe (tmp, tmp2, &recovered) { + reg = list_entry(tmp, struct region, list); + + rh->log->type->complete_resync_work(rh->log, reg->key, 1); + dispatch_buffers(rh->ms, reg->delayed_bhs); + up(&rh->recovery_count); + mempool_free(reg, rh->region_pool); + } + + list_for_each_safe (tmp, tmp2, &clean) { + reg = list_entry(tmp, struct region, list); + mempool_free(reg, rh->region_pool); + } +} + +static void rh_inc(struct region_hash *rh, region_t region) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + if (reg->state == RH_CLEAN) { + rh->log->type->mark_region(rh->log, reg->key); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_DIRTY; + list_del_init(®->list); /* take off the clean list */ + spin_unlock_irq(&rh->region_lock); + } + + atomic_inc(®->pending); + read_unlock(&rh->hash_lock); +} + +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers) +{ + struct buffer_head *bh; + + for (bh = buffers->head; bh; bh = bh->b_reqnext) + rh_inc(rh, bh_to_region(rh, bh)); +} + +static void rh_dec(struct region_hash *rh, region_t region) +{ + unsigned long flags; + struct region *reg; + int wake = 0; + + read_lock(&rh->hash_lock); + reg = __rh_lookup(rh, region); + read_unlock(&rh->hash_lock); + + if (atomic_dec_and_test(®->pending)) { + spin_lock_irqsave(&rh->region_lock, flags); + if (reg->state == RH_RECOVERING) { + list_add_tail(®->list, &rh->quiesced_regions); + } else { + reg->state = RH_CLEAN; + list_add(®->list, &rh->clean_regions); + } + spin_unlock_irqrestore(&rh->region_lock, flags); + wake = 1; + } + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Starts quiescing a region in preparation for recovery. + */ +static int __rh_recovery_prepare(struct region_hash *rh) +{ + int r; + struct region *reg; + region_t region; + + /* + * Ask the dirty log what's next. + */ + r = rh->log->type->get_resync_work(rh->log, ®ion); + if (r <= 0) + return r; + + /* + * Get this region, and start it quiescing by setting the + * recovering flag. 
+ */ + read_lock(&rh->hash_lock); + reg = __rh_find(rh, region); + read_unlock(&rh->hash_lock); + + spin_lock_irq(&rh->region_lock); + reg->state = RH_RECOVERING; + + /* Already quiesced ? */ + if (atomic_read(®->pending)) + list_del_init(®->list); + + else { + list_del_init(®->list); + list_add(®->list, &rh->quiesced_regions); + } + spin_unlock_irq(&rh->region_lock); + + return 1; +} + +static void rh_recovery_prepare(struct region_hash *rh) +{ + while (!down_trylock(&rh->recovery_count)) + if (__rh_recovery_prepare(rh) <= 0) { + up(&rh->recovery_count); + break; + } +} + +/* + * Returns any quiesced regions. + */ +static struct region *rh_recovery_start(struct region_hash *rh) +{ + struct region *reg = NULL; + + spin_lock_irq(&rh->region_lock); + if (!list_empty(&rh->quiesced_regions)) { + reg = list_entry(rh->quiesced_regions.next, + struct region, list); + list_del_init(®->list); /* remove from the quiesced list */ + } + spin_unlock_irq(&rh->region_lock); + + return reg; +} + +/* FIXME: success ignored for now */ +static void rh_recovery_end(struct region *reg, int success) +{ + struct region_hash *rh = reg->rh; + + spin_lock_irq(&rh->region_lock); + list_add(®->list, ®->rh->recovered_regions); + spin_unlock_irq(&rh->region_lock); + + dm_daemon_wake(&_kmirrord); +} + +static void rh_flush(struct region_hash *rh) +{ + rh->log->type->flush(rh->log); +} + +static void rh_delay(struct region_hash *rh, struct buffer_head *bh) +{ + struct region *reg; + + read_lock(&rh->hash_lock); + reg = __rh_find(rh, bh_to_region(rh, bh)); + bh->b_reqnext = reg->delayed_bhs; + reg->delayed_bhs = bh; + read_unlock(&rh->hash_lock); +} + +static void rh_stop_recovery(struct region_hash *rh) +{ + int i; + + /* wait for any recovering regions */ + for (i = 0; i < MAX_RECOVERY; i++) + down(&rh->recovery_count); +} + +static void rh_start_recovery(struct region_hash *rh) +{ + int i; + + for (i = 0; i < MAX_RECOVERY; i++) + up(&rh->recovery_count); + + dm_daemon_wake(&_kmirrord); +} + +/*----------------------------------------------------------------- + * Mirror set structures. + *---------------------------------------------------------------*/ +struct mirror { + atomic_t error_count; + struct dm_dev *dev; + sector_t offset; +}; + +struct mirror_set { + struct dm_target *ti; + struct list_head list; + struct region_hash rh; + struct kcopyd_client *kcopyd_client; + + spinlock_t lock; /* protects the next two lists */ + struct buffer_list reads; + struct buffer_list writes; + + /* recovery */ + region_t nr_regions; + region_t sync_count; + + unsigned int nr_mirrors; + struct mirror mirror[0]; +}; + +/* + * Every mirror should look like this one. + */ +#define DEFAULT_MIRROR 0 + +/* + * This is yucky. We squirrel the mirror_set struct away inside + * b_reqnext for write buffers. This is safe since the bh + * doesn't get submitted to the lower levels of block layer. + */ +static struct mirror_set *bh_get_ms(struct buffer_head *bh) +{ + return (struct mirror_set *) bh->b_reqnext; +} + +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms) +{ + bh->b_reqnext = (struct buffer_head *) ms; +} + +/*----------------------------------------------------------------- + * Recovery. + * + * When a mirror is first activated we may find that some regions + * are in the no-sync state. We have to recover these by + * recopying from the default mirror to all the others. 
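+ *
+ * For reference, the path a no-sync region takes through the code
+ * below and the region hash above is roughly:
+ *
+ *	rh_recovery_prepare()	log->get_resync_work(), mark RH_RECOVERING
+ *	rh_dec()		pending io drains, region moves to quiesced_regions
+ *	rh_recovery_start()	kmirrord pops a quiesced region
+ *	recover()		kcopyd copies DEFAULT_MIRROR to the others
+ *	recovery_complete()	rh_recovery_end() puts it on recovered_regions
+ *	rh_update_states()	complete_resync_work(), delayed io dispatched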
+ *---------------------------------------------------------------*/ +static void recovery_complete(int read_err, unsigned int write_err, + void *context) +{ + struct region *reg = (struct region *) context; + struct mirror_set *ms = reg->rh->ms; + + /* FIXME: better error handling */ + rh_recovery_end(reg, read_err || write_err); + if (++ms->sync_count == ms->nr_regions) + /* the sync is complete */ + dm_table_event(ms->ti->table); +} + +static int recover(struct mirror_set *ms, struct region *reg) +{ + int r; + unsigned int i; + struct io_region from, to[ms->nr_mirrors - 1], *dest; + struct mirror *m; + unsigned int flags = 0; + + /* fill in the source */ + m = ms->mirror + DEFAULT_MIRROR; + from.dev = m->dev->dev; + from.sector = m->offset + region_to_sector(reg->rh, reg->key); + if (reg->key == (ms->nr_regions - 1)) { + /* + * The final region may be smaller than + * region_size. + */ + from.count = ms->ti->len & (reg->rh->region_size - 1); + if (!from.count) + from.count = reg->rh->region_size; + } else + from.count = reg->rh->region_size; + + /* fill in the destinations */ + for (i = 1; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + dest = to + (i - 1); + + dest->dev = m->dev->dev; + dest->sector = m->offset + region_to_sector(reg->rh, reg->key); + dest->count = from.count; + } + + /* hand to kcopyd */ + set_bit(KCOPYD_IGNORE_ERROR, &flags); + r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, + recovery_complete, reg); + + return r; +} + +static void do_recovery(struct mirror_set *ms) +{ + int r; + struct region *reg; + + /* + * Start quiescing some regions. + */ + rh_recovery_prepare(&ms->rh); + + /* + * Copy any already quiesced regions. + */ + while ((reg = rh_recovery_start(&ms->rh))) { + r = recover(ms, reg); + if (r) + rh_recovery_end(reg, 0); + } +} + +/*----------------------------------------------------------------- + * Reads + *---------------------------------------------------------------*/ +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) +{ + /* FIXME: add read balancing */ + return ms->mirror + DEFAULT_MIRROR; +} + +/* + * remap a buffer to a particular mirror. + */ +static void map_buffer(struct mirror_set *ms, + struct mirror *m, struct buffer_head *bh) +{ + sector_t bsize = bh->b_size >> 9; + sector_t rsector = bh->b_blocknr * bsize; + + bh->b_rdev = m->dev->dev; + bh->b_rsector = m->offset + (rsector - ms->ti->begin); +} + +static void do_reads(struct mirror_set *ms, struct buffer_list *reads) +{ + region_t region; + struct buffer_head *bh; + struct mirror *m; + + while ((bh = buffer_list_pop(reads))) { + region = bh_to_region(&ms->rh, bh); + + /* + * We can only read balance if the region is in sync. + */ + if (rh_in_sync(&ms->rh, region, 0)) + m = choose_mirror(ms, bh->b_rsector); + else + m = ms->mirror + DEFAULT_MIRROR; + + map_buffer(ms, m, bh); + generic_make_request(READ, bh); + } +} + +/*----------------------------------------------------------------- + * Writes. 
+ * + * We do different things with the write io depending on the + * state of the region that it's in: + * + * SYNC: increment pending, use kcopyd to write to *all* mirrors + * RECOVERING: delay the io until recovery completes + * NOSYNC: increment pending, just write to the default mirror + *---------------------------------------------------------------*/ +static void write_callback(unsigned int error, void *context) +{ + unsigned int i; + int uptodate = 1; + struct buffer_head *bh = (struct buffer_head *) context; + struct mirror_set *ms; + + ms = bh_get_ms(bh); + bh_set_ms(bh, NULL); + + /* + * NOTE: We don't decrement the pending count here, + * instead it is done by the targets endio function. + * This way we handle both writes to SYNC and NOSYNC + * regions with the same code. + */ + + if (error) { + /* + * only error the io if all mirrors failed. + * FIXME: bogus + */ + uptodate = 0; + for (i = 0; i < ms->nr_mirrors; i++) + if (!test_bit(i, &error)) { + uptodate = 1; + break; + } + } + bh->b_end_io(bh, uptodate); +} + +static void do_write(struct mirror_set *ms, struct buffer_head *bh) +{ + unsigned int i; + struct io_region io[ms->nr_mirrors]; + struct mirror *m; + + for (i = 0; i < ms->nr_mirrors; i++) { + m = ms->mirror + i; + + io[i].dev = m->dev->dev; + io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin); + io[i].count = bh->b_size >> 9; + } + + bh_set_ms(bh, ms); + dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page, + (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh); +} + +static void do_writes(struct mirror_set *ms, struct buffer_list *writes) +{ + int state; + struct buffer_head *bh; + struct buffer_list sync, nosync, recover, *this_list = NULL; + + if (!writes->head) + return; + + /* + * Classify each write. + */ + buffer_list_init(&sync); + buffer_list_init(&nosync); + buffer_list_init(&recover); + + while ((bh = buffer_list_pop(writes))) { + state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1); + switch (state) { + case RH_CLEAN: + case RH_DIRTY: + this_list = &sync; + break; + + case RH_NOSYNC: + this_list = &nosync; + break; + + case RH_RECOVERING: + this_list = &recover; + break; + } + + buffer_list_add(this_list, bh); + } + + /* + * Increment the pending counts for any regions that will + * be written to (writes to recover regions are going to + * be delayed). + */ + rh_inc_pending(&ms->rh, &sync); + rh_inc_pending(&ms->rh, &nosync); + rh_flush(&ms->rh); + + /* + * Dispatch io. 
+ */ + while ((bh = buffer_list_pop(&sync))) + do_write(ms, bh); + + while ((bh = buffer_list_pop(&recover))) + rh_delay(&ms->rh, bh); + + while ((bh = buffer_list_pop(&nosync))) { + map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh); + generic_make_request(WRITE, bh); + } +} + +/*----------------------------------------------------------------- + * kmirrord + *---------------------------------------------------------------*/ +static LIST_HEAD(_mirror_sets); +static DECLARE_RWSEM(_mirror_sets_lock); + +static void do_mirror(struct mirror_set *ms) +{ + struct buffer_list reads, writes; + + spin_lock(&ms->lock); + memcpy(&reads, &ms->reads, sizeof(reads)); + buffer_list_init(&ms->reads); + memcpy(&writes, &ms->writes, sizeof(writes)); + buffer_list_init(&ms->writes); + spin_unlock(&ms->lock); + + rh_update_states(&ms->rh); + do_recovery(ms); + do_reads(ms, &reads); + do_writes(ms, &writes); + run_task_queue(&tq_disk); +} + +static void do_work(void) +{ + struct mirror_set *ms; + + down_read(&_mirror_sets_lock); + list_for_each_entry (ms, &_mirror_sets, list) + do_mirror(ms); + up_read(&_mirror_sets_lock); +} + +/*----------------------------------------------------------------- + * Target functions + *---------------------------------------------------------------*/ +static struct mirror_set *alloc_context(unsigned int nr_mirrors, + sector_t region_size, + struct dm_target *ti, + struct dirty_log *dl) +{ + size_t len; + struct mirror_set *ms = NULL; + + if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors)) + return NULL; + + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); + + ms = kmalloc(len, GFP_KERNEL); + if (!ms) { + ti->error = "dm-mirror: Cannot allocate mirror context"; + return NULL; + } + + memset(ms, 0, len); + spin_lock_init(&ms->lock); + + ms->ti = ti; + ms->nr_mirrors = nr_mirrors; + ms->nr_regions = dm_div_up(ti->len, region_size); + + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) { + ti->error = "dm-mirror: Error creating dirty region hash"; + kfree(ms); + return NULL; + } + + return ms; +} + +static void free_context(struct mirror_set *ms, struct dm_target *ti, + unsigned int m) +{ + while (m--) + dm_put_device(ti, ms->mirror[m].dev); + + rh_exit(&ms->rh); + kfree(ms); +} + +static inline int _check_region_size(struct dm_target *ti, sector_t size) +{ + return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) || + size > ti->len); +} + +static int get_mirror(struct mirror_set *ms, struct dm_target *ti, + unsigned int mirror, char **argv) +{ + sector_t offset; + + if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) { + ti->error = "dm-mirror: Invalid offset"; + return -EINVAL; + } + + if (dm_get_device(ti, argv[0], offset, ti->len, + dm_table_get_mode(ti->table), + &ms->mirror[mirror].dev)) { + ti->error = "dm-mirror: Device lookup failure"; + return -ENXIO; + } + + ms->mirror[mirror].offset = offset; + + return 0; +} + +static int add_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_add_tail(&ms->list, &_mirror_sets); + up_write(&_mirror_sets_lock); + dm_daemon_wake(&_kmirrord); + + return 0; +} + +static void del_mirror_set(struct mirror_set *ms) +{ + down_write(&_mirror_sets_lock); + list_del(&ms->list); + up_write(&_mirror_sets_lock); +} + +/* + * Create dirty log: log_type #log_params + */ +static struct dirty_log *create_dirty_log(struct dm_target *ti, + unsigned int argc, char **argv, + unsigned int *args_used) +{ + unsigned int param_count; + struct dirty_log *dl; + + if (argc < 2) { + ti->error = "dm-mirror: 
Insufficient mirror log arguments"; + return NULL; + } + + if (sscanf(argv[1], "%u", ¶m_count) != 1 || param_count != 1) { + ti->error = "dm-mirror: Invalid mirror log argument count"; + return NULL; + } + + *args_used = 2 + param_count; + + if (argc < *args_used) { + ti->error = "dm-mirror: Insufficient mirror log arguments"; + return NULL; + } + + dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2); + if (!dl) { + ti->error = "dm-mirror: Error creating mirror dirty log"; + return NULL; + } + + if (!_check_region_size(ti, dl->type->get_region_size(dl))) { + ti->error = "dm-mirror: Invalid region size"; + dm_destroy_dirty_log(dl); + return NULL; + } + + return dl; +} + +/* + * Construct a mirror mapping: + * + * log_type #log_params + * #mirrors [mirror_path offset]{2,} + * + * For now, #log_params = 1, log_type = "core" + * + */ +#define DM_IO_PAGES 64 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned int nr_mirrors, m, args_used; + struct mirror_set *ms; + struct dirty_log *dl; + + dl = create_dirty_log(ti, argc, argv, &args_used); + if (!dl) + return -EINVAL; + + argv += args_used; + argc -= args_used; + + if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || + nr_mirrors < 2) { + ti->error = "dm-mirror: Invalid number of mirrors"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + argv++, argc--; + + if (argc != nr_mirrors * 2) { + ti->error = "dm-mirror: Wrong number of mirror arguments"; + dm_destroy_dirty_log(dl); + return -EINVAL; + } + + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); + if (!ms) { + dm_destroy_dirty_log(dl); + return -ENOMEM; + } + + /* Get the mirror parameter sets */ + for (m = 0; m < nr_mirrors; m++) { + r = get_mirror(ms, ti, m, argv); + if (r) { + free_context(ms, ti, m); + return r; + } + argv += 2; + argc -= 2; + } + + ti->private = ms; + + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); + if (r) { + free_context(ms, ti, ms->nr_mirrors); + return r; + } + + add_mirror_set(ms); + return 0; +} + +static void mirror_dtr(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + + del_mirror_set(ms); + kcopyd_client_destroy(ms->kcopyd_client); + free_context(ms, ti, ms->nr_mirrors); +} + +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw) +{ + int wake = 0; + struct buffer_list *bl; + + bl = (rw == WRITE) ? &ms->writes : &ms->reads; + spin_lock(&ms->lock); + wake = !(bl->head); + buffer_list_add(bl, bh); + spin_unlock(&ms->lock); + + if (wake) + dm_daemon_wake(&_kmirrord); +} + +/* + * Mirror mapping function + */ +static int mirror_map(struct dm_target *ti, struct buffer_head *bh, + int rw, union map_info *map_context) +{ + int r; + struct mirror *m; + struct mirror_set *ms = ti->private; + + /* FIXME: nasty hack, 32 bit sector_t only */ + map_context->ll = bh->b_rsector / ms->rh.region_size; + + if (rw == WRITE) { + queue_bh(ms, bh, rw); + return 0; + } + + r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0); + if (r < 0 && r != -EWOULDBLOCK) + return r; + + if (r == -EWOULDBLOCK) /* FIXME: ugly */ + r = 0; + + /* + * We don't want to fast track a recovery just for a read + * ahead. So we just let it silently fail. + * FIXME: get rid of this. 
+ */ + if (!r && rw == READA) + return -EIO; + + if (!r) { + /* Pass this io over to the daemon */ + queue_bh(ms, bh, rw); + return 0; + } + + m = choose_mirror(ms, bh->b_rsector); + if (!m) + return -EIO; + + map_buffer(ms, m, bh); + return 1; +} + +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh, + int rw, int error, union map_info *map_context) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + region_t region = map_context->ll; + + /* + * We need to dec pending if this was a write. + */ + if (rw == WRITE) + rh_dec(&ms->rh, region); + + return 0; +} + +static void mirror_suspend(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_stop_recovery(&ms->rh); +} + +static void mirror_resume(struct dm_target *ti) +{ + struct mirror_set *ms = (struct mirror_set *) ti->private; + rh_start_recovery(&ms->rh); +} + +static int mirror_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + unsigned int m, sz = 0; + struct mirror_set *ms = (struct mirror_set *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) + sz += snprintf(result + sz, maxlen - sz, "%s ", + dm_kdevname(ms->mirror[m].dev->dev)); + + sz += snprintf(result + sz, maxlen - sz, "%lu/%lu", + ms->sync_count, ms->nr_regions); + break; + + case STATUSTYPE_TABLE: + sz += snprintf(result + sz, maxlen - sz, + "%s 1 " SECTOR_FORMAT " %d ", + ms->rh.log->type->name, ms->rh.region_size, + ms->nr_mirrors); + + for (m = 0; m < ms->nr_mirrors; m++) + sz += snprintf(result + sz, maxlen - sz, "%s %ld ", + dm_kdevname(ms->mirror[m].dev->dev), + ms->mirror[m].offset); + } + + return 0; +} + +static struct target_type mirror_target = { + .name = "mirror", + .module = THIS_MODULE, + .ctr = mirror_ctr, + .dtr = mirror_dtr, + .map = mirror_map, + .end_io = mirror_end_io, + .suspend = mirror_suspend, + .resume = mirror_resume, + .status = mirror_status, +}; + +static int __init dm_mirror_init(void) +{ + int r; + + r = dm_dirty_log_init(); + if (r) + return r; + + r = dm_daemon_start(&_kmirrord, "kmirrord", do_work); + if (r) { + DMERR("couldn't start kmirrord"); + dm_dirty_log_exit(); + return r; + } + + r = dm_register_target(&mirror_target); + if (r < 0) { + DMERR("%s: Failed to register mirror target", + mirror_target.name); + dm_dirty_log_exit(); + dm_daemon_stop(&_kmirrord); + } + + return r; +} + +static void __exit dm_mirror_exit(void) +{ + int r; + + r = dm_unregister_target(&mirror_target); + if (r < 0) + DMERR("%s: unregister failed %d", mirror_target.name, r); + + dm_daemon_stop(&_kmirrord); + dm_dirty_log_exit(); +} + +/* Module hooks */ +module_init(dm_mirror_init); +module_exit(dm_mirror_exit); + +MODULE_DESCRIPTION(DM_NAME " mirror target"); +MODULE_AUTHOR("Heinz Mauelshagen "); +MODULE_LICENSE("GPL"); --- diff/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-snapshot.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,1235 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-snapshot.h" +#include "kcopyd.h" + +/* + * FIXME: Remove this before release. + */ +#if 0 +#define DMDEBUG(x...) DMWARN( ## x) +#else +#define DMDEBUG(x...) 
+#endif + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * kcopyd priority of snapshot operations + */ +#define SNAPSHOT_COPY_PRIORITY 2 + +/* + * Each snapshot reserves this many pages for io + * FIXME: calculate this + */ +#define SNAPSHOT_PAGES 256 + +struct pending_exception { + struct exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a list (using b_reqnext). + */ + struct buffer_head *origin_bhs; + struct buffer_head *snapshot_bhs; + + /* + * Other pending_exceptions that are processing this + * chunk. When this list is empty, we know we can + * complete the origins. + */ + struct list_head siblings; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. + */ + int started; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + kdev_t dev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("Device mapper: Snapshot: unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); +} + +static inline unsigned int origin_hash(kdev_t dev) +{ + return MINOR(dev) & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(kdev_t origin) +{ + struct list_head *slist; + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each(slist, ol) { + o = list_entry(slist, struct origin, hash_list); + + if (o->dev == origin) + return o; + } + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->dev)]; + list_add_tail(&o->hash_list, sl); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. + */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o; + kdev_t dev = snap->origin->dev; + + down_write(&_origins_lock); + o = __lookup_origin(dev); + + if (!o) { + /* New origin */ + o = kmalloc(sizeof(*o), GFP_KERNEL); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->dev = dev; + + __insert_origin(o); + } + + list_add_tail(&snap->list, &o->snapshots); + + up_write(&_origins_lock); + return 0; +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->dev); + + list_del(&s->list); + if (list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. 
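+ *
+ * Both tables are simple chained hashes keyed on the old (origin
+ * side) chunk number: exception_hash() below is just chunk &
+ * hash_mask, so with, say, 4096 buckets chunk 10000 lands in bucket
+ * 10000 & 4095 == 1808.  (Numbers illustrative only.)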
+ */ +static int init_exception_table(struct exception_table *et, uint32_t size) +{ + unsigned int i; + + et->hash_mask = size - 1; + et->table = vcalloc(size, sizeof(struct list_head)); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); + + return 0; +} + +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); + } + } + + vfree(et->table); +} + +/* + * FIXME: check how this hash fn is performing. + */ +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +{ + return chunk & et->hash_mask; +} + +static void insert_exception(struct exception_table *eh, struct exception *e) +{ + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct exception *lookup_exception(struct exception_table *et, + chunk_t chunk) +{ + struct list_head *slot, *el; + struct exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; + } + + return NULL; +} + +static inline struct exception *alloc_exception(void) +{ + struct exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ + return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ + mempool_free(pe, pending_pool); +} + +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +{ + struct exception *e; + + e = alloc_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + e->new_chunk = new; + insert_exception(&s->complete, e); + return 0; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 50; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Rounds a number down to a power of 2. + */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... 
+ */ + cow_dev_size = get_dev_size(s->cow->dev); + origin_dev_size = get_dev_size(s->origin->dev); + max_buckets = calc_max_buckets(); + + hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size; + hash_size = min(hash_size, max_buckets); + + /* Round it down to a power of 2 */ + hash_size = round_down(hash_size); + if (init_exception_table(&s->complete, hash_size)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (!hash_size) + hash_size = 64; + + if (init_exception_table(&s->pending, hash_size)) { + exit_exception_table(&s->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +/* + * Round a number up to the nearest 'size' boundary. size must + * be a power of 2. + */ +static inline ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +/* + * Construct a snapshot mapping:
+ * <origin_dev> <COW-dev> <p/n> <chunk_size>
+ */ +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dm_snapshot *s; + unsigned long chunk_size; + int r = -EINVAL; + char persistent; + char *origin_path; + char *cow_path; + char *value; + int blocksize; + + if (argc < 4) { + ti->error = "dm-snapshot: requires exactly 4 arguments"; + r = -EINVAL; + goto bad1; + } + + origin_path = argv[0]; + cow_path = argv[1]; + persistent = toupper(*argv[2]); + + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad1; + } + + chunk_size = simple_strtoul(argv[3], &value, 10); + if (chunk_size == 0 || value == NULL) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad1; + } + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) { + ti->error = "Cannot allocate snapshot context private " + "structure"; + r = -ENOMEM; + goto bad1; + } + + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad2; + } + + /* FIXME: get cow length */ + r = dm_get_device(ti, cow_path, 0, 0, + FMODE_READ | FMODE_WRITE, &s->cow); + if (r) { + dm_put_device(ti, s->origin); + ti->error = "Cannot get COW device"; + goto bad2; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); + + /* Validate the chunk size against the device block size */ + blocksize = get_hardsect_size(s->cow->dev); + if (chunk_size % (blocksize / SECTOR_SIZE)) { + ti->error = "Chunk size is not a multiple of device blocksize"; + r = -EINVAL; + goto bad3; + } + + /* Check the sizes are small enough to fit in one kiovec */ + if (chunk_size > KIO_MAX_SECTORS) { + ti->error = "Chunk size is too big"; + r = -EINVAL; + goto bad3; + } + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + ti->error = "Chunk size is not a power of 2"; + r = -EINVAL; + goto bad3; + } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->type = persistent; + for (s->chunk_shift = 0; chunk_size; + s->chunk_shift++, chunk_size >>= 1) + ; + s->chunk_shift--; + + s->valid = 1; + s->have_metadata = 0; + s->last_percent = 0; + init_rwsem(&s->lock); + s->table = ti->table; + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad3; + } + + /* + * Check the persistent flag - done here because we need the iobuf + * to check the LV header + */ + s->store.snap = s; + + if (persistent == 'P') + r = dm_create_persistent(&s->store, s->chunk_size); + else + r = dm_create_transient(&s->store, s, blocksize); + + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad4; + } + + r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); + if (r) { + ti->error = "Could not create kcopyd client"; + goto bad5; + } + + /* Flush IO to the origin device */ + fsync_dev(s->origin->dev); + + /* Add snapshot to the list of snapshots for this origin */ + if (register_snapshot(s)) { + r = -EINVAL; + ti->error = "Cannot register snapshot origin"; + goto bad6; + } + + ti->private = s; + return 0; + + bad6: + kcopyd_client_destroy(s->kcopyd_client); + + bad5: + s->store.destroy(&s->store); + + bad4: + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + bad3: + dm_put_device(ti, s->cow); + dm_put_device(ti, s->origin); + + bad2: + kfree(s); + + 
bad1: + return r; +} + +static void snapshot_dtr(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + dm_table_event(ti->table); + + unregister_snapshot(s); + + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + /* Deallocate memory used */ + s->store.destroy(&s->store); + + dm_put_device(ti, s->origin); + dm_put_device(ti, s->cow); + kcopyd_client_destroy(s->kcopyd_client); + kfree(s); +} + +/* + * We hold lists of buffer_heads, using the b_reqnext field. + */ +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) +{ + bh->b_reqnext = *queue; + *queue = bh; +} + +/* + * FIXME: inefficient. + */ +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs) +{ + while (*queue) + queue = &((*queue)->b_reqnext); + + *queue = bhs; +} + +/* + * Flush a list of buffers. + */ +static void flush_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + DMDEBUG("begin flush"); + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + DMDEBUG("flushing %p", bh); + generic_make_request(WRITE, bh); + bh = n; + } + + run_task_queue(&tq_disk); +} + +/* + * Error a list of buffers. + */ +static void error_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + buffer_IO_error(bh); + bh = n; + } +} + +static struct buffer_head *__flush_bhs(struct pending_exception *pe) +{ + struct pending_exception *sibling; + + if (list_empty(&pe->siblings)) + return pe->origin_bhs; + + sibling = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + list_del(&pe->siblings); + + /* FIXME: I think there's a race on SMP machines here, add spin lock */ + queue_buffers(&sibling->origin_bhs, pe->origin_bhs); + + return NULL; +} + +static void pending_complete(struct pending_exception *pe, int success) +{ + struct exception *e; + struct dm_snapshot *s = pe->snap; + struct buffer_head *flush = NULL; + + if (success) { + e = alloc_exception(); + if (!e) { + DMWARN("Unable to allocate exception."); + down_write(&s->lock); + s->store.drop_snapshot(&s->store); + s->valid = 0; + flush = __flush_bhs(pe); + up_write(&s->lock); + + error_buffers(pe->snapshot_bhs); + goto out; + } + + /* + * Add a proper exception, and remove the + * in-flight exception from the list. 
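
An illustrative aside (not part of the patch): the origin/snapshot buffer queues above are plain singly-linked lists threaded through b_reqnext; queue_buffer() pushes at the head and queue_buffers() walks to the tail to splice a whole chain on, which is the O(n) append the FIXME refers to. A minimal user-space sketch of the same idea, with a hypothetical struct buf standing in for buffer_head:

#include <stdio.h>

struct buf {				/* stand-in for buffer_head */
	int id;
	struct buf *next;		/* stand-in for b_reqnext */
};

/* LIFO push, mirrors queue_buffer() */
static void queue_buf(struct buf **queue, struct buf *b)
{
	b->next = *queue;
	*queue = b;
}

/* splice a whole chain onto the tail, mirrors queue_buffers() (O(n) walk) */
static void queue_bufs(struct buf **queue, struct buf *chain)
{
	while (*queue)
		queue = &(*queue)->next;
	*queue = chain;
}

int main(void)
{
	struct buf bufs[4] = { {0, NULL}, {1, NULL}, {2, NULL}, {3, NULL} };
	struct buf *a = NULL, *b = NULL;

	queue_buf(&a, &bufs[0]);
	queue_buf(&a, &bufs[1]);	/* a: 1 -> 0 */
	queue_buf(&b, &bufs[2]);
	queue_buf(&b, &bufs[3]);	/* b: 3 -> 2 */
	queue_bufs(&a, b);		/* a: 1 -> 0 -> 3 -> 2 */

	for (; a; a = a->next)
		printf("%d ", a->id);
	printf("\n");
	return 0;
}
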
+ */ + down_write(&s->lock); + + memcpy(e, &pe->e, sizeof(*e)); + insert_exception(&s->complete, e); + remove_exception(&pe->e); + flush = __flush_bhs(pe); + + /* Submit any pending write BHs */ + up_write(&s->lock); + + flush_buffers(pe->snapshot_bhs); + DMDEBUG("Exception completed successfully."); + + /* Notify any interested parties */ + if (s->store.fraction_full) { + sector_t numerator, denominator; + int pc; + + s->store.fraction_full(&s->store, &numerator, + &denominator); + pc = numerator * 100 / denominator; + + if (pc >= s->last_percent + WAKE_UP_PERCENT) { + dm_table_event(s->table); + s->last_percent = pc - pc % WAKE_UP_PERCENT; + } + } + + } else { + /* Read/write error - snapshot is unusable */ + down_write(&s->lock); + if (s->valid) + DMERR("Error reading/writing snapshot"); + s->store.drop_snapshot(&s->store); + s->valid = 0; + remove_exception(&pe->e); + flush = __flush_bhs(pe); + up_write(&s->lock); + + error_buffers(pe->snapshot_bhs); + + dm_table_event(s->table); + DMDEBUG("Exception failed."); + } + + out: + if (flush) + flush_buffers(flush); + + free_pending_exception(pe); +} + +static void commit_callback(void *context, int success) +{ + struct pending_exception *pe = (struct pending_exception *) context; + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int read_err, unsigned int write_err, void *context) +{ + struct pending_exception *pe = (struct pending_exception *) context; + struct dm_snapshot *s = pe->snap; + + if (read_err || write_err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store.commit_exception(&s->store, &pe->e, commit_callback, + pe); +} + +/* + * Dispatches the copy operation to kcopyd. + */ +static inline void start_copy(struct pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct io_region src, dest; + kdev_t dev = s->origin->dev; + int *sizes = blk_size[major(dev)]; + sector_t dev_size = (sector_t) -1; + + if (pe->started) + return; + + /* this is protected by snap->lock */ + pe->started = 1; + + if (sizes && sizes[minor(dev)]) + dev_size = sizes[minor(dev)] << 1; + + src.dev = dev; + src.sector = chunk_to_sector(s, pe->e.old_chunk); + src.count = min(s->chunk_size, dev_size - src.sector); + + dest.dev = s->cow->dev; + dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.count = src.count; + + /* Hand over to kcopyd */ + kcopyd_copy(s->kcopyd_client, + &src, 1, &dest, 0, copy_callback, pe); +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + */ +static struct pending_exception *find_pending_exception(struct dm_snapshot *s, + struct buffer_head *bh) +{ + struct exception *e; + struct pending_exception *pe; + chunk_t chunk = sector_to_chunk(s, bh->b_rsector); + + /* + * Is there a pending exception for this already ? 
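
Worth noting (illustration only, not part of the patch): start_copy() clamps each copy to min(chunk_size, dev_size - src.sector), so the final chunk of an origin whose size is not a whole number of chunks is never read past the end of the device. A stand-alone sketch of that clamping with made-up sizes:

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t min_sect(sector_t a, sector_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	sector_t chunk_size = 16;	/* sectors per chunk (power of 2) */
	sector_t dev_size = 100;	/* origin size in sectors, not chunk-aligned */
	sector_t chunk;

	for (chunk = 0; chunk * chunk_size < dev_size; chunk++) {
		sector_t src_sector = chunk * chunk_size;
		sector_t count = min_sect(chunk_size, dev_size - src_sector);

		/* the last chunk copies only the 4 sectors that exist */
		printf("chunk %llu: copy sectors %llu..%llu (%llu sectors)\n",
		       chunk, src_sector, src_sector + count - 1, count);
	}
	return 0;
}
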
+ */ + e = lookup_exception(&s->pending, chunk); + if (e) { + /* cast the exception to a pending exception */ + pe = list_entry(e, struct pending_exception, e); + + } else { + /* Create a new pending exception */ + pe = alloc_pending_exception(); + pe->e.old_chunk = chunk; + pe->origin_bhs = pe->snapshot_bhs = NULL; + INIT_LIST_HEAD(&pe->siblings); + pe->snap = s; + pe->started = 0; + + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + s->valid = 0; + return NULL; + } + + insert_exception(&s->pending, &pe->e); + } + + return pe; +} + +static inline void remap_exception(struct dm_snapshot *s, struct exception *e, + struct buffer_head *bh) +{ + bh->b_rdev = s->cow->dev; + bh->b_rsector = chunk_to_sector(s, e->new_chunk) + + (bh->b_rsector & s->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct exception *e; + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + int r = 1; + chunk_t chunk; + struct pending_exception *pe; + + chunk = sector_to_chunk(s, bh->b_rsector); + + /* Full snapshots are not usable */ + if (!s->valid) + return -1; + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writeable. + */ + if (rw == WRITE) { + + down_write(&s->lock); + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + + else { + pe = find_pending_exception(s, bh); + + if (!pe) { + s->store.drop_snapshot(&s->store); + s->valid = 0; + r = -EIO; + } else { + remap_exception(s, &pe->e, bh); + queue_buffer(&pe->snapshot_bhs, bh); + start_copy(pe); + r = 0; + } + } + + up_write(&s->lock); + + } else { + /* + * FIXME: this read path scares me because we + * always use the origin when we have a pending + * exception. However I can't think of a + * situation where this is wrong - ejt. + */ + + /* Do reads */ + down_read(&s->lock); + + /* See if it it has been remapped */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + else + bh->b_rdev = s->origin->dev; + + up_read(&s->lock); + } + + return r; +} + +void snapshot_resume(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + if (s->have_metadata) + return; + + if (s->store.read_metadata(&s->store)) { + down_write(&s->lock); + s->valid = 0; + up_write(&s->lock); + } + + s->have_metadata = 1; +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; + char cow[16]; + char org[16]; + + switch (type) { + case STATUSTYPE_INFO: + if (!snap->valid) + snprintf(result, maxlen, "Invalid"); + else { + if (snap->store.fraction_full) { + sector_t numerator, denominator; + snap->store.fraction_full(&snap->store, + &numerator, + &denominator); + snprintf(result, maxlen, + SECTOR_FORMAT "/" SECTOR_FORMAT, + numerator, denominator); + } + else + snprintf(result, maxlen, "Unknown"); + } + break; + + case STATUSTYPE_TABLE: + /* + * kdevname returns a static pointer so we need + * to make private copies if the output is to + * make sense. 
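
A quick user-space check of the remapping arithmetic used by remap_exception() above (illustration only; the chunk numbers are invented): the offset within the chunk is kept via b_rsector & chunk_mask and only the chunk base changes, which is exactly why the constructor insists on a power-of-two chunk size.

#include <stdio.h>
#include <assert.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t chunk_size = 16;		/* sectors, power of 2 */
	sector_t chunk_mask = chunk_size - 1;
	unsigned chunk_shift = 4;		/* log2(chunk_size) */

	sector_t rsector = 1000;		/* original sector on the origin */
	sector_t old_chunk = rsector >> chunk_shift;	/* as sector_to_chunk() */
	sector_t new_chunk = 3;			/* where the COW store put it */

	/* remap_exception(): new chunk base + offset within the chunk */
	sector_t remapped = (new_chunk << chunk_shift) + (rsector & chunk_mask);

	assert(old_chunk == 62);		/* 1000 = 62*16 + 8 */
	assert((rsector & chunk_mask) == 8);
	printf("origin sector %llu (chunk %llu) -> cow sector %llu\n",
	       rsector, old_chunk, remapped);	/* 3*16 + 8 = 56 */
	return 0;
}
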
+ */ + strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow)); + strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org)); + snprintf(result, maxlen, "%s %s %c %ld", org, cow, + snap->type, snap->chunk_size); + break; + } + + return 0; +} + +/*----------------------------------------------------------------- + * Origin methods + *---------------------------------------------------------------*/ +static void list_merge(struct list_head *l1, struct list_head *l2) +{ + struct list_head *l1_n, *l2_p; + + l1_n = l1->next; + l2_p = l2->prev; + + l1->next = l2; + l2->prev = l1; + + l2_p->next = l1_n; + l1_n->prev = l2_p; +} + +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh) +{ + int r = 1, first = 1; + struct list_head *sl; + struct dm_snapshot *snap; + struct exception *e; + struct pending_exception *pe, *last = NULL; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each(sl, snapshots) { + snap = list_entry(sl, struct dm_snapshot, list); + + /* Only deal with valid snapshots */ + if (!snap->valid) + continue; + + down_write(&snap->lock); + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap, bh->b_rsector); + + /* + * Check exception table to see if block + * is already remapped in this snapshot + * and trigger an exception if not. + */ + e = lookup_exception(&snap->complete, chunk); + if (!e) { + pe = find_pending_exception(snap, bh); + if (!pe) { + snap->store.drop_snapshot(&snap->store); + snap->valid = 0; + + } else { + if (last) + list_merge(&pe->siblings, + &last->siblings); + + last = pe; + r = 0; + } + } + + up_write(&snap->lock); + } + + /* + * Now that we have a complete pe list we can start the copying. + */ + if (last) { + pe = last; + do { + down_write(&pe->snap->lock); + if (first) + queue_buffer(&pe->origin_bhs, bh); + start_copy(pe); + up_write(&pe->snap->lock); + first = 0; + pe = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + } while (pe != last); + } + + return r; +} + +/* + * Called on a write from the origin driver. + */ +int do_origin(struct dm_dev *origin, struct buffer_head *bh) +{ + struct origin *o; + int r; + + down_read(&_origins_lock); + o = __lookup_origin(origin->dev); + if (!o) + BUG(); + + r = __origin_write(&o->snapshots, bh); + up_read(&_origins_lock); + + return r; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. + */ +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc != 1) { + ti->error = "dm-origin: incorrect number of arguments"; + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + ti->error = "Cannot get target device"; + return r; + } + + ti->private = dev; + return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + bh->b_rdev = dev->dev; + + /* Only tell snapshots if this is a write */ + return (rw == WRITE) ? 
do_origin(dev, bh) : 1; +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, + unsigned int maxlen) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s", dm_kdevname(dev->dev)); + break; + } + + return 0; +} + +static struct target_type origin_target = { + name: "snapshot-origin", + module: THIS_MODULE, + ctr: origin_ctr, + dtr: origin_dtr, + map: origin_map, + status: origin_status, +}; + +static struct target_type snapshot_target = { + name: "snapshot", + module: THIS_MODULE, + ctr: snapshot_ctr, + dtr: snapshot_dtr, + map: snapshot_map, + resume: snapshot_resume, + status: snapshot_status, +}; + +int __init dm_snapshot_init(void) +{ + int r; + + r = dm_register_target(&snapshot_target); + if (r) { + DMERR("snapshot target register failed %d", r); + return r; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Device mapper: Origin: register failed %d\n", r); + goto bad1; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad2; + } + + exception_cache = kmem_cache_create("dm-snapshot-ex", + sizeof(struct exception), + __alignof__(struct exception), + 0, NULL, NULL); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad3; + } + + pending_cache = + kmem_cache_create("dm-snapshot-in", + sizeof(struct pending_exception), + __alignof__(struct pending_exception), + 0, NULL, NULL); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad4; + } + + pending_pool = mempool_create(128, mempool_alloc_slab, + mempool_free_slab, pending_cache); + if (!pending_pool) { + DMERR("Couldn't create pending pool."); + r = -ENOMEM; + goto bad5; + } + + return 0; + + bad5: + kmem_cache_destroy(pending_cache); + bad4: + kmem_cache_destroy(exception_cache); + bad3: + exit_origin_hash(); + bad2: + dm_unregister_target(&origin_target); + bad1: + dm_unregister_target(&snapshot_target); + return r; +} + +void dm_snapshot_exit(void) +{ + int r; + + r = dm_unregister_target(&snapshot_target); + if (r) + DMERR("snapshot unregister failed %d", r); + + r = dm_unregister_target(&origin_target); + if (r) + DMERR("origin unregister failed %d", r); + + exit_origin_hash(); + mempool_destroy(pending_pool); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); +} --- diff/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-snapshot.h 2003-08-26 14:21:39.000000000 +0100 @@ -0,0 +1,158 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_SNAPSHOT_H +#define DM_SNAPSHOT_H + +#include "dm.h" +#include + +struct exception_table { + uint32_t hash_mask; + struct list_head *table; +}; + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +/* FIXME: can we get away with limiting these to a uint32_t ? */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + */ +struct exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct exception_store { + + /* + * Destroys this object when you've finished with it. 
+ */ + void (*destroy) (struct exception_store *store); + + /* + * The target shouldn't read the COW device until this is + * called. + */ + int (*read_metadata) (struct exception_store *store); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct exception_store *store, + struct exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct exception_store *store); + + /* + * Return how full the snapshot is. + */ + void (*fraction_full) (struct exception_store *store, + sector_t *numerator, + sector_t *denominator); + + struct dm_snapshot *snap; + void *context; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + struct dm_table *table; + + struct dm_dev *origin; + struct dm_dev *cow; + + /* List of snapshots per Origin */ + struct list_head list; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + int have_metadata; + + /* Used for display of table */ + char type; + + /* The last percentage we notified */ + int last_percent; + + struct exception_table pending; + struct exception_table complete; + + /* The on disk metadata handler */ + struct exception_store store; + + struct kcopyd_client *kcopyd_client; +}; + +/* + * Used by the exception stores to load exceptions hen + * initialising. + */ +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); + +/* + * Constructor and destructor for the default persistent + * store. + */ +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize); + +/* + * Return the number of sectors in the device. + */ +static inline sector_t get_dev_size(kdev_t dev) +{ + int *sizes; + + sizes = blk_size[MAJOR(dev)]; + if (sizes) + return sizes[MINOR(dev)] << 1; + + return 0; +} + +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) +{ + return (sector & ~s->chunk_mask) >> s->chunk_shift; +} + +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) +{ + return chunk << s->chunk_shift; +} + +#endif --- diff/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-stripe.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; +}; + +struct stripe_c { + uint32_t stripes; + + /* The size of this target / num. 
stripes */ + uint32_t stripe_width; + + /* stripe chunk size */ + uint32_t chunk_shift; + sector_t chunk_mask; + + struct stripe stripe[0]; +}; + +static inline struct stripe_c *alloc_context(unsigned int stripes) +{ + size_t len; + + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) + return NULL; + + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + + return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + unsigned int stripe, char **argv) +{ + sector_t start; + + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) + return -EINVAL; + + if (dm_get_device(ti, argv[0], start, sc->stripe_width, + dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev)) + return -ENXIO; + + sc->stripe[stripe].physical_start = start; + return 0; +} + +/* + * FIXME: Nasty function, only present because we can't link + * against __moddi3 and __divdi3. + * + * returns a == b * n + */ +static int multiple(sector_t a, sector_t b, sector_t *n) +{ + sector_t acc, prev, i; + + *n = 0; + while (a >= b) { + for (acc = b, prev = 0, i = 1; + acc <= a; + prev = acc, acc <<= 1, i <<= 1) + ; + + a -= prev; + *n += i >> 1; + } + + return a == 0; +} + +/* + * Construct a striped mapping. + * [ ]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width; + uint32_t stripes; + uint32_t chunk_size; + char *end; + int r; + unsigned int i; + + if (argc < 2) { + ti->error = "dm-stripe: Not enough arguments"; + return -EINVAL; + } + + stripes = simple_strtoul(argv[0], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid stripe count"; + return -EINVAL; + } + + chunk_size = simple_strtoul(argv[1], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid chunk_size"; + return -EINVAL; + } + + /* + * chunk_size is a power of two + */ + if (!chunk_size || (chunk_size & (chunk_size - 1))) { + ti->error = "dm-stripe: Invalid chunk size"; + return -EINVAL; + } + + if (!multiple(ti->len, stripes, &width)) { + ti->error = "dm-stripe: Target length not divisable by " + "number of stripes"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? + */ + if (argc != (2 + 2 * stripes)) { + ti->error = "dm-stripe: Not enough destinations specified"; + return -EINVAL; + } + + sc = alloc_context(stripes); + if (!sc) { + ti->error = "dm-stripe: Memory allocation for striped context " + "failed"; + return -ENOMEM; + } + + sc->stripes = stripes; + sc->stripe_width = width; + + sc->chunk_mask = ((sector_t) chunk_size) - 1; + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) + chunk_size >>= 1; + sc->chunk_shift--; + + /* + * Get the stripe destinations. 
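
As an aside, multiple() above exists because 2.4 cannot pull in __divdi3/__moddi3 for 64-bit sector_t, so it computes the quotient by repeated doubling and reports whether the division was exact. A user-space copy of the routine, checked against ordinary / and % (test values are arbitrary; b must be non-zero):

#include <stdio.h>
#include <assert.h>

typedef unsigned long long sector_t;

/* Same algorithm as dm-stripe's multiple(): *n = a / b, returns (a % b == 0) */
static int multiple(sector_t a, sector_t b, sector_t *n)
{
	sector_t acc, prev, i;

	*n = 0;
	while (a >= b) {
		/* find the largest b * 2^k that still fits in a */
		for (acc = b, prev = 0, i = 1; acc <= a;
		     prev = acc, acc <<= 1, i <<= 1)
			;
		a -= prev;
		*n += i >> 1;
	}
	return a == 0;
}

int main(void)
{
	sector_t cases[][2] = { {1000, 8}, {1000, 7}, {12345678901ULL, 977} };
	unsigned i;

	for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		sector_t a = cases[i][0], b = cases[i][1], q;
		int exact = multiple(a, b, &q);

		assert(q == a / b && exact == (a % b == 0));
		printf("%llu / %llu = %llu%s\n", a, b, q, exact ? " (exact)" : "");
	}
	return 0;
}
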
+ */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "dm-stripe: Couldn't parse stripe " + "destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + } + + ti->private = sc; + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + kfree(sc); +} + +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *context) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + + sector_t offset = bh->b_rsector - ti->begin; + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ + chunk = chunk / sc->stripes; + + bh->b_rdev = sc->stripe[stripe].dev->dev; + bh->b_rsector = sc->stripe[stripe].physical_start + + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return 1; +} + +static int stripe_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + int offset; + unsigned int i; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT, + sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) { + offset += + snprintf(result + offset, maxlen - offset, + " %s " SECTOR_FORMAT, + dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)), + sc->stripe[i].physical_start); + } + break; + } + return 0; +} + +static struct target_type stripe_target = { + .name = "striped", + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .status = stripe_status, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) + DMWARN("striped target registration failed"); + + return r; +} + +void dm_stripe_exit(void) +{ + if (dm_unregister_target(&stripe_target)) + DMWARN("striped target unregistration failed"); + + return; +} --- diff/drivers/md/dm-table.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-table.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,687 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { + atomic_t holders; + + /* btree table */ + unsigned int depth; + unsigned int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. 
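
To make the mapping in stripe_map() above concrete, here is the same arithmetic as a stand-alone program (illustration only; the geometry of 3 stripes and 8-sector chunks is invented). A target-relative sector is split into chunk number, stripe index and offset-within-chunk using just a shift, a 32-bit modulus and a mask, which is why the constructor requires a power-of-two chunk size:

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	unsigned stripes = 3;
	unsigned chunk_shift = 3;			/* 8-sector chunks */
	sector_t chunk_mask = (1ULL << chunk_shift) - 1;
	sector_t sectors[] = { 0, 7, 8, 25, 100 };
	unsigned i;

	for (i = 0; i < sizeof(sectors) / sizeof(sectors[0]); i++) {
		sector_t offset = sectors[i];		/* already target-relative */
		unsigned chunk = (unsigned) (offset >> chunk_shift);
		unsigned stripe = chunk % stripes;	/* 32-bit modulus */
		unsigned chunk_in_stripe = chunk / stripes;

		/* sector within the chosen stripe (physical_start not added) */
		sector_t dev_sector = ((sector_t) chunk_in_stripe << chunk_shift)
				      + (offset & chunk_mask);

		printf("sector %3llu -> stripe %u, device sector %llu\n",
		       offset, stripe, dev_sector);
	}
	return 0;
}
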
+ */ + int mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned long n, unsigned long base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, unsigned int l, + unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, unsigned int num) +{ + sector_t *n_highs; + struct dm_target *n_targets; + int n = t->num_targets; + + /* + * Allocate both the target array and offset array at once. 
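
For illustration (not part of the patch): the lookup structure above is an implicit, array-backed n-ary tree where node n's k'th child sits at index n*CHILDREN_PER_NODE + k, and the level sizes fall out of repeated divide-and-round-up. A rough user-space sketch of the sizing, with KEYS_PER_NODE fixed at 8 purely for the example (the real value depends on L1_CACHE_BYTES):

#include <stdio.h>

#define KEYS_PER_NODE 8			/* assumed for the example */
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

static unsigned div_up(unsigned n, unsigned d)	/* like dm_div_up() */
{
	return (n + d - 1) / d;
}

static unsigned int_log(unsigned n, unsigned base)	/* ~ceil(log_base(n)) */
{
	unsigned result = 0;

	while (n > 1) {
		n = div_up(n, base);
		result++;
	}
	return result;
}

int main(void)
{
	unsigned num_targets = 1000;
	unsigned leaf_nodes = div_up(num_targets, KEYS_PER_NODE);
	unsigned depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
	unsigned counts[16];
	int l;

	counts[depth - 1] = leaf_nodes;
	for (l = (int) depth - 2; l >= 0; l--)	/* bottom-up, as setup_indexes() */
		counts[l] = div_up(counts[l + 1], CHILDREN_PER_NODE);

	printf("%u targets -> depth %u\n", num_targets, depth);
	for (l = 0; l < (int) depth; l++)
		printf("level %d: %u nodes\n", l, counts[l]);
	return 0;
}
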
+ */ + n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) + + sizeof(sector_t), num); + if (!n_highs) + return -ENOMEM; + + n_targets = (struct dm_target *) (n_highs + num); + + if (n) { + memcpy(n_highs, t->highs, sizeof(*n_highs) * n); + memcpy(n_targets, t->targets, sizeof(*n_targets) * n); + } + + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + vfree(t->highs); + + t->num_allocated = num; + t->highs = n_highs; + t->targets = n_targets; + + return 0; +} + +int dm_table_create(struct dm_table **result, int mode) +{ + struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO); + + if (!t) + return -ENOMEM; + + memset(t, 0, sizeof(*t)); + INIT_LIST_HEAD(&t->devices); + atomic_set(&t->holders, 1); + + /* allocate a single nodes worth of targets to begin with */ + if (alloc_targets(t, KEYS_PER_NODE)) { + kfree(t); + t = NULL; + return -ENOMEM; + } + + t->mode = mode; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + for (tmp = devices->next; tmp != devices; tmp = next) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + next = tmp->next; + kfree(dd); + } +} + +void table_destroy(struct dm_table *t) +{ + unsigned int i; + + /* free the indexes (see dm_table_complete) */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + if (t->devices.next != &t->devices) { + DMWARN("devices still present during destroy: " + "dm_table_remove_device calls missing"); + + free_devices(&t->devices); + } + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ + if (atomic_dec_and_test(&t->holders)) + table_destroy(t); +} + +/* + * Checks to see if we need to extend highs or targets. + */ +static inline int check_space(struct dm_table *t) +{ + if (t->num_targets >= t->num_allocated) + return alloc_targets(t, t->num_allocated * 2); + + return 0; +} + +/* + * Convert a device path to a dev_t. + */ +static int lookup_device(const char *path, kdev_t *dev) +{ + int r; + struct nameidata nd; + struct inode *inode; + + if (!path_init(path, LOOKUP_FOLLOW, &nd)) + return 0; + + if ((r = path_walk(path, &nd))) + goto out; + + inode = nd.dentry->d_inode; + if (!inode) { + r = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + r = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + + out: + path_release(&nd); + return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, kdev_t dev) +{ + struct list_head *tmp; + + list_for_each(tmp, l) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + if (kdev_same(dd->dev, dev)) + return dd; + } + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *dd) +{ + if (dd->bdev) + BUG(); + + dd->bdev = bdget(kdev_t_to_nr(dd->dev)); + if (!dd->bdev) + return -ENOMEM; + + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW); +} + +/* + * Close a device that we've been using. + */ +static void close_dev(struct dm_dev *dd) +{ + if (!dd->bdev) + return; + + blkdev_put(dd->bdev, BDEV_RAW); + dd->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. 
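
A small aside (illustration only): alloc_targets() above grabs highs[] and targets[] with one vcalloc and carves the second array out of the tail of the first, so growing the table is a single allocation with a single failure path. The same trick in ordinary user-space C, with calloc standing in for vcalloc and a cut-down struct target:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned long long sector_t;

struct target {				/* stand-in for struct dm_target */
	sector_t begin;
	sector_t len;
};

int main(void)
{
	unsigned num = 16;

	/* one block holds both arrays, exactly as alloc_targets() does */
	sector_t *highs = calloc(num, sizeof(sector_t) + sizeof(struct target));
	struct target *targets;

	if (!highs)
		return 1;

	targets = (struct target *) (highs + num);	/* second array after the first */
	memset(highs, -1, num * sizeof(*highs));	/* "no key yet" markers */

	targets[3].begin = 1024;
	targets[3].len = 2048;
	printf("highs=%p targets=%p, target 3 = [%llu, +%llu)\n",
	       (void *) highs, (void *) targets, targets[3].begin, targets[3].len);

	free(highs);					/* one free releases both */
	return 0;
}
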
+ */ +static int check_device_area(kdev_t dev, sector_t start, sector_t len) +{ + int *sizes; + sector_t dev_size; + + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) + /* we don't know the device details, + * so give the benefit of the doubt */ + return 1; + + /* convert to 512-byte sectors */ + dev_size <<= 1; + + return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev. Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ + int r; + struct dm_dev dd_copy; + + memcpy(&dd_copy, dd, sizeof(dd_copy)); + + dd->mode |= new_mode; + dd->bdev = NULL; + r = open_dev(dd); + if (!r) + close_dev(&dd_copy); + else + memcpy(dd, &dd_copy, sizeof(dd_copy)); + + return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r; + kdev_t dev; + struct dm_dev *dd; + unsigned major, minor; + struct dm_table *t = ti->table; + + if (!t) + BUG(); + + if (sscanf(path, "%u:%u", &major, &minor) == 2) { + /* Extract the major/minor numbers */ + dev = mk_kdev(major, minor); + } else { + /* convert the path to a device */ + if ((r = lookup_device(path, &dev))) + return r; + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->dev = dev; + dd->mode = mode; + dd->bdev = NULL; + + if ((r = open_dev(dd))) { + kfree(dd); + return r; + } + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->mode != (mode | dd->mode)) { + r = upgrade_mode(dd, mode); + if (r) + return r; + } + atomic_inc(&dd->count); + + if (!check_device_area(dd->dev, start, len)) { + DMWARN("device %s too small for target", path); + dm_put_device(ti, dd); + return -EINVAL; + } + + *result = dd; + + return 0; +} + +/* + * Decrement a devices use count and remove it if neccessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd); + list_del(&dd->list); + kfree(dd); + } +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +static int split_args(int max, int *argc, char **argv, char *input) +{ + char *start, *end = input, *out; + *argc = 0; + + while (1) { + start = end; + + /* Skip whitespace */ + while (*start && isspace(*start)) + start++; + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? 
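
Presumably check_device_area() is written as two comparisons (start < dev_size, then len <= dev_size - start) because that form cannot wrap, whereas the more obvious start + len <= dev_size can overflow and wrongly accept a bogus area. A contrived 32-bit demonstration (illustration only, not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* the form used by check_device_area(): cannot wrap */
static int safe_check(uint32_t start, uint32_t len, uint32_t dev_size)
{
	return start < dev_size && len <= dev_size - start;
}

/* the "obvious" form: start + len can overflow and wrongly succeed */
static int naive_check(uint32_t start, uint32_t len, uint32_t dev_size)
{
	return start + len <= dev_size;
}

int main(void)
{
	uint32_t dev_size = 1000;

	/* in-range area: both agree */
	printf("ok area:    safe=%d naive=%d\n",
	       safe_check(100, 200, dev_size), naive_check(100, 200, dev_size));

	/* start way past the end; start + len wraps around 2^32 */
	printf("bogus area: safe=%d naive=%d\n",
	       safe_check(4294967000u, 400, dev_size),
	       naive_check(4294967000u, 400, dev_size));
	return 0;
}
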
*/ + if ((*argc + 1) > max) + return -EINVAL; + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char *argv[32]; + struct dm_target *tgt; + + if ((r = check_space(t))) + return r; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + tgt->error = "unknown target type"; + return -EINVAL; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = split_args(ARRAY_SIZE(argv), &argc, argv, params); + if (r) { + tgt->error = "couldn't split parameters"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + return 0; + + bad: + printk(KERN_ERR DM_NAME ": %s\n", tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2, total = 0; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED; +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + spin_lock_irq(&_event_lock); + t->event_fn = fn; + t->event_context = context; + spin_unlock_irq(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + spin_lock(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + spin_unlock(&_event_lock); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ + if (index > t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. 
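
For reference (illustration only): split_args() tokenizes the parameter string in place, with whitespace separating arguments and a backslash escaping the next character so spaces can appear inside an argument. A user-space rendition of the same loop with a fixed-size argv and an invented table line as input:

#include <stdio.h>
#include <ctype.h>

/* In-place tokenizer following the split_args() logic above. */
static int split_args(int max, int *argc, char **argv, char *input)
{
	char *start, *end = input, *out;

	*argc = 0;
	while (1) {
		start = end;
		while (*start && isspace((unsigned char) *start))
			start++;
		if (!*start)
			break;				/* hit the end of the string */

		end = out = start;
		while (*end) {
			if (*end == '\\' && *(end + 1)) {	/* backslash escape */
				*out++ = *(end + 1);
				end += 2;
				continue;
			}
			if (isspace((unsigned char) *end))
				break;			/* end of this token */
			*out++ = *end++;
		}

		if (*argc + 1 > max)
			return -1;
		if (*end)
			end++;				/* step over the separator */
		*out = '\0';
		argv[(*argc)++] = start;
	}
	return 0;
}

int main(void)
{
	char params[] = "0 2048 linear /dev/hda\\ 1 0";	/* escaped space */
	char *argv[8];
	int argc, i;

	if (split_args(8, &argc, argv, params))
		return 1;
	for (i = 0; i < argc; i++)
		printf("argv[%d] = '%s'\n", i, argv[i]);
	return 0;
}
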
+ */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} + +void dm_table_suspend_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->suspend) + ti->type->suspend(ti); + } +} + +void dm_table_resume_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->resume) + ti->type->resume(ti); + } +} + +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); --- diff/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm-target.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include + +struct tt_internal { + struct target_type tt; + + struct list_head list; + long use; +}; + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct tt_internal *__find_target_type(const char *name) +{ + struct list_head *tih; + struct tt_internal *ti; + + list_for_each(tih, &_targets) { + ti = list_entry(tih, struct tt_internal, list); + + if (!strcmp(name, ti->tt.name)) + return ti; + } + + return NULL; +} + +static struct tt_internal *get_target_type(const char *name) +{ + struct tt_internal *ti; + + down_read(&_lock); + ti = __find_target_type(name); + + if (ti) { + if (ti->use == 0 && ti->tt.module) + __MOD_INC_USE_COUNT(ti->tt.module); + ti->use++; + } + up_read(&_lock); + + return ti; +} + +static void load_module(const char *name) +{ + char module_name[DM_MOD_NAME_SIZE] = "dm-"; + + /* Length check for strcat() below */ + if (strlen(name) > (DM_MOD_NAME_SIZE - 4)) + return; + + strcat(module_name, name); + request_module(module_name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct tt_internal *ti = get_target_type(name); + + if (!ti) { + load_module(name); + ti = get_target_type(name); + } + + return ti ? 
&ti->tt : NULL; +} + +void dm_put_target_type(struct target_type *t) +{ + struct tt_internal *ti = (struct tt_internal *) t; + + down_read(&_lock); + if (--ti->use == 0 && ti->tt.module) + __MOD_DEC_USE_COUNT(ti->tt.module); + + if (ti->use < 0) + BUG(); + up_read(&_lock); + + return; +} + +static struct tt_internal *alloc_target(struct target_type *t) +{ + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (ti) { + memset(ti, 0, sizeof(*ti)); + ti->tt = *t; + } + + return ti; +} + +int dm_register_target(struct target_type *t) +{ + int rv = 0; + struct tt_internal *ti = alloc_target(t); + + if (!ti) + return -ENOMEM; + + down_write(&_lock); + if (__find_target_type(t->name)) { + kfree(ti); + rv = -EEXIST; + } else + list_add(&ti->list, &_targets); + + up_write(&_lock); + return rv; +} + +int dm_unregister_target(struct target_type *t) +{ + struct tt_internal *ti; + + down_write(&_lock); + if (!(ti = __find_target_type(t->name))) { + up_write(&_lock); + return -EINVAL; + } + + if (ti->use) { + up_write(&_lock); + return -ETXTBSY; + } + + list_del(&ti->list); + kfree(ti); + + up_write(&_lock); + return 0; +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +{ + return 0; +} + +static void io_err_dtr(struct dm_target *ti) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, +}; + +int dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + if (dm_unregister_target(&error_target)) + DMWARN("error target unregistration failed"); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); --- diff/drivers/md/dm.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,1115 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" +#include "kcopyd.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char *_name = DM_NAME; +#define DEFAULT_READ_AHEAD 64 + +struct dm_io { + struct mapped_device *md; + + struct dm_target *ti; + int rw; + union map_info map_context; + void (*end_io) (struct buffer_head * bh, int uptodate); + void *context; +}; + +struct deferred_io { + int rw; + struct buffer_head *bh; + struct deferred_io *next; +}; + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO 0 +#define DMF_SUSPENDED 1 + +struct mapped_device { + struct rw_semaphore lock; + atomic_t holders; + + kdev_t dev; + unsigned long flags; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending; + wait_queue_head_t wait; + struct deferred_io *deferred; + + /* + * The current mapping. + */ + struct dm_table *map; + + /* + * io objects are allocated from here. + */ + mempool_t *io_pool; + + /* + * Event handling. 
+ */ + uint32_t event_nr; + wait_queue_head_t eventq; +}; + +#define MIN_IOS 256 +static kmem_cache_t *_io_cache; + +static struct mapped_device *get_kdev(kdev_t dev); +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh); +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb); + +/*----------------------------------------------------------------- + * In order to avoid the 256 minor number limit we are going to + * register more major numbers as neccessary. + *---------------------------------------------------------------*/ +#define MAX_MINORS (1 << MINORBITS) + +struct major_details { + unsigned int major; + + int transient; + struct list_head transient_list; + + unsigned int first_free_minor; + int nr_free_minors; + + struct mapped_device *mds[MAX_MINORS]; + int blk_size[MAX_MINORS]; + int blksize_size[MAX_MINORS]; + int hardsect_size[MAX_MINORS]; +}; + +static struct rw_semaphore _dev_lock; +static struct major_details *_majors[MAX_BLKDEV]; + +/* + * This holds a list of majors that non-specified device numbers + * may be allocated from. Only majors with free minors appear on + * this list. + */ +static LIST_HEAD(_transients_free); + +static int __alloc_major(unsigned int major, struct major_details **result) +{ + int r; + unsigned int transient = !major; + struct major_details *maj; + + /* Major already allocated? */ + if (major && _majors[major]) + return 0; + + maj = kmalloc(sizeof(*maj), GFP_KERNEL); + if (!maj) + return -ENOMEM; + + memset(maj, 0, sizeof(*maj)); + INIT_LIST_HEAD(&maj->transient_list); + + maj->nr_free_minors = MAX_MINORS; + + r = register_blkdev(major, _name, &dm_blk_dops); + if (r < 0) { + DMERR("register_blkdev failed for %d", major); + kfree(maj); + return r; + } + if (r > 0) + major = r; + + maj->major = major; + + if (transient) { + maj->transient = transient; + list_add_tail(&maj->transient_list, &_transients_free); + } + + _majors[major] = maj; + + blk_size[major] = maj->blk_size; + blksize_size[major] = maj->blksize_size; + hardsect_size[major] = maj->hardsect_size; + read_ahead[major] = DEFAULT_READ_AHEAD; + + blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request); + + *result = maj; + return 0; +} + +static void __free_major(struct major_details *maj) +{ + unsigned int major = maj->major; + + list_del(&maj->transient_list); + + read_ahead[major] = 0; + blk_size[major] = NULL; + blksize_size[major] = NULL; + hardsect_size[major] = NULL; + + _majors[major] = NULL; + kfree(maj); + + if (unregister_blkdev(major, _name) < 0) + DMERR("devfs_unregister_blkdev failed"); +} + +static void free_all_majors(void) +{ + unsigned int major = ARRAY_SIZE(_majors); + + down_write(&_dev_lock); + + while (major--) + if (_majors[major]) + __free_major(_majors[major]); + + up_write(&_dev_lock); +} + +static void free_dev(kdev_t dev) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + down_write(&_dev_lock); + + maj = _majors[major]; + if (!maj) + goto out; + + maj->mds[minor] = NULL; + maj->nr_free_minors++; + + if (maj->nr_free_minors == MAX_MINORS) { + __free_major(maj); + goto out; + } + + if (!maj->transient) + goto out; + + if (maj->nr_free_minors == 1) + list_add_tail(&maj->transient_list, &_transients_free); + + if (minor < maj->first_free_minor) + maj->first_free_minor = minor; + + out: + up_write(&_dev_lock); +} + +static void __alloc_minor(struct major_details *maj, unsigned int minor, + struct mapped_device *md) +{ + maj->mds[minor] = md; + md->dev = mk_kdev(maj->major, 
minor); + maj->nr_free_minors--; + + if (maj->transient && !maj->nr_free_minors) + list_del_init(&maj->transient_list); +} + +/* + * See if requested kdev_t is available. + */ +static int specific_dev(kdev_t dev, struct mapped_device *md) +{ + int r = 0; + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) { + DMWARN("device number requested out of range (%d, %d)", + major, minor); + return -EINVAL; + } + + down_write(&_dev_lock); + maj = _majors[major]; + + /* Register requested major? */ + if (!maj) { + r = __alloc_major(major, &maj); + if (r) + goto out; + + major = maj->major; + } + + if (maj->mds[minor]) { + r = -EBUSY; + goto out; + } + + __alloc_minor(maj, minor, md); + + out: + up_write(&_dev_lock); + + return r; +} + +/* + * Find first unused device number, requesting a new major number if required. + */ +static int first_free_dev(struct mapped_device *md) +{ + int r = 0; + struct major_details *maj; + + down_write(&_dev_lock); + + if (list_empty(&_transients_free)) { + r = __alloc_major(0, &maj); + if (r) + goto out; + } else + maj = list_entry(_transients_free.next, struct major_details, + transient_list); + + while (maj->mds[maj->first_free_minor++]) + ; + + __alloc_minor(maj, maj->first_free_minor - 1, md); + + out: + up_write(&_dev_lock); + + return r; +} + +static struct mapped_device *get_kdev(kdev_t dev) +{ + struct mapped_device *md; + struct major_details *maj; + + down_read(&_dev_lock); + maj = _majors[major(dev)]; + if (!maj) { + md = NULL; + goto out; + } + md = maj->mds[minor(dev)]; + if (md) + dm_get(md); + out: + up_read(&_dev_lock); + + return md; +} + +/*----------------------------------------------------------------- + * init/exit code + *---------------------------------------------------------------*/ + +static __init int local_init(void) +{ + init_rwsem(&_dev_lock); + + /* allocate a slab for the dm_ios */ + _io_cache = kmem_cache_create("dm io", + sizeof(struct dm_io), 0, 0, NULL, NULL); + + if (!_io_cache) + return -ENOMEM; + + return 0; +} + +static void local_exit(void) +{ + kmem_cache_destroy(_io_cache); + free_all_majors(); + + DMINFO("cleaned up"); +} + +/* + * We have a lot of init/exit functions, so it seems easier to + * store them in an array. The disposable macro 'xx' + * expands a prefix into a pair of function names. 
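
An illustrative sketch (not part of the patch) of the init/exit-array pattern described in the comment above: the token-pasting macro xx() builds an array of {init, exit} pairs, initialisation walks it forwards, and failure unwinds the already-initialised entries in reverse. The subsystems a and b here are hypothetical:

#include <stdio.h>

static int a_init(void)  { printf("a_init\n");  return 0; }
static void a_exit(void) { printf("a_exit\n"); }
static int b_init(void)  { printf("b_init\n");  return -1; }	/* simulate failure */
static void b_exit(void) { printf("b_exit\n"); }

static struct {
	int (*init)(void);
	void (*exit)(void);
} _inits[] = {
#define xx(n) { n ## _init, n ## _exit },
	xx(a)
	xx(b)
#undef xx
};

int main(void)
{
	const int count = sizeof(_inits) / sizeof(_inits[0]);
	int r = 0, i;

	for (i = 0; i < count; i++) {
		r = _inits[i].init();
		if (r)
			break;
	}

	if (r)				/* roll back the ones that succeeded */
		while (i--)
			_inits[i].exit();

	return r ? 1 : 0;
}
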
+ */ +static struct { + int (*init) (void); + void (*exit) (void); + +} _inits[] = { +#define xx(n) {n ## _init, n ## _exit}, + xx(local) + xx(kcopyd) + xx(dm_target) + xx(dm_linear) + xx(dm_stripe) + xx(dm_snapshot) + xx(dm_interface) +#undef xx +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i].init(); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _inits[i].exit(); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_inits); + + while (i--) + _inits[i].exit(); +} + +/* + * Block device functions + */ +static int dm_blk_open(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + return 0; +} + +static int dm_blk_close(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + dm_put(md); /* put the reference gained by dm_blk_open */ + dm_put(md); + return 0; +} + +static inline struct dm_io *alloc_io(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static inline void free_io(struct mapped_device *md, struct dm_io *io) +{ + mempool_free(io, md->io_pool); +} + +static inline struct deferred_io *alloc_deferred(void) +{ + return kmalloc(sizeof(struct deferred_io), GFP_NOIO); +} + +static inline void free_deferred(struct deferred_io *di) +{ + kfree(di); +} + +static inline sector_t volume_size(kdev_t dev) +{ + return blk_size[major(dev)][minor(dev)] << 1; +} + +/* FIXME: check this */ +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int command, unsigned long a) +{ + kdev_t dev = inode->i_rdev; + long size; + + switch (command) { + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKFLSBUF: + case BLKSSZGET: + //case BLKRRPART: /* Re-read partition tables */ + //case BLKPG: + case BLKELVGET: + case BLKELVSET: + case BLKBSZGET: + case BLKBSZSET: + return blk_ioctl(dev, command, a); + break; + + case BLKGETSIZE: + size = volume_size(dev); + if (copy_to_user((void *) a, &size, sizeof(long))) + return -EFAULT; + break; + + case BLKGETSIZE64: + size = volume_size(dev); + if (put_user((u64) ((u64) size) << 9, (u64 *) a)) + return -EFAULT; + break; + + case BLKRRPART: + return -ENOTTY; + + case LV_BMAP: + return dm_user_bmap(inode, (struct lv_bmap *) a); + + default: + DMWARN("unknown block ioctl 0x%x", command); + return -ENOTTY; + } + + return 0; +} + +/* + * Add the buffer to the list of deferred io. + */ +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw) +{ + struct deferred_io *di; + + di = alloc_deferred(); + if (!di) + return -ENOMEM; + + down_write(&md->lock); + + if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + free_deferred(di); + return 1; + } + + di->bh = bh; + di->rw = rw; + di->next = md->deferred; + md->deferred = di; + + up_write(&md->lock); + return 0; /* deferred successfully */ +} + +/* + * bh->b_end_io routine that decrements the pending count + * and then calls the original bh->b_end_io fn. + */ +static void dec_pending(struct buffer_head *bh, int uptodate) +{ + int r; + struct dm_io *io = bh->b_private; + dm_endio_fn endio = io->ti->type->end_io; + + if (endio) { + r = endio(io->ti, bh, io->rw, uptodate ? 
0 : -EIO, + &io->map_context); + if (r < 0) + uptodate = 0; + + else if (r > 0) + /* the target wants another shot at the io */ + return; + } + + if (atomic_dec_and_test(&io->md->pending)) + /* nudge anyone waiting on suspend queue */ + wake_up(&io->md->wait); + + bh->b_end_io = io->end_io; + bh->b_private = io->context; + free_io(io->md, io); + + bh->b_end_io(bh, uptodate); +} + +/* + * Do the bh mapping for a given leaf + */ +static inline int __map_buffer(struct mapped_device *md, int rw, + struct buffer_head *bh, struct dm_io *io) +{ + struct dm_target *ti; + + if (!md->map) + return -EINVAL; + + ti = dm_table_find_target(md->map, bh->b_rsector); + if (!ti->type) + return -EINVAL; + + /* hook the end io request fn */ + atomic_inc(&md->pending); + io->md = md; + io->ti = ti; + io->rw = rw; + io->end_io = bh->b_end_io; + io->context = bh->b_private; + bh->b_end_io = dec_pending; + bh->b_private = io; + + return ti->type->map(ti, bh, rw, &io->map_context); +} + +/* + * Checks to see if we should be deferring io, if so it queues it + * and returns 1. + */ +static inline int __deferring(struct mapped_device *md, int rw, + struct buffer_head *bh) +{ + int r; + + /* + * If we're suspended we have to queue this io for later. + */ + while (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + + /* + * There's no point deferring a read ahead + * request, just drop it. + */ + if (rw == READA) { + down_read(&md->lock); + return -EIO; + } + + r = queue_io(md, bh, rw); + down_read(&md->lock); + + if (r < 0) + return r; + + if (r == 0) + return 1; /* deferred successfully */ + + } + + return 0; +} + +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh) +{ + int r; + struct dm_io *io; + struct mapped_device *md; + + md = get_kdev(bh->b_rdev); + if (!md) { + buffer_IO_error(bh); + return 0; + } + + io = alloc_io(md); + down_read(&md->lock); + + r = __deferring(md, rw, bh); + if (r < 0) + goto bad; + + else if (!r) { + /* not deferring */ + r = __map_buffer(md, rw, bh, io); + if (r < 0) + goto bad; + } else + r = 0; + + up_read(&md->lock); + dm_put(md); + return r; + + bad: + buffer_IO_error(bh); + up_read(&md->lock); + dm_put(md); + return 0; +} + +static int check_dev_size(kdev_t dev, unsigned long block) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + + /* FIXME: check this */ + unsigned long max_sector = (blk_size[major][minor] << 1) + 1; + unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9); + + return (sector > max_sector) ? 0 : 1; +} + +/* + * Creates a dummy buffer head and maps it (for lilo). 
+ */ +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block, + kdev_t *r_dev, unsigned long *r_block) +{ + struct buffer_head bh; + struct dm_target *ti; + union map_info map_context; + int r; + + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + return -EPERM; + } + + if (!check_dev_size(dev, block)) { + return -EINVAL; + } + + if (!md->map) + return -EINVAL; + + /* setup dummy bh */ + memset(&bh, 0, sizeof(bh)); + bh.b_blocknr = block; + bh.b_dev = bh.b_rdev = dev; + bh.b_size = blksize_size[major(dev)][minor(dev)]; + bh.b_rsector = block * (bh.b_size >> 9); + + /* find target */ + ti = dm_table_find_target(md->map, bh.b_rsector); + + /* do the mapping */ + r = ti->type->map(ti, &bh, READ, &map_context); + ti->type->end_io(ti, &bh, READ, 0, &map_context); + + if (!r) { + *r_dev = bh.b_rdev; + *r_block = bh.b_rsector / (bh.b_size >> 9); + } + + return r; +} + +/* + * Marshals arguments and results between user and kernel space. + */ +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb) +{ + struct mapped_device *md; + unsigned long block, r_block; + kdev_t r_dev; + int r; + + if (get_user(block, &lvb->lv_block)) + return -EFAULT; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + down_read(&md->lock); + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block); + up_read(&md->lock); + dm_put(md); + + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) || + put_user(r_block, &lvb->lv_block))) + r = -EFAULT; + + return r; +} + +static void free_md(struct mapped_device *md) +{ + free_dev(md->dev); + mempool_destroy(md->io_pool); + kfree(md); +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_md(kdev_t dev) +{ + int r; + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + + if (!md) { + DMWARN("unable to allocate device, out of memory."); + return NULL; + } + + memset(md, 0, sizeof(*md)); + + /* Allocate suitable device number */ + if (!dev) + r = first_free_dev(md); + else + r = specific_dev(dev, md); + + if (r) { + kfree(md); + return NULL; + } + + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _io_cache); + if (!md->io_pool) { + free_md(md); + kfree(md); + return NULL; + } + + init_rwsem(&md->lock); + atomic_set(&md->holders, 1); + atomic_set(&md->pending, 0); + init_waitqueue_head(&md->wait); + init_waitqueue_head(&md->eventq); + + return md; +} + +/* + * The hardsect size for a mapped device is the largest hardsect size + * from the devices it maps onto. + */ +static int __find_hardsect_size(struct list_head *devices) +{ + int result = 512, size; + struct list_head *tmp; + + list_for_each (tmp, devices) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + size = get_hardsect_size(dd->dev); + if (size > result) + result = size; + } + + return result; +} + +/* + * Bind a table to the device. 
+ */ +static void event_callback(void *context) +{ + struct mapped_device *md = (struct mapped_device *) context; + + down_write(&md->lock); + md->event_nr++; + wake_up_interruptible(&md->eventq); + up_write(&md->lock); +} + +static int __bind(struct mapped_device *md, struct dm_table *t) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + md->map = t; + + /* in k */ + blk_size[major][minor] = dm_table_get_size(t) >> 1; + blksize_size[major][minor] = BLOCK_SIZE; + hardsect_size[major][minor] = + __find_hardsect_size(dm_table_get_devices(t)); + register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]); + + dm_table_event_callback(md->map, event_callback, md); + dm_table_get(t); + return 0; +} + +static void __unbind(struct mapped_device *md) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + + if (md->map) { + dm_table_event_callback(md->map, NULL, NULL); + dm_table_put(md->map); + md->map = NULL; + + } + + blk_size[major][minor] = 0; + blksize_size[major][minor] = 0; + hardsect_size[major][minor] = 0; +} + +/* + * Constructor for a new device. + */ +int dm_create(kdev_t dev, struct mapped_device **result) +{ + struct mapped_device *md; + + md = alloc_md(dev); + if (!md) + return -ENXIO; + + __unbind(md); /* Ensure zero device size */ + + *result = md; + return 0; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); +} + +void dm_put(struct mapped_device *md) +{ + if (atomic_dec_and_test(&md->holders)) { + if (md->map) + dm_table_suspend_targets(md->map); + __unbind(md); + free_md(md); + } +} + +/* + * Requeue the deferred io by calling generic_make_request. + */ +static void flush_deferred_io(struct deferred_io *c) +{ + struct deferred_io *n; + + while (c) { + n = c->next; + generic_make_request(c->rw, c->bh); + free_deferred(c); + c = n; + } +} + +/* + * Swap in a new table (destroying old one). + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + int r; + + down_write(&md->lock); + + /* + * The device must be suspended, or have no table bound yet. + */ + if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EPERM; + } + + __unbind(md); + r = __bind(md, table); + if (r) + return r; + + up_write(&md->lock); + return 0; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight io and ensure that any further io gets deferred. + */ +int dm_suspend(struct mapped_device *md) +{ + int r = 0; + DECLARE_WAITQUEUE(wait, current); + + down_write(&md->lock); + + /* + * First we set the BLOCK_IO flag so no more ios will be + * mapped. + */ + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); + up_write(&md->lock); + + /* + * Then we wait for the already mapped ios to + * complete. + */ + run_task_queue(&tq_disk); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!atomic_read(&md->pending) || signal_pending(current)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + down_write(&md->lock); + remove_wait_queue(&md->wait, &wait); + + /* did we flush everything ? 
*/ + if (atomic_read(&md->pending)) { + clear_bit(DMF_BLOCK_IO, &md->flags); + r = -EINTR; + } else { + set_bit(DMF_SUSPENDED, &md->flags); + if (md->map) + dm_table_suspend_targets(md->map); + } + up_write(&md->lock); + + return r; +} + +int dm_resume(struct mapped_device *md) +{ + struct deferred_io *def; + + down_write(&md->lock); + if (!test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + if (md->map) + dm_table_resume_targets(md->map); + + clear_bit(DMF_SUSPENDED, &md->flags); + clear_bit(DMF_BLOCK_IO, &md->flags); + def = md->deferred; + md->deferred = NULL; + up_write(&md->lock); + + flush_deferred_io(def); + run_task_queue(&tq_disk); + + return 0; +} + +struct dm_table *dm_get_table(struct mapped_device *md) +{ + struct dm_table *t; + + down_read(&md->lock); + t = md->map; + if (t) + dm_table_get(t); + up_read(&md->lock); + + return t; +} + +/*----------------------------------------------------------------- + * Event notification. + *---------------------------------------------------------------*/ +uint32_t dm_get_event_nr(struct mapped_device *md) +{ + uint32_t r; + + down_read(&md->lock); + r = md->event_nr; + up_read(&md->lock); + + return r; +} + +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr) +{ + down_write(&md->lock); + if (event_nr != md->event_nr) { + up_write(&md->lock); + return 1; + } + + add_wait_queue(&md->eventq, wq); + up_write(&md->lock); + + return 0; +} + +const char *dm_kdevname(kdev_t dev) +{ + static char buffer[32]; + sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev)); + return buffer; +} + +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq) +{ + down_write(&md->lock); + remove_wait_queue(&md->eventq, wq); + up_write(&md->lock); +} + +kdev_t dm_kdev(struct mapped_device *md) +{ + kdev_t dev; + + down_read(&md->lock); + dev = md->dev; + up_read(&md->lock); + + return dev; +} + +int dm_suspended(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +struct block_device_operations dm_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .owner = THIS_MODULE +}; + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(dm_kdevname); --- diff/drivers/md/dm.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/dm.h 2003-08-26 14:21:35.000000000 +0100 @@ -0,0 +1,175 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include +#include +#include +#include + +#define DM_NAME "device-mapper" +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +/* + * FIXME: I think this should be with the definition of sector_t + * in types.h. + */ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lu" +#else +#define SECTOR_FORMAT "%lu" +#endif + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +extern struct block_device_operations dm_blk_dops; + +/* + * List of devices that a metadevice uses and should open/close. 
+ */ +struct dm_dev { + struct list_head list; + + atomic_t count; + int mode; + kdev_t dev; + struct block_device *bdev; +}; + +struct dm_table; +struct mapped_device; + +/*----------------------------------------------------------------- + * Functions for manipulating a struct mapped_device. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ +int dm_create(kdev_t dev, struct mapped_device **md); + +/* + * Reference counting for md. + */ +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md); +int dm_resume(struct mapped_device *md); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Drop a reference on the table when you've finished with the + * result. + */ +struct dm_table *dm_get_table(struct mapped_device *md); + +/* + * Event functions. + */ +uint32_t dm_get_event_nr(struct mapped_device *md); +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr); +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq); + +/* + * Info functions. + */ +kdev_t dm_kdev(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/*----------------------------------------------------------------- + * Functions for manipulating a table. Tables are also reference + * counted. + *---------------------------------------------------------------*/ +int dm_table_create(struct dm_table **result, int mode); + +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params); +int dm_table_complete(struct dm_table *t); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context); +void dm_table_event(struct dm_table *t); +sector_t dm_table_get_size(struct dm_table *t); +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +unsigned int dm_table_get_num_targets(struct dm_table *t); +struct list_head *dm_table_get_devices(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +void dm_table_suspend_targets(struct dm_table *t); +void dm_table_resume_targets(struct dm_table *t); + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *t); + + +/*----------------------------------------------------------------- + * Useful inlines. + *---------------------------------------------------------------*/ +static inline int array_too_big(unsigned long fixed, unsigned long obj, + unsigned long num) +{ + return (num > (ULONG_MAX - fixed) / obj); +} + +/* + * ceiling(n / size) * size + */ +static inline unsigned long dm_round_up(unsigned long n, unsigned long size) +{ + unsigned long r = n % size; + return n + (r ? 
(size - r) : 0); +} + +/* + * Ceiling(n / size) + */ +static inline unsigned long dm_div_up(unsigned long n, unsigned long size) +{ + return dm_round_up(n, size) / size; +} + +const char *dm_kdevname(kdev_t dev); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. + */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +int dm_snapshot_init(void); +void dm_snapshot_exit(void); + +#endif --- diff/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/kcopyd.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,650 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcopyd.h" +#include "dm-daemon.h" + +/* FIXME: this is only needed for the DMERR macros */ +#include "dm.h" + +static struct dm_daemon _kcopyd; + +/*----------------------------------------------------------------- + * Each kcopyd client has its own little pool of preallocated + * pages for kcopyd io. + *---------------------------------------------------------------*/ +struct kcopyd_client { + struct list_head list; + + spinlock_t lock; + struct list_head pages; + unsigned int nr_pages; + unsigned int nr_free_pages; +}; + +static inline void __push_page(struct kcopyd_client *kc, struct page *p) +{ + list_add(&p->list, &kc->pages); + kc->nr_free_pages++; +} + +static inline struct page *__pop_page(struct kcopyd_client *kc) +{ + struct page *p; + + p = list_entry(kc->pages.next, struct page, list); + list_del(&p->list); + kc->nr_free_pages--; + + return p; +} + +static int kcopyd_get_pages(struct kcopyd_client *kc, + unsigned int nr, struct list_head *pages) +{ + struct page *p; + INIT_LIST_HEAD(pages); + + spin_lock(&kc->lock); + if (kc->nr_free_pages < nr) { + spin_unlock(&kc->lock); + return -ENOMEM; + } + + while (nr--) { + p = __pop_page(kc); + list_add(&p->list, pages); + } + spin_unlock(&kc->lock); + + return 0; +} + +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages) +{ + struct list_head *tmp, *tmp2; + + spin_lock(&kc->lock); + list_for_each_safe (tmp, tmp2, pages) + __push_page(kc, list_entry(tmp, struct page, list)); + spin_unlock(&kc->lock); +} + +/* + * These three functions resize the page pool. 
+ */ +static void release_pages(struct list_head *pages) +{ + struct page *p; + struct list_head *tmp, *tmp2; + + list_for_each_safe (tmp, tmp2, pages) { + p = list_entry(tmp, struct page, list); + UnlockPage(p); + __free_page(p); + } +} + +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr) +{ + unsigned int i; + struct page *p; + LIST_HEAD(new); + + for (i = 0; i < nr; i++) { + p = alloc_page(GFP_KERNEL); + if (!p) { + release_pages(&new); + return -ENOMEM; + } + + LockPage(p); + list_add(&p->list, &new); + } + + kcopyd_put_pages(kc, &new); + kc->nr_pages += nr; + return 0; +} + +static void client_free_pages(struct kcopyd_client *kc) +{ + BUG_ON(kc->nr_free_pages != kc->nr_pages); + release_pages(&kc->pages); + kc->nr_free_pages = kc->nr_pages = 0; +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a deadlock). + *---------------------------------------------------------------*/ +struct kcopyd_job { + struct kcopyd_client *kc; + struct list_head list; + unsigned int flags; + + /* + * Error state of the job. + */ + int read_err; + unsigned int write_err; + + /* + * Either READ or WRITE + */ + int rw; + struct io_region source; + + /* + * The destinations for the transfer. + */ + unsigned int num_dests; + struct io_region dests[KCOPYD_MAX_REGIONS]; + + sector_t offset; + unsigned int nr_pages; + struct list_head pages; + + /* + * Set this to ensure you are notified when the job has + * completed. 'context' is for callback to use. + */ + kcopyd_notify_fn fn; + void *context; + + /* + * These fields are only used if the job has been split + * into more manageable parts. + */ + struct semaphore lock; + atomic_t sub_jobs; + sector_t progress; +}; + +/* FIXME: this should scale with the number of pages */ +#define MIN_JOBS 512 + +static kmem_cache_t *_job_cache = NULL; +static mempool_t *_job_pool = NULL; + +/* + * We maintain three lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(_complete_jobs); +static LIST_HEAD(_io_jobs); +static LIST_HEAD(_pages_jobs); + +static int jobs_init(void) +{ + INIT_LIST_HEAD(&_complete_jobs); + INIT_LIST_HEAD(&_io_jobs); + INIT_LIST_HEAD(&_pages_jobs); + + _job_cache = kmem_cache_create("kcopyd-jobs", + sizeof(struct kcopyd_job), + __alignof__(struct kcopyd_job), + 0, NULL, NULL); + if (!_job_cache) + return -ENOMEM; + + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, + mempool_free_slab, _job_cache); + if (!_job_pool) { + kmem_cache_destroy(_job_cache); + return -ENOMEM; + } + + return 0; +} + +static void jobs_exit(void) +{ + BUG_ON(!list_empty(&_complete_jobs)); + BUG_ON(!list_empty(&_io_jobs)); + BUG_ON(!list_empty(&_pages_jobs)); + + mempool_destroy(_job_pool); + kmem_cache_destroy(_job_cache); +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. 
+ */ +static inline struct kcopyd_job *pop(struct list_head *jobs) +{ + struct kcopyd_job *job = NULL; + unsigned long flags; + + spin_lock_irqsave(&_job_lock, flags); + + if (!list_empty(jobs)) { + job = list_entry(jobs->next, struct kcopyd_job, list); + list_del(&job->list); + } + spin_unlock_irqrestore(&_job_lock, flags); + + return job; +} + +static inline void push(struct list_head *jobs, struct kcopyd_job *job) +{ + unsigned long flags; + + spin_lock_irqsave(&_job_lock, flags); + list_add_tail(&job->list, jobs); + spin_unlock_irqrestore(&_job_lock, flags); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + * 0: success + * > 0: can't process yet. + */ +static int run_complete_job(struct kcopyd_job *job) +{ + void *context = job->context; + int read_err = job->read_err; + unsigned int write_err = job->write_err; + kcopyd_notify_fn fn = job->fn; + + kcopyd_put_pages(job->kc, &job->pages); + mempool_free(job, _job_pool); + fn(read_err, write_err, context); + return 0; +} + +static void complete_io(unsigned int error, void *context) +{ + struct kcopyd_job *job = (struct kcopyd_job *) context; + + if (error) { + if (job->rw == WRITE) + job->write_err &= error; + else + job->read_err = 1; + + if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { + push(&_complete_jobs, job); + dm_daemon_wake(&_kcopyd); + return; + } + } + + if (job->rw == WRITE) + push(&_complete_jobs, job); + + else { + job->rw = WRITE; + push(&_io_jobs, job); + } + + dm_daemon_wake(&_kcopyd); +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ + int r; + + if (job->rw == READ) + r = dm_io_async(1, &job->source, job->rw, + list_entry(job->pages.next, struct page, list), + job->offset, complete_io, job); + + else + r = dm_io_async(job->num_dests, job->dests, job->rw, + list_entry(job->pages.next, struct page, list), + job->offset, complete_io, job); + + return r; +} + +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE) +static int run_pages_job(struct kcopyd_job *job) +{ + int r; + + job->nr_pages = dm_div_up(job->dests[0].count + job->offset, + SECTORS_PER_PAGE); + r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages); + if (!r) { + /* this job is ready for io */ + push(&_io_jobs, job); + return 0; + } + + if (r == -ENOMEM) + /* can't complete now */ + return 1; + + return r; +} + +/* + * Run through a list for as long as possible. Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) +{ + struct kcopyd_job *job; + int r, count = 0; + + while ((job = pop(jobs))) { + + r = fn(job); + + if (r < 0) { + /* error this rogue job */ + if (job->rw == WRITE) + job->write_err = (unsigned int) -1; + else + job->read_err = 1; + push(&_complete_jobs, job); + break; + } + + if (r > 0) { + /* + * We couldn't service this job ATM, so + * push this job back onto the list. + */ + push(jobs, job); + break; + } + + count++; + } + + return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(void) +{ + /* + * The order that these are called is *very* important. + * complete jobs can free some pages for pages jobs. + * Pages jobs when successful will jump onto the io jobs + * list. io jobs call wake when they complete and it all + * starts again. 
+ */ + process_jobs(&_complete_jobs, run_complete_job); + process_jobs(&_pages_jobs, run_pages_job); + process_jobs(&_io_jobs, run_io_job); + run_task_queue(&tq_disk); +} + +/* + * If we are copying a small region we just dispatch a single job + * to do the copy, otherwise the io has to be split up into many + * jobs. + */ +static void dispatch_job(struct kcopyd_job *job) +{ + push(&_pages_jobs, job); + dm_daemon_wake(&_kcopyd); +} + +#define SUB_JOB_SIZE 128 +static void segment_complete(int read_err, + unsigned int write_err, void *context) +{ + /* FIXME: tidy this function */ + sector_t progress = 0; + sector_t count = 0; + struct kcopyd_job *job = (struct kcopyd_job *) context; + + down(&job->lock); + + /* update the error */ + if (read_err) + job->read_err = 1; + + if (write_err) + job->write_err &= write_err; + + /* + * Only dispatch more work if there hasn't been an error. + */ + if ((!job->read_err && !job->write_err) || + test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) { + /* get the next chunk of work */ + progress = job->progress; + count = job->source.count - progress; + if (count) { + if (count > SUB_JOB_SIZE) + count = SUB_JOB_SIZE; + + job->progress += count; + } + } + up(&job->lock); + + if (count) { + int i; + struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO); + + memcpy(sub_job, job, sizeof(*job)); + sub_job->source.sector += progress; + sub_job->source.count = count; + + for (i = 0; i < job->num_dests; i++) { + sub_job->dests[i].sector += progress; + sub_job->dests[i].count = count; + } + + sub_job->fn = segment_complete; + sub_job->context = job; + dispatch_job(sub_job); + + } else if (atomic_dec_and_test(&job->sub_jobs)) { + + /* + * To avoid a race we must keep the job around + * until after the notify function has completed. + * Otherwise the client may try and stop the job + * after we've completed. + */ + job->fn(read_err, write_err, job->context); + mempool_free(job, _job_pool); + } +} + +/* + * Create some little jobs that will do the move between + * them. + */ +#define SPLIT_COUNT 8 +static void split_job(struct kcopyd_job *job) +{ + int i; + + atomic_set(&job->sub_jobs, SPLIT_COUNT); + for (i = 0; i < SPLIT_COUNT; i++) + segment_complete(0, 0u, job); +} + +#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE) +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context) +{ + struct kcopyd_job *job; + + /* + * Allocate a new job. + */ + job = mempool_alloc(_job_pool, GFP_NOIO); + + /* + * set up for the read. + */ + job->kc = kc; + job->flags = flags; + job->read_err = 0; + job->write_err = 0; + job->rw = READ; + + memcpy(&job->source, from, sizeof(*from)); + + job->num_dests = num_dests; + memcpy(&job->dests, dests, sizeof(*dests) * num_dests); + + job->offset = 0; + job->nr_pages = 0; + INIT_LIST_HEAD(&job->pages); + + job->fn = fn; + job->context = context; + + if (job->source.count < SUB_JOB_THRESHOLD) + dispatch_job(job); + + else { + init_MUTEX(&job->lock); + job->progress = 0; + split_job(job); + } + + return 0; +} + +/* + * Cancels a kcopyd job, eg. someone might be deactivating a + * mirror. 
+ */ +int kcopyd_cancel(struct kcopyd_job *job, int block) +{ + /* FIXME: finish */ + return -1; +} + +/*----------------------------------------------------------------- + * Unit setup + *---------------------------------------------------------------*/ +static DECLARE_MUTEX(_client_lock); +static LIST_HEAD(_clients); + +static int client_add(struct kcopyd_client *kc) +{ + down(&_client_lock); + list_add(&kc->list, &_clients); + up(&_client_lock); + return 0; +} + +static void client_del(struct kcopyd_client *kc) +{ + down(&_client_lock); + list_del(&kc->list); + up(&_client_lock); +} + +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result) +{ + int r = 0; + struct kcopyd_client *kc; + + kc = kmalloc(sizeof(*kc), GFP_KERNEL); + if (!kc) + return -ENOMEM; + + kc->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&kc->pages); + kc->nr_pages = kc->nr_free_pages = 0; + r = client_alloc_pages(kc, nr_pages); + if (r) { + kfree(kc); + return r; + } + + r = dm_io_get(nr_pages); + if (r) { + client_free_pages(kc); + kfree(kc); + return r; + } + + r = client_add(kc); + if (r) { + dm_io_put(nr_pages); + client_free_pages(kc); + kfree(kc); + return r; + } + + *result = kc; + return 0; +} + +void kcopyd_client_destroy(struct kcopyd_client *kc) +{ + dm_io_put(kc->nr_pages); + client_free_pages(kc); + client_del(kc); + kfree(kc); +} + + +int __init kcopyd_init(void) +{ + int r; + + r = jobs_init(); + if (r) + return r; + + r = dm_daemon_start(&_kcopyd, "kcopyd", do_work); + if (r) + jobs_exit(); + + return r; +} + +void kcopyd_exit(void) +{ + jobs_exit(); + dm_daemon_stop(&_kcopyd); +} + +EXPORT_SYMBOL(kcopyd_client_create); +EXPORT_SYMBOL(kcopyd_client_destroy); +EXPORT_SYMBOL(kcopyd_copy); +EXPORT_SYMBOL(kcopyd_cancel); --- diff/drivers/md/kcopyd.h 1970-01-01 01:00:00.000000000 +0100 +++ source/drivers/md/kcopyd.h 2003-08-26 14:21:35.000000000 +0100 @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2001 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef DM_KCOPYD_H +#define DM_KCOPYD_H + +/* + * Needed for the definition of offset_t. + */ +#include +#include + +#include "dm-io.h" + +int kcopyd_init(void); +void kcopyd_exit(void); + +/* FIXME: make this configurable */ +#define KCOPYD_MAX_REGIONS 8 + +#define KCOPYD_IGNORE_ERROR 1 + +/* + * To use kcopyd you must first create a kcopyd client object. + */ +struct kcopyd_client; +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result); +void kcopyd_client_destroy(struct kcopyd_client *kc); + +/* + * Submit a copy job to kcopyd. This is built on top of the + * previous three fns. + * + * read_err is a boolean, + * write_err is a bitset, with 1 bit for each destination region + */ +typedef void (*kcopyd_notify_fn)(int read_err, + unsigned int write_err, void *context); + +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from, + unsigned int num_dests, struct io_region *dests, + unsigned int flags, kcopyd_notify_fn fn, void *context); + +#endif --- diff/include/linux/device-mapper.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/device-mapper.h 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. 
+ */ + +#ifndef _LINUX_DEVICE_MAPPER_H +#define _LINUX_DEVICE_MAPPER_H + +typedef unsigned long sector_t; + +struct dm_target; +struct dm_table; +struct dm_dev; + +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; + +union map_info { + void *ptr; + unsigned long long ll; +}; + +/* + * In the constructor the target parameter will already have the + * table, type, begin and len fields filled in. + */ +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc, + char **argv); + +/* + * The destructor doesn't need to free the dm_target, just + * anything hidden ti->private. + */ +typedef void (*dm_dtr_fn) (struct dm_target * ti); + +/* + * The map function must return: + * < 0: error + * = 0: The target will handle the io by resubmitting it later + * > 0: simple remap complete + */ +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh, + int rw, union map_info *map_context); + +/* + * Returns: + * < 0 : error (currently ignored) + * 0 : ended successfully + * 1 : for some reason the io has still not completed (eg, + * multipath target might want to requeue a failed io). + */ +typedef int (*dm_endio_fn) (struct dm_target * ti, + struct buffer_head * bh, int rw, int error, + union map_info *map_context); +typedef void (*dm_suspend_fn) (struct dm_target *ti); +typedef void (*dm_resume_fn) (struct dm_target *ti); +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type, + char *result, unsigned int maxlen); + +void dm_error(const char *message); + +/* + * Constructors should call these functions to ensure destination devices + * are opened/closed correctly. + * FIXME: too many arguments. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result); +void dm_put_device(struct dm_target *ti, struct dm_dev *d); + +/* + * Information about a target type + */ +struct target_type { + const char *name; + struct module *module; + dm_ctr_fn ctr; + dm_dtr_fn dtr; + dm_map_fn map; + dm_endio_fn end_io; + dm_suspend_fn suspend; + dm_resume_fn resume; + dm_status_fn status; +}; + +struct dm_target { + struct dm_table *table; + struct target_type *type; + + /* target limits */ + sector_t begin; + sector_t len; + + /* target specific data */ + void *private; + + /* Used to provide an error string from the ctr */ + char *error; +}; + +int dm_register_target(struct target_type *t); +int dm_unregister_target(struct target_type *t); + +#endif /* _LINUX_DEVICE_MAPPER_H */ --- diff/include/linux/dm-ioctl.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/dm-ioctl.h 2003-08-26 14:21:37.000000000 +0100 @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DM_IOCTL_H +#define _LINUX_DM_IOCTL_H + +#include + +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + +/* + * A traditional ioctl interface for the device mapper. + * + * Each device can have two tables associated with it, an + * 'active' table which is the one currently used by io passing + * through the device, and an 'inactive' one which is a table + * that is being prepared as a replacement for the 'active' one. + * + * DM_VERSION: + * Just get the version information for the ioctl interface. + * + * DM_REMOVE_ALL: + * Remove all dm devices, destroy all tables. Only really used + * for debug. 
+ * + * DM_LIST_DEVICES: + * Get a list of all the dm device names. + * + * DM_DEV_CREATE: + * Create a new device, neither the 'active' or 'inactive' table + * slots will be filled. The device will be in suspended state + * after creation, however any io to the device will get errored + * since it will be out-of-bounds. + * + * DM_DEV_REMOVE: + * Remove a device, destroy any tables. + * + * DM_DEV_RENAME: + * Rename a device. + * + * DM_SUSPEND: + * This performs both suspend and resume, depending which flag is + * passed in. + * Suspend: This command will not return until all pending io to + * the device has completed. Further io will be deferred until + * the device is resumed. + * Resume: It is no longer an error to issue this command on an + * unsuspended device. If a table is present in the 'inactive' + * slot, it will be moved to the active slot, then the old table + * from the active slot will be _destroyed_. Finally the device + * is resumed. + * + * DM_DEV_STATUS: + * Retrieves the status for the table in the 'active' slot. + * + * DM_DEV_WAIT: + * Wait for a significant event to occur to the device. This + * could either be caused by an event triggered by one of the + * targets of the table in the 'active' slot, or a table change. + * + * DM_TABLE_LOAD: + * Load a table into the 'inactive' slot for the device. The + * device does _not_ need to be suspended prior to this command. + * + * DM_TABLE_CLEAR: + * Destroy any table in the 'inactive' slot (ie. abort). + * + * DM_TABLE_DEPS: + * Return a set of device dependencies for the 'active' table. + * + * DM_TABLE_STATUS: + * Return the targets status for the 'active' table. + */ + +/* + * All ioctl arguments consist of a single chunk of memory, with + * this structure at the start. If a uuid is specified any + * lookup (eg. for a DM_INFO) will be done on that, *not* the + * name. + */ +struct dm_ioctl { + /* + * The version number is made up of three parts: + * major - no backward or forward compatibility, + * minor - only backwards compatible, + * patch - both backwards and forwards compatible. + * + * All clients of the ioctl interface should fill in the + * version number of the interface that they were + * compiled with. + * + * All recognised ioctl commands (ie. those that don't + * return -ENOTTY) fill out this field, even if the + * command failed. + */ + uint32_t version[3]; /* in/out */ + uint32_t data_size; /* total size of data passed in + * including this struct */ + + uint32_t data_start; /* offset to start of data + * relative to start of this struct */ + + uint32_t target_count; /* in/out */ + int32_t open_count; /* out */ + uint32_t flags; /* in/out */ + uint32_t event_nr; /* in/out */ + uint32_t padding; + + uint64_t dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for + * the block device */ +}; + +/* + * Used to specify tables. These structures appear after the + * dm_ioctl. + */ +struct dm_target_spec { + uint64_t sector_start; + uint64_t length; + int32_t status; /* used when reading from kernel only */ + + /* + * Offset in bytes (from the start of this struct) to + * next target_spec. + */ + uint32_t next; + + char target_type[DM_MAX_TYPE_NAME]; + + /* + * Parameter string starts immediately after this object. + * Be careful to add padding after string to ensure correct + * alignment of subsequent dm_target_spec. + */ +}; + +/* + * Used to retrieve the target dependencies. 
+ */ +struct dm_target_deps { + uint32_t count; /* Array size */ + uint32_t padding; /* unused */ + uint64_t dev[0]; /* out */ +}; + +/* + * Used to get a list of all dm devices. + */ +struct dm_name_list { + uint64_t dev; + uint32_t next; /* offset to the next record from + the _start_ of this */ + char name[0]; +}; + +/* + * If you change this make sure you make the corresponding change + * to dm-ioctl.c:lookup_ioctl() + */ +enum { + /* Top level cmds */ + DM_VERSION_CMD = 0, + DM_REMOVE_ALL_CMD, + DM_LIST_DEVICES_CMD, + + /* device level cmds */ + DM_DEV_CREATE_CMD, + DM_DEV_REMOVE_CMD, + DM_DEV_RENAME_CMD, + DM_DEV_SUSPEND_CMD, + DM_DEV_STATUS_CMD, + DM_DEV_WAIT_CMD, + + /* Table level cmds */ + DM_TABLE_LOAD_CMD, + DM_TABLE_CLEAR_CMD, + DM_TABLE_DEPS_CMD, + DM_TABLE_STATUS_CMD, +}; + +#define DM_IOCTL 0xfd + +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) + +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) + +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) +#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl) +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) + +#define DM_VERSION_MAJOR 4 +#define DM_VERSION_MINOR 0 +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2003-07-12)" + +/* Status bits */ +#define DM_READONLY_FLAG (1 << 0) /* In/Out */ +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */ +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */ + +/* + * Flag passed into ioctl STATUS command to get table information + * rather than current status. + */ +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */ + +/* + * Flags that indicate whether a table is present in either of + * the two table slots that a device has. + */ +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */ +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */ + +/* + * Indicates that the buffer passed in wasn't big enough for the + * results. 
+ */ +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ + +#endif /* _LINUX_DM_IOCTL_H */ --- diff/include/linux/mempool.h 1970-01-01 01:00:00.000000000 +0100 +++ source/include/linux/mempool.h 2003-08-26 14:21:35.000000000 +0100 @@ -0,0 +1,31 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +struct mempool_s; +typedef struct mempool_s mempool_t; + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); + +/* + * A mempool_alloc_t and mempool_free_t that get the memory from + * a slab that is passed in through pool_data. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data); +void mempool_free_slab(void *element, void *pool_data); + + +#endif /* _LINUX_MEMPOOL_H */ --- diff/mm/filemap.c.rej 1970-01-01 01:00:00.000000000 +0100 +++ source/mm/filemap.c.rej 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,21 @@ +*************** +*** 1704,1711 **** + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; + } +- UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } + } +--- 1704,1713 ---- + retval = generic_file_direct_IO(READ, filp, buf, count, pos); + if (retval > 0) + *ppos = pos + retval; ++ + } ++ if (!S_ISBLK(inode->i_mode)) ++ UPDATE_ATIME(filp->f_dentry->d_inode); + goto out; + } + } --- diff/mm/mempool.c 1970-01-01 01:00:00.000000000 +0100 +++ source/mm/mempool.c 2003-08-26 13:59:04.000000000 +0100 @@ -0,0 +1,299 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include +#include +#include +#include + +struct mempool_s { + spinlock_t lock; + int min_nr; /* nr of elements at *elements */ + int curr_nr; /* Current nr of elements at *elements */ + void **elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +}; + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. 
Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr < pool->min_nr) { + while (pool->curr_nr > new_min_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) + add_element(pool, element); + else + kfree(element); /* Raced */ + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). 
+ */ +void mempool_destroy(mempool_t *pool) +{ + if (pool->curr_nr != pool->min_nr) + BUG(); /* There were outstanding elements */ + free_pool(pool); +} + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, int gfp_mask) +{ + void *element; + unsigned long flags; + int curr_nr; + DECLARE_WAITQUEUE(wait, current); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +repeat_alloc: + element = pool->alloc(gfp_nowait, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full then try harder + * to allocate an element: + */ + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. + */ + wakeup_bdflush(); + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (gfp_mask == gfp_nowait) + return NULL; + + run_task_queue(&tq_disk); + + add_wait_queue_exclusive(&pool->wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&pool->lock, flags); + curr_nr = pool->curr_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + if (!curr_nr) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(&pool->wait, &wait); + + goto repeat_alloc; +} + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} + +void mempool_free_slab(void *element, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + kmem_cache_free(mem, element); +} + + +EXPORT_SYMBOL(mempool_create); +EXPORT_SYMBOL(mempool_resize); +EXPORT_SYMBOL(mempool_destroy); +EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_free); +EXPORT_SYMBOL(mempool_alloc_slab); +EXPORT_SYMBOL(mempool_free_slab); Only every other metadata area was being read when loading a snapshot! 
[Kevin Corry] --- diff/drivers/md/dm-exception-store.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-exception-store.c 2003-08-26 16:27:05.000000000 +0100 @@ -369,8 +369,6 @@ r = insert_exceptions(ps, &full); if (r) return r; - - area++; } return 0; --- diff/arch/mips64/kernel/ioctl32.c.rej 2003-08-26 13:59:04.000000000 +0100 +++ source/arch/mips64/kernel/ioctl32.c.rej 1970-01-01 01:00:00.000000000 +0100 @@ -1,16 +0,0 @@ -*************** -*** 33,38 **** - #include - #include - #include - - #include - #undef __KERNEL__ /* This file was born to be ugly ... */ ---- 33,39 ---- - #include - #include - #include -+ #include - - #include - #undef __KERNEL__ /* This file was born to be ugly ... */ --- diff/arch/s390x/kernel/ioctl32.c.rej 2003-08-26 13:59:04.000000000 +0100 +++ source/arch/s390x/kernel/ioctl32.c.rej 1970-01-01 01:00:00.000000000 +0100 @@ -1,45 +0,0 @@ -*************** -*** 25,30 **** - #include - #include - #include - #include - #include - #include ---- 25,31 ---- - #include - #include - #include -+ #include - #include - #include - #include -*************** -*** 508,513 **** - - IOCTL32_DEFAULT(SIOCGSTAMP), - - IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32), - IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf), - IOCTL32_HANDLER(SIOCGIFFLAGS, dev_ifsioc), ---- 509,528 ---- - - IOCTL32_DEFAULT(SIOCGSTAMP), - -+ IOCTL32_DEFAULT(DM_VERSION), -+ IOCTL32_DEFAULT(DM_REMOVE_ALL), -+ IOCTL32_DEFAULT(DM_DEV_CREATE), -+ IOCTL32_DEFAULT(DM_DEV_REMOVE), -+ IOCTL32_DEFAULT(DM_TABLE_LOAD), -+ IOCTL32_DEFAULT(DM_DEV_SUSPEND), -+ IOCTL32_DEFAULT(DM_DEV_RENAME), -+ IOCTL32_DEFAULT(DM_TABLE_DEPS), -+ IOCTL32_DEFAULT(DM_DEV_STATUS), -+ IOCTL32_DEFAULT(DM_TABLE_STATUS), -+ IOCTL32_DEFAULT(DM_DEV_WAIT), -+ IOCTL32_DEFAULT(DM_LIST_DEVICES), -+ IOCTL32_DEFAULT(DM_TABLE_CLEAR), -+ - IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32), - IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf), - IOCTL32_HANDLER(SIOCGIFFLAGS, dev_ifsioc), --- diff/mm/filemap.c.rej 2003-08-26 13:59:04.000000000 +0100 +++ source/mm/filemap.c.rej 1970-01-01 01:00:00.000000000 +0100 @@ -1,21 +0,0 @@ -*************** -*** 1704,1711 **** - retval = generic_file_direct_IO(READ, filp, buf, count, pos); - if (retval > 0) - *ppos = pos + retval; - } -- UPDATE_ATIME(filp->f_dentry->d_inode); - goto out; - } - } ---- 1704,1713 ---- - retval = generic_file_direct_IO(READ, filp, buf, count, pos); - if (retval > 0) - *ppos = pos + retval; -+ - } -+ if (!S_ISBLK(inode->i_mode)) -+ UPDATE_ATIME(filp->f_dentry->d_inode); - goto out; - } - } Don't initialise static variables to zero/NULL. --- diff/drivers/md/kcopyd.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/kcopyd.c 2003-08-26 16:27:10.000000000 +0100 @@ -183,8 +183,8 @@ /* FIXME: this should scale with the number of pages */ #define MIN_JOBS 512 -static kmem_cache_t *_job_cache = NULL; -static mempool_t *_job_pool = NULL; +static kmem_cache_t *_job_cache; +static mempool_t *_job_pool; /* * We maintain three lists of jobs: Change resume/suspend to do_resume/do_suspend to avoid name clash. 
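For reference (editorial note, not part of the patch): the rename below only touches the kernel-side helpers. User space still drives both operations through the single DM_DEV_SUSPEND ioctl documented in dm-ioctl.h above, selecting suspend or resume with DM_SUSPEND_FLAG. A minimal, hypothetical user-space sketch — assuming the control node lives at /dev/mapper/control, that the installed headers provide <linux/dm-ioctl.h>, and ignoring error reporting — might look like this; libdevmapper/dmsetup is the normal way to issue these calls.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

/* Suspend (suspend != 0) or resume (suspend == 0) the mapped device 'name'. */
static int dm_suspend_resume(const char *name, int suspend)
{
	struct dm_ioctl dmi;
	int fd, r;

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* interface version this code */
	dmi.version[1] = DM_VERSION_MINOR;	/* was compiled against        */
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);		/* no payload after the header */
	strncpy(dmi.name, name, DM_NAME_LEN - 1);
	if (suspend)
		dmi.flags |= DM_SUSPEND_FLAG;	/* flag clear means resume */

	fd = open("/dev/mapper/control", O_RDWR);	/* assumed control node */
	if (fd < 0)
		return fd;

	r = ioctl(fd, DM_DEV_SUSPEND, &dmi);
	close(fd);
	return r;
}

Because the flag alone selects the operation, dev_suspend() in the patch below is just a dispatcher; renaming its helpers to do_suspend()/do_resume() avoids the name clash without changing this user-visible interface.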
--- diff/drivers/md/dm-ioctl.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 16:27:15.000000000 +0100 @@ -593,7 +593,7 @@ return dm_hash_rename(param->name, new_name); } -static int suspend(struct dm_ioctl *param) +static int do_suspend(struct dm_ioctl *param) { int r = 0; struct mapped_device *md; @@ -612,7 +612,7 @@ return r; } -static int resume(struct dm_ioctl *param) +static int do_resume(struct dm_ioctl *param) { int r = 0; struct hash_cell *hc; @@ -675,9 +675,9 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size) { if (param->flags & DM_SUSPEND_FLAG) - return suspend(param); + return do_suspend(param); - return resume(param); + return do_resume(param); } /* Hello all, The current version of the VFS locking patch adds a new semaphore to fs/super.c. This is used to make sure a filesystem does not get mounted on a logical volume while a snapshot is being taken. It also results in all mounts on the system being serialized, and isn't in line with the VFS locking scheme in general. I've been meaning to fix it forever, here's an updated version that adds a super with s->s_dev set to the source volume if nothing is currently mounted on the source volume. This allows me to use the s_umount semaphore in the super block to keep things safe, which is cleaner overall. The other benefit over the existing patch is this one has zero footprint outside the lockfs calls. You're only running new code if you take a snapshot. I've done some testing here, but wanted to let LVM people review it before going further. Patch is below against 2.4.21-rc6. This provides zero new functionality over the existing VFS locking patch, and is experimental. Do not apply this on production servers, and do not apply unless you want to help test. 
-chris ===== drivers/md/lvm.c 1.19 vs edited ===== --- diff/drivers/md/dm-snapshot.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-snapshot.c 2003-08-26 16:27:21.000000000 +0100 @@ -525,7 +525,7 @@ } /* Flush IO to the origin device */ - fsync_dev(s->origin->dev); + fsync_dev_lockfs(s->origin->dev); /* Add snapshot to the list of snapshots for this origin */ if (register_snapshot(s)) { @@ -539,6 +539,7 @@ bad6: kcopyd_client_destroy(s->kcopyd_client); + unlockfs(s->origin->dev); bad5: s->store.destroy(&s->store); --- diff/drivers/md/lvm.c 2003-06-16 09:56:10.000000000 +0100 +++ source/drivers/md/lvm.c 2003-08-26 16:27:21.000000000 +0100 @@ -229,9 +229,6 @@ #define DEVICE_OFF(device) #define LOCAL_END_REQUEST -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */ -/* #define LVM_VFS_ENHANCEMENT */ - #include #include #include @@ -2171,12 +2168,8 @@ if (lv_ptr->lv_access & LV_SNAPSHOT) { lv_t *org = lv_ptr->lv_snapshot_org, *last; - /* sync the original logical volume */ - fsync_dev(org->lv_dev); -#ifdef LVM_VFS_ENHANCEMENT /* VFS function call to sync and lock the filesystem */ fsync_dev_lockfs(org->lv_dev); -#endif down_write(&org->lv_lock); org->lv_access |= LV_SNAPSHOT_ORG; @@ -2201,11 +2194,9 @@ else set_device_ro(lv_ptr->lv_dev, 1); -#ifdef LVM_VFS_ENHANCEMENT /* VFS function call to unlock the filesystem */ if (lv_ptr->lv_access & LV_SNAPSHOT) unlockfs(lv_ptr->lv_snapshot_org->lv_dev); -#endif lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de = lvm_fs_create_lv(vg_ptr, lv_ptr); --- diff/fs/buffer.c 2003-08-26 13:59:04.000000000 +0100 +++ source/fs/buffer.c 2003-08-26 16:27:21.000000000 +0100 @@ -376,6 +376,34 @@ fsync_dev(dev); } +int fsync_dev_lockfs(kdev_t dev) +{ + /* you are not allowed to try locking all the filesystems + ** on the system, your chances of getting through without + ** total deadlock are slim to none. + */ + if (!dev) + return fsync_dev(dev) ; + + sync_buffers(dev, 0); + + lock_kernel(); + /* note, the FS might need to start transactions to + ** sync the inodes, or the quota, no locking until + ** after these are done + */ + sync_inodes(dev); + DQUOT_SYNC(dev); + /* if inodes or quotas could be dirtied during the + ** sync_supers_lockfs call, the FS is responsible for getting + ** them on disk, without deadlocking against the lock + */ + sync_supers_lockfs(dev) ; + unlock_kernel(); + + return sync_buffers(dev, 1) ; +} + asmlinkage long sys_sync(void) { fsync_dev(0); --- diff/fs/reiserfs/super.c 2003-08-26 13:50:12.000000000 +0100 +++ source/fs/reiserfs/super.c 2003-08-26 16:27:21.000000000 +0100 @@ -73,7 +73,7 @@ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); reiserfs_block_writes(&th) ; - journal_end(&th, s, 1) ; + journal_end_sync(&th, s, 1) ; } s->s_dirt = dirty; unlock_kernel() ; --- diff/fs/super.c 2003-08-26 13:50:12.000000000 +0100 +++ source/fs/super.c 2003-08-26 16:27:21.000000000 +0100 @@ -39,6 +39,12 @@ spinlock_t sb_lock = SPIN_LOCK_UNLOCKED; /* + * stub of a filesystem used to make sure an FS isn't mounted + * in the middle of a lockfs call + */ +static DECLARE_FSTYPE_DEV(lockfs_fs_type, "lockfs", NULL); + +/* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. 
@@ -436,6 +442,25 @@ put_super(sb); } +static void write_super_lockfs(struct super_block *sb) +{ + lock_super(sb); + if (sb->s_root && sb->s_op) { + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + if (sb->s_op->write_super_lockfs) + sb->s_op->write_super_lockfs(sb); + } + unlock_super(sb); + + /* + * if no lockfs call is provided, use the sync_fs call instead. + * this must be done without the super lock held + */ + if (!sb->s_op->write_super_lockfs && sb->s_op->sync_fs) + sb->s_op->sync_fs(sb); +} + static inline void write_super(struct super_block *sb) { lock_super(sb); @@ -483,6 +508,119 @@ spin_unlock(&sb_lock); } +static struct super_block *find_super_for_lockfs(kdev_t dev) +{ + struct super_block *lockfs_sb = alloc_super(); + struct super_block * s; + + if (!dev) + return NULL; +restart: + spin_lock(&sb_lock); + s = find_super(dev); + if (s) { + spin_unlock(&sb_lock); + down_read(&s->s_umount); + if (s->s_root) { + destroy_super(lockfs_sb); + return s; + } + drop_super(s); + goto restart; + } + /* if (s) we either return or goto, so we know s == NULL here. + * At this point, there are no mounted filesystems on this device, + * so we pretend to mount one. + */ + if (!lockfs_sb) { + spin_unlock(&sb_lock); + return NULL; + } + s = lockfs_sb; + s->s_dev = dev; + if (lockfs_fs_type.fs_supers.prev == NULL) + INIT_LIST_HEAD(&lockfs_fs_type.fs_supers); + insert_super(s, &lockfs_fs_type); + s->s_root = (struct dentry *)1; + /* alloc_super gives us a write lock on s_umount, this + * way we know there are no concurrent lockfs holders for this dev. + * It allows us to remove the temp super from the list of supers + * immediately when unlockfs is called + */ + return s; +} +/* + * Note: don't check the dirty flag before waiting, we want the lock + * to happen every time this is called. dev must be non-zero + */ +void sync_supers_lockfs(kdev_t dev) +{ + struct super_block *sb; + sb = find_super_for_lockfs(dev); + if (sb) { + write_super_lockfs(sb); + /* the drop_super is done by unlockfs */ + } +} + +static void drop_super_lockfs(struct super_block *s) +{ + if (s->s_type == &lockfs_fs_type) { + struct file_system_type *fs = s->s_type; + + /* + * nobody else is allowed to grab_super() on our temp + */ + if (!deactivate_super(s)) + BUG(); + + spin_lock(&sb_lock); + s->s_root = NULL; + list_del(&s->s_list); + list_del(&s->s_instances); + spin_unlock(&sb_lock); + + up_write(&s->s_umount); + put_super(s); + put_filesystem(fs); + } else + drop_super(s); +} + +void unlockfs(kdev_t dev) +{ + struct super_block *s; + if (!dev) + return; + + spin_lock(&sb_lock); + s = find_super(dev); + if (s) { + /* + * find_super and the original lockfs call both incremented + * the reference count. 
drop one of them + */ + s->s_count--; + spin_unlock(&sb_lock); + if (s->s_root) { + if (s->s_op->unlockfs) + s->s_op->unlockfs(s); + drop_super_lockfs(s); + goto out; + } else { + printk("unlockfs: no s_root, dev %s\n", kdevname(dev)); + BUG(); + } + } else { + printk("unlockfs: no super found, dev %s\n", kdevname(dev)); + BUG(); + } + + spin_unlock(&sb_lock); +out: + return; +} + /** * get_super - get the superblock of a device * @dev: device to get the superblock for --- diff/include/linux/fs.h 2003-08-26 14:20:27.000000000 +0100 +++ source/include/linux/fs.h 2003-08-26 16:27:21.000000000 +0100 @@ -1272,6 +1272,7 @@ extern int sync_buffers(kdev_t, int); extern void sync_dev(kdev_t); extern int fsync_dev(kdev_t); +extern int fsync_dev_lockfs(kdev_t); extern int fsync_super(struct super_block *); extern int fsync_no_super(kdev_t); extern void sync_inodes_sb(struct super_block *); @@ -1289,6 +1290,8 @@ extern int filemap_fdatasync(struct address_space *); extern int filemap_fdatawait(struct address_space *); extern void sync_supers(kdev_t dev, int wait); +extern void sync_supers_lockfs(kdev_t); +extern void unlockfs(kdev_t); extern int bmap(struct inode *, int); extern int notify_change(struct dentry *, struct iattr *); extern int permission(struct inode *, int); --- diff/kernel/ksyms.c 2003-08-26 13:59:04.000000000 +0100 +++ source/kernel/ksyms.c 2003-08-26 16:27:21.000000000 +0100 @@ -189,6 +189,8 @@ EXPORT_SYMBOL(invalidate_inode_pages); EXPORT_SYMBOL(truncate_inode_pages); EXPORT_SYMBOL(fsync_dev); +EXPORT_SYMBOL(fsync_dev_lockfs); +EXPORT_SYMBOL(unlockfs); EXPORT_SYMBOL(fsync_no_super); EXPORT_SYMBOL(permission); EXPORT_SYMBOL(vfs_permission); missing parts of the previous vfs patch (merge). --- diff/drivers/md/dm-snapshot.c 2003-08-26 16:27:21.000000000 +0100 +++ source/drivers/md/dm-snapshot.c 2003-08-26 16:27:27.000000000 +0100 @@ -533,13 +533,14 @@ ti->error = "Cannot register snapshot origin"; goto bad6; } + unlockfs(s->origin->dev); ti->private = s; return 0; bad6: - kcopyd_client_destroy(s->kcopyd_client); unlockfs(s->origin->dev); + kcopyd_client_destroy(s->kcopyd_client); bad5: s->store.destroy(&s->store); Lift vfs locking to dm_suspend/resume. --- diff/drivers/md/dm-snapshot.c 2003-08-26 16:27:27.000000000 +0100 +++ source/drivers/md/dm-snapshot.c 2003-08-26 16:27:34.000000000 +0100 @@ -524,22 +524,17 @@ goto bad5; } - /* Flush IO to the origin device */ - fsync_dev_lockfs(s->origin->dev); - /* Add snapshot to the list of snapshots for this origin */ if (register_snapshot(s)) { r = -EINVAL; ti->error = "Cannot register snapshot origin"; goto bad6; } - unlockfs(s->origin->dev); ti->private = s; return 0; bad6: - unlockfs(s->origin->dev); kcopyd_client_destroy(s->kcopyd_client); bad5: --- diff/drivers/md/dm.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm.c 2003-08-26 16:27:34.000000000 +0100 @@ -951,13 +951,23 @@ int r = 0; DECLARE_WAITQUEUE(wait, current); - down_write(&md->lock); + /* Flush IO to the origin device */ + down_read(&md->lock); + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + return -EINVAL; + } + + fsync_dev_lockfs(md->dev); + up_read(&md->lock); + /* - * First we set the BLOCK_IO flag so no more ios will be - * mapped. + * Set the BLOCK_IO flag so no more ios will be mapped. */ + down_write(&md->lock); if (test_bit(DMF_BLOCK_IO, &md->flags)) { + unlockfs(md->dev); up_write(&md->lock); return -EINVAL; } @@ -986,6 +996,7 @@ /* did we flush everything ? 
*/ if (atomic_read(&md->pending)) { + unlockfs(md->dev); clear_bit(DMF_BLOCK_IO, &md->flags); r = -EINTR; } else { @@ -1017,6 +1028,7 @@ md->deferred = NULL; up_write(&md->lock); + unlockfs(md->dev); flush_deferred_io(def); run_task_queue(&tq_disk); Correct the error message printed when a dm-daemon thread fails to start. --- diff/drivers/md/dm-daemon.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-daemon.c 2003-08-26 16:28:05.000000000 +0100 @@ -81,7 +81,7 @@ down(&dd->start_lock); pid = kernel_thread(daemon, dd, 0); if (pid <= 0) { - DMERR("Failed to start kcopyd thread"); + DMERR("Failed to start %s thread", name); return -EAGAIN; } When multiple load ioctls are issued, the reference count on older 'new_tables' wasn't being dropped. [Christophe Saout] --- diff/drivers/md/dm-ioctl.c 2003-08-26 16:27:15.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 16:28:11.000000000 +0100 @@ -816,6 +816,8 @@ return -ENXIO; } + if (hc->new_map) + dm_table_put(hc->new_map); hc->new_map = t; param->flags |= DM_INACTIVE_PRESENT_FLAG; Stop labelling dm as 'experimental'. --- diff/drivers/md/Config.in 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/Config.in 2003-08-26 16:28:16.000000000 +0100 @@ -14,9 +14,7 @@ dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD -if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD - dep_tristate ' Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM -fi +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD +dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM endmenu Move retrieve_status up so dev_wait() can use it.
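The move is purely structural: C needs retrieve_status() to be declared before dev_wait() can reference it. The alternative, shown below only for comparison and not part of the patch, would have been a forward declaration; relocating the definition avoids carrying the extra prototype.

/* hypothetical alternative: declare retrieve_status() ahead of dev_wait()
 * instead of moving its definition (not what the patch does) */
static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
			    size_t param_size);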
--- diff/drivers/md/dm-ioctl.c 2003-08-26 16:28:11.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 16:28:21.000000000 +0100 @@ -699,6 +699,69 @@ } /* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + remaining = len - (outptr - outbuf); + if (remaining < sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (ti->type->status(ti, type, outptr, remaining)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + } else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + used = param->data_start + (outptr - outbuf); + + align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* * Wait for a device to report an event */ static int dev_wait(struct dm_ioctl *param, size_t param_size) @@ -919,69 +982,6 @@ } /* - * Build up the status struct for each target - */ -static void retrieve_status(struct dm_table *table, struct dm_ioctl *param, - size_t param_size) -{ - unsigned int i, num_targets; - struct dm_target_spec *spec; - char *outbuf, *outptr; - status_type_t type; - size_t remaining, len, used = 0; - - outptr = outbuf = get_result_buffer(param, param_size, &len); - - if (param->flags & DM_STATUS_TABLE_FLAG) - type = STATUSTYPE_TABLE; - else - type = STATUSTYPE_INFO; - - /* Get all the target info */ - num_targets = dm_table_get_num_targets(table); - for (i = 0; i < num_targets; i++) { - struct dm_target *ti = dm_table_get_target(table, i); - - remaining = len - (outptr - outbuf); - if (remaining < sizeof(struct dm_target_spec)) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } - - spec = (struct dm_target_spec *) outptr; - - spec->status = 0; - spec->sector_start = ti->begin; - spec->length = ti->len; - strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type)); - - outptr += sizeof(struct dm_target_spec); - remaining = len - (outptr - outbuf); - - /* Get the status/table string from the target driver */ - if (ti->type->status) { - if (ti->type->status(ti, type, outptr, remaining)) { - param->flags |= DM_BUFFER_FULL_FLAG; - break; - } - } else - outptr[0] = '\0'; - - outptr += strlen(outptr) + 1; - used = param->data_start + (outptr - outbuf); - - align_ptr(outptr); - spec->next = outptr - outbuf; - } - - if (used) - param->data_size = used; - - param->target_count = num_targets; -} - -/* * Return the status of a device as a text string for each * target. */ dev_wait was meant to return table status not dev status. 
[Alasdair Kergon] --- diff/drivers/md/dm-ioctl.c 2003-08-26 16:28:21.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 16:28:26.000000000 +0100 @@ -768,6 +768,7 @@ { int r; struct mapped_device *md; + struct dm_table *table; DECLARE_WAITQUEUE(wq, current); md = find_device(param); @@ -790,7 +791,16 @@ * him and save an ioctl. */ r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + out: dm_put(md); return r; } Fix the error message when the linear target gets handed more than 2 arguments. [Alasdair Kergon] --- diff/drivers/md/dm-linear.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-linear.c 2003-08-26 16:28:31.000000000 +0100 @@ -27,7 +27,7 @@ struct linear_c *lc; if (argc != 2) { - ti->error = "dm-linear: Not enough arguments"; + ti->error = "dm-linear: Invalid argument count"; return -EINVAL; } Support an arbitrary number of target parameters. [Alasdair Kergon] --- diff/drivers/md/dm-table.c 2003-08-26 13:59:04.000000000 +0100 +++ source/drivers/md/dm-table.c 2003-08-26 16:28:39.000000000 +0100 @@ -441,12 +441,36 @@ } /* + * Used to dynamically allocate the arg array. + */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ + char **argv; + unsigned new_size; + + new_size = *array_size ? *array_size * 2 : 64; + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (argv) { + memcpy(argv, old_argv, *array_size * sizeof(*argv)); + *array_size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* * Destructively splits up the argument list to pass to ctr. */ -static int split_args(int max, int *argc, char **argv, char *input) +static int split_args(int *argc, char ***argvp, char *input) { - char *start, *end = input, *out; + char *start, *end = input, *out, **argv = NULL; + unsigned array_size = 0; + *argc = 0; + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; while (1) { start = end; @@ -475,8 +499,11 @@ } /* have we already filled the array ? */ - if ((*argc + 1) > max) - return -EINVAL; + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } /* we know this is whitespace */ if (*end) @@ -488,6 +515,7 @@ (*argc)++; } + *argvp = argv; return 0; } @@ -495,7 +523,7 @@ sector_t start, sector_t len, char *params) { int r = -EINVAL, argc; - char *argv[32]; + char **argv; struct dm_target *tgt; if ((r = check_space(t))) @@ -524,13 +552,14 @@ goto bad; } - r = split_args(ARRAY_SIZE(argv), &argc, argv, params); + r = split_args(&argc, &argv, params); if (r) { - tgt->error = "couldn't split parameters"; + tgt->error = "couldn't split parameters (insufficient memory)"; goto bad; } r = tgt->type->ctr(tgt, argc, argv); + kfree(argv); if (r) goto bad; Fix VFS patch --- diff/fs/buffer.c 2003-08-26 16:27:21.000000000 +0100 +++ source/fs/buffer.c 2003-08-26 16:56:31.000000000 +0100 @@ -393,7 +393,7 @@ ** after these are done */ sync_inodes(dev); - DQUOT_SYNC(dev); + DQUOT_SYNC_DEV(dev); /* if inodes or quotas could be dirtied during the ** sync_supers_lockfs call, the FS is responsible for getting ** them on disk, without deadlocking against the lock The ioctl interface always knows how many targets are going to be in the table, so remove the dynamic array sizing code in dm-table.c. This fixes a problem with large tables where the dm_target pointer passed to the target ctr was becoming invalid.
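To make that failure mode concrete, here is a small self-contained userspace sketch of the hazard (plain C, not kernel code; struct and variable names are illustrative): a pointer taken before the array grows can be left dangling once the array is copied to a new allocation, which is what happened to the dm_target pointer handed to a target's ctr when the old alloc_targets() doubled the arrays.

#include <stdio.h>
#include <stdlib.h>

struct tgt { unsigned long begin, len; };	/* stand-in for struct dm_target */

int main(void)
{
	unsigned allocated = 1, used = 0;
	struct tgt *table = calloc(allocated, sizeof(*table));
	struct tgt *first, *bigger;

	if (!table)
		return 1;
	first = &table[used++];		/* pointer handed out, like ti in a ctr */

	/* the table grows: the old dynamic sizing doubled and copied the array */
	allocated *= 2;
	bigger = realloc(table, allocated * sizeof(*table));
	if (!bigger) {
		free(table);
		return 1;
	}
	table = bigger;

	/* if realloc moved the block, 'first' now dangles and must not be
	 * used; sizing the array once from the known target count, as this
	 * patch does, removes the reallocation entirely */
	printf("allocated %u slots\n", allocated);
	free(table);
	return 0;
}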
--- diff/drivers/md/dm-ioctl.c 2003-08-26 16:28:26.000000000 +0100 +++ source/drivers/md/dm-ioctl.c 2003-08-26 16:58:16.000000000 +0100 @@ -871,7 +871,7 @@ struct hash_cell *hc; struct dm_table *t; - r = dm_table_create(&t, get_mode(param)); + r = dm_table_create(&t, get_mode(param), param->target_count); if (r) return r; --- diff/drivers/md/dm-table.c 2003-08-26 16:28:39.000000000 +0100 +++ source/drivers/md/dm-table.c 2003-08-26 17:05:06.000000000 +0100 @@ -112,42 +112,7 @@ return 0; } -/* - * highs, and targets are managed as dynamic arrays during a - * table load. - */ -static int alloc_targets(struct dm_table *t, unsigned int num) -{ - sector_t *n_highs; - struct dm_target *n_targets; - int n = t->num_targets; - - /* - * Allocate both the target array and offset array at once. - */ - n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) + - sizeof(sector_t), num); - if (!n_highs) - return -ENOMEM; - - n_targets = (struct dm_target *) (n_highs + num); - - if (n) { - memcpy(n_highs, t->highs, sizeof(*n_highs) * n); - memcpy(n_targets, t->targets, sizeof(*n_targets) * n); - } - - memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); - vfree(t->highs); - - t->num_allocated = num; - t->highs = n_highs; - t->targets = n_targets; - - return 0; -} - -int dm_table_create(struct dm_table **result, int mode) +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) { struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO); @@ -158,13 +123,17 @@ INIT_LIST_HEAD(&t->devices); atomic_set(&t->holders, 1); - /* allocate a single nodes worth of targets to begin with */ - if (alloc_targets(t, KEYS_PER_NODE)) { + + /* allocate both the target array and offset array at once */ + t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) + + sizeof(sector_t), num_targets); + if (!t->highs) { kfree(t); - t = NULL; return -ENOMEM; } + t->targets = (struct dm_target *) (t->highs + num_targets); + t->num_allocated = num_targets; t->mode = mode; *result = t; return 0; @@ -224,17 +193,6 @@ } /* - * Checks to see if we need to extend highs or targets. - */ -static inline int check_space(struct dm_table *t) -{ - if (t->num_targets >= t->num_allocated) - return alloc_targets(t, t->num_allocated * 2); - - return 0; -} - -/* * Convert a device path to a dev_t. */ static int lookup_device(const char *path, kdev_t *dev) @@ -526,8 +484,8 @@ char **argv; struct dm_target *tgt; - if ((r = check_space(t))) - return r; + if (t->num_targets >= t->num_allocated) + return -ENOMEM; tgt = t->targets + t->num_targets; memset(tgt, 0, sizeof(*tgt)); --- diff/drivers/md/dm.h 2003-08-26 16:45:44.000000000 +0100 +++ source/drivers/md/dm.h 2003-08-26 17:04:02.000000000 +0100 @@ -96,7 +96,7 @@ * Functions for manipulating a table. Tables are also reference * counted. *---------------------------------------------------------------*/ -int dm_table_create(struct dm_table **result, int mode); +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); void dm_table_get(struct dm_table *t); void dm_table_put(struct dm_table *t);
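As a closing illustration of the new allocation scheme, here is a self-contained userspace approximation of how dm_table_create() now lays out its two arrays in a single block sized from the ioctl's target count. calloc() stands in for the kernel's vcalloc(), and the sector_t typedef, the dm_target_stub struct and num_targets value are illustrative assumptions, not kernel definitions.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long sector_t;		/* stand-in for the kernel type */
struct dm_target_stub { sector_t begin, len; };	/* stand-in for struct dm_target */

int main(void)
{
	unsigned num_targets = 4;	/* param->target_count in the patch */
	sector_t *highs;
	struct dm_target_stub *targets;

	/* one zeroed block: num_targets 'high' sectors followed by the targets */
	highs = calloc(num_targets,
		       sizeof(struct dm_target_stub) + sizeof(sector_t));
	if (!highs)
		return 1;

	targets = (struct dm_target_stub *)(highs + num_targets);
	printf("room for %u targets at %p, no reallocation needed\n",
	       num_targets, (void *)targets);

	free(highs);
	return 0;
}

Because the block is sized once up front, dm_table_add_target() can simply return -ENOMEM if the count is exceeded, and the dm_target pointers handed to constructors stay valid for the life of the table.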