diff -urNp --exclude CVS --exclude BitKeeper x-ref/Documentation/Configure.help x/Documentation/Configure.help --- x-ref/Documentation/Configure.help 2003-07-17 05:44:47.000000000 +0200 +++ x/Documentation/Configure.help 2003-07-17 05:44:48.000000000 +0200 @@ -1839,6 +1839,20 @@ CONFIG_BLK_DEV_LVM want), say M here and read . The module will be called lvm-mod.o. +Device-mapper support +CONFIG_BLK_DEV_DM + Device-mapper is a low level volume manager. It works by allowing + people to specify mappings for ranges of logical sectors. Various + mapping types are available, in addition people may write their own + modules containing custom mappings if they wish. + + Higher level volume managers such as LVM2 use this driver. + + If you want to compile this as a module, say M here and read + . The module will be called dm-mod.o. + + If unsure, say N. + Multiple devices driver support (RAID and LVM) CONFIG_MD Support multiple physical spindles through a single logical device. diff -urNp --exclude CVS --exclude BitKeeper x-ref/MAINTAINERS x/MAINTAINERS --- x-ref/MAINTAINERS 2003-07-17 05:44:45.000000000 +0200 +++ x/MAINTAINERS 2003-07-17 05:44:48.000000000 +0200 @@ -506,6 +506,13 @@ M: dz@debian.org W: http://www.debian.org/~dz/i8k/ S: Maintained +DEVICE MAPPER +P: Joe Thornber +M: dm@uk.sistina.com +L: linux-LVM@sistina.com +W: http://www.sistina.com/lvm +S: Maintained + DEVICE NUMBER REGISTRY P: H. Peter Anvin M: hpa@zytor.com diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/mips64/kernel/ioctl32.c x/arch/mips64/kernel/ioctl32.c --- x-ref/arch/mips64/kernel/ioctl32.c 2003-03-15 03:24:56.000000000 +0100 +++ x/arch/mips64/kernel/ioctl32.c 2003-07-17 05:44:48.000000000 +0200 @@ -33,6 +33,7 @@ #include #include #include +#include #include #undef __KERNEL__ /* This file was born to be ugly ... */ @@ -915,6 +916,20 @@ static struct ioctl32_list ioctl32_handl IOCTL32_DEFAULT(RESTART_ARRAY_RW), #endif /* CONFIG_MD */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_DEV_RELOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_DEV_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TARGET_STATUS), + IOCTL32_DEFAULT(DM_TARGET_WAIT), +#endif /* CONFIG_BLK_DEV_DM */ + IOCTL32_DEFAULT(MTIOCTOP), /* mtio.h ioctls */ IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans), IOCTL32_HANDLER(MTIOCPOS32, mt_ioctl_trans), diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/parisc/kernel/ioctl32.c x/arch/parisc/kernel/ioctl32.c --- x-ref/arch/parisc/kernel/ioctl32.c 2003-07-15 02:05:40.000000000 +0200 +++ x/arch/parisc/kernel/ioctl32.c 2003-07-17 05:44:48.000000000 +0200 @@ -55,6 +55,7 @@ #define max max */ #include #endif /* LVM */ +#include #include /* Ugly hack. 
*/ @@ -3423,6 +3424,20 @@ COMPATIBLE_IOCTL(LE_REMAP) COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_DEV_RELOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_DEV_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_WAIT) +#endif /* CONFIG_BLK_DEV_DM */ #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE) COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC) COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID) diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/ppc64/kernel/ioctl32.c x/arch/ppc64/kernel/ioctl32.c --- x-ref/arch/ppc64/kernel/ioctl32.c 2003-07-15 02:05:40.000000000 +0200 +++ x/arch/ppc64/kernel/ioctl32.c 2003-07-17 05:44:48.000000000 +0200 @@ -66,6 +66,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4435,6 +4436,20 @@ COMPATIBLE_IOCTL(NBD_CLEAR_QUE), COMPATIBLE_IOCTL(NBD_PRINT_DEBUG), COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS), COMPATIBLE_IOCTL(NBD_DISCONNECT), +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION), +COMPATIBLE_IOCTL(DM_REMOVE_ALL), +COMPATIBLE_IOCTL(DM_DEV_CREATE), +COMPATIBLE_IOCTL(DM_DEV_REMOVE), +COMPATIBLE_IOCTL(DM_DEV_RELOAD), +COMPATIBLE_IOCTL(DM_DEV_SUSPEND), +COMPATIBLE_IOCTL(DM_DEV_RENAME), +COMPATIBLE_IOCTL(DM_DEV_DEPS), +COMPATIBLE_IOCTL(DM_DEV_STATUS), +COMPATIBLE_IOCTL(DM_TARGET_STATUS), +COMPATIBLE_IOCTL(DM_TARGET_WAIT), +#endif /* CONFIG_BLK_DEV_DM */ /* Remove *PRIVATE in 2.5 */ COMPATIBLE_IOCTL(SIOCDEVPRIVATE), COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1), diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/s390x/kernel/ioctl32.c x/arch/s390x/kernel/ioctl32.c --- x-ref/arch/s390x/kernel/ioctl32.c 2003-07-15 02:05:40.000000000 +0200 +++ x/arch/s390x/kernel/ioctl32.c 2003-07-17 05:45:19.000000000 +0200 @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -627,6 +628,18 @@ static struct ioctl32_list ioctl32_handl IOCTL32_DEFAULT(SIOCGSTAMP), + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_DEV_RELOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_DEV_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TARGET_STATUS), + IOCTL32_DEFAULT(DM_TARGET_WAIT), + IOCTL32_DEFAULT(LOOP_SET_FD), IOCTL32_DEFAULT(LOOP_CLR_FD), diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/sparc64/kernel/ioctl32.c x/arch/sparc64/kernel/ioctl32.c --- x-ref/arch/sparc64/kernel/ioctl32.c 2003-07-15 02:05:40.000000000 +0200 +++ x/arch/sparc64/kernel/ioctl32.c 2003-07-17 05:44:48.000000000 +0200 @@ -56,6 +56,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. 
*/ @@ -5103,6 +5104,21 @@ COMPATIBLE_IOCTL(VIDEO1394_IOC_TALK_CHAN COMPATIBLE_IOCTL(VIDEO1394_IOC_UNTALK_CHANNEL) #endif +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_DEV_RELOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_DEV_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_WAIT) +#endif /* CONFIG_BLK_DEV_DM */ + /* And these ioctls need translation */ #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE) HANDLE_IOCTL(DV1394_IOC32_INIT, handle_dv1394_init) diff -urNp --exclude CVS --exclude BitKeeper x-ref/arch/x86_64/ia32/ia32_ioctl.c x/arch/x86_64/ia32/ia32_ioctl.c --- x-ref/arch/x86_64/ia32/ia32_ioctl.c 2003-07-15 02:05:40.000000000 +0200 +++ x/arch/x86_64/ia32/ia32_ioctl.c 2003-07-17 05:44:48.000000000 +0200 @@ -67,6 +67,7 @@ #define max max #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4047,6 +4048,20 @@ COMPATIBLE_IOCTL(LE_REMAP) COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_DEV_RELOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_DEV_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_STATUS) +COMPATIBLE_IOCTL(DM_TARGET_WAIT) +#endif /* CONFIG_BLK_DEV_DM */ #ifdef CONFIG_AUTOFS_FS COMPATIBLE_IOCTL(AUTOFS_IOC_READY) COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL) diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/Config.in x/drivers/md/Config.in --- x-ref/drivers/md/Config.in 2003-03-15 03:25:02.000000000 +0100 +++ x/drivers/md/Config.in 2003-07-17 05:44:48.000000000 +0200 @@ -14,5 +14,8 @@ dep_tristate ' RAID-4/RAID-5 mode' CONF dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD +fi endmenu diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/Makefile x/drivers/md/Makefile --- x-ref/drivers/md/Makefile 2003-03-15 03:25:02.000000000 +0100 +++ x/drivers/md/Makefile 2003-07-17 05:44:48.000000000 +0200 @@ -4,9 +4,12 @@ O_TARGET := mddev.o -export-objs := md.o xor.o +export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o list-multi := lvm-mod.o lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \ + kcopyd.o # Note: link order is important. 
All raid personalities # and xor.o must come before md.o, as they each initialise @@ -20,8 +23,12 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_BLK_DEV_MD) += md.o obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o include $(TOPDIR)/Rules.make lvm-mod.o: $(lvm-mod-objs) $(LD) -r -o $@ $(lvm-mod-objs) + +dm-mod.o: $(dm-mod-objs) + $(LD) -r -o $@ $(dm-mod-objs) diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-exception-store.c x/drivers/md/dm-exception-store.c --- x-ref/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-exception-store.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,704 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm-snapshot.h" +#include "kcopyd.h" + +#include +#include +#include +#include + +/*----------------------------------------------------------------- + * Persistent snapshots, by persistent we mean that the snapshot + * will survive a reboot. + *---------------------------------------------------------------*/ + +/* + * We need to store a record of which parts of the origin have + * been copied to the snapshot device. The snapshot code + * requires that we copy exception chunks to chunk aligned areas + * of the COW store. It makes sense therefore, to store the + * metadata in chunk size blocks. + * + * There is no backward or forward compatibility implemented, + * snapshots with different disk versions than the kernel will + * not be usable. It is expected that "lvcreate" will blank out + * the start of a fresh COW device before calling the snapshot + * constructor. + * + * The first chunk of the COW device just contains the header. + * After this there is a chunk filled with exception metadata, + * followed by as many exception chunks as can fit in the + * metadata areas. + * + * All on disk structures are in little-endian format. The end + * of the exceptions info is indicated by an exception with a + * new_chunk of 0, which is invalid since it would point to the + * header chunk. + */ + +/* + * Magic for persistent snapshots: "SnAp" - Feeble isn't it. + */ +#define SNAP_MAGIC 0x70416e53 + +/* + * The on-disk version of the metadata. + */ +#define SNAPSHOT_DISK_VERSION 1 + +struct disk_header { + uint32_t magic; + + /* + * Is this snapshot valid. There is no way of recovering + * an invalid snapshot. + */ + int valid; + + /* + * Simple, incrementing version. no backward + * compatibility. + */ + uint32_t version; + + /* In sectors */ + uint32_t chunk_size; +}; + +struct disk_exception { + uint64_t old_chunk; + uint64_t new_chunk; +}; + +struct commit_callback { + void (*callback) (void *, int success); + void *context; +}; + +/* + * The top level structure for a persistent exception store. + */ +struct pstore { + struct dm_snapshot *snap; /* up pointer to my snapshot */ + int version; + int valid; + uint32_t chunk_size; + uint32_t exceptions_per_area; + + /* + * Now that we have an asynchronous kcopyd there is no + * need for large chunk sizes, so it wont hurt to have a + * whole chunks worth of metadata in memory at once. + */ + void *area; + struct kiobuf *iobuf; + + /* + * Used to keep track of which metadata area the data in + * 'chunk' refers to. + */ + uint32_t current_area; + + /* + * The next free chunk for an exception. 
+ */ + uint32_t next_free; + + /* + * The index of next free exception in the current + * metadata area. + */ + uint32_t current_committed; + + atomic_t pending_count; + uint32_t callback_count; + struct commit_callback *callbacks; +}; + +/* + * For performance reasons we want to defer writing a committed + * exceptions metadata to disk so that we can amortise away this + * exensive operation. + * + * For the initial version of this code we will remain with + * synchronous io. There are some deadlock issues with async + * that I haven't yet worked out. + */ +static int do_io(int rw, struct kcopyd_region *where, struct kiobuf *iobuf) +{ + int i, sectors_per_block, nr_blocks, start; + int blocksize = get_hardsect_size(where->dev); + int status; + + sectors_per_block = blocksize / SECTOR_SIZE; + + nr_blocks = where->count / sectors_per_block; + start = where->sector / sectors_per_block; + + for (i = 0; i < nr_blocks; i++) + iobuf->blocks[i] = start++; + + iobuf->length = where->count << 9; + iobuf->locked = 1; + + status = brw_kiovec(rw, 1, &iobuf, where->dev, iobuf->blocks, + blocksize); + if (status != (where->count << 9)) + return -EIO; + + return 0; +} + +static int allocate_iobuf(struct pstore *ps) +{ + size_t i, r = -ENOMEM, len, nr_pages; + struct page *page; + + len = ps->chunk_size << SECTOR_SHIFT; + + /* + * Allocate the chunk_size block of memory that will hold + * a single metadata area. + */ + ps->area = vmalloc(len); + if (!ps->area) + return r; + + if (alloc_kiovec(1, &ps->iobuf)) + goto bad; + + nr_pages = ps->chunk_size / (PAGE_SIZE / SECTOR_SIZE); + r = expand_kiobuf(ps->iobuf, nr_pages); + if (r) + goto bad; + + /* + * We lock the pages for ps->area into memory since they'll be + * doing a lot of io. + */ + for (i = 0; i < nr_pages; i++) { + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE)); + LockPage(page); + ps->iobuf->maplist[i] = page; + ps->iobuf->nr_pages++; + } + + ps->iobuf->nr_pages = nr_pages; + ps->iobuf->offset = 0; + + return 0; + + bad: + if (ps->iobuf) + free_kiovec(1, &ps->iobuf); + + if (ps->area) + vfree(ps->area); + ps->iobuf = NULL; + return r; +} + +static void free_iobuf(struct pstore *ps) +{ + int i; + + for (i = 0; i < ps->iobuf->nr_pages; i++) + UnlockPage(ps->iobuf->maplist[i]); + ps->iobuf->locked = 0; + + free_kiovec(1, &ps->iobuf); + vfree(ps->area); +} + +/* + * Read or write a chunk aligned and sized block of data from a device. + */ +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw) +{ + int r; + struct kcopyd_region where; + + where.dev = ps->snap->cow->dev; + where.sector = ps->chunk_size * chunk; + where.count = ps->chunk_size; + + r = do_io(rw, &where, ps->iobuf); + if (r) + return r; + + return 0; +} + +/* + * Read or write a metadata area. Remembering to skip the first + * chunk which holds the header. 
+ */ +static int area_io(struct pstore *ps, uint32_t area, int rw) +{ + int r; + uint32_t chunk; + + /* convert a metadata area index to a chunk index */ + chunk = 1 + ((ps->exceptions_per_area + 1) * area); + + r = chunk_io(ps, chunk, rw); + if (r) + return r; + + ps->current_area = area; + return 0; +} + +static int zero_area(struct pstore *ps, uint32_t area) +{ + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + return area_io(ps, area, WRITE); +} + +static int read_header(struct pstore *ps, int *new_snapshot) +{ + int r; + struct disk_header *dh; + + r = chunk_io(ps, 0, READ); + if (r) + return r; + + dh = (struct disk_header *) ps->area; + + if (dh->magic == 0) { + *new_snapshot = 1; + + } else if (dh->magic == SNAP_MAGIC) { + *new_snapshot = 0; + ps->valid = dh->valid; + ps->version = dh->version; + ps->chunk_size = dh->chunk_size; + + } else { + DMWARN("Invalid/corrupt snapshot"); + r = -ENXIO; + } + + return r; +} + +static int write_header(struct pstore *ps) +{ + struct disk_header *dh; + + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT); + + dh = (struct disk_header *) ps->area; + dh->magic = SNAP_MAGIC; + dh->valid = ps->valid; + dh->version = ps->version; + dh->chunk_size = ps->chunk_size; + + return chunk_io(ps, 0, WRITE); +} + +/* + * Access functions for the disk exceptions, these do the endian conversions. + */ +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) +{ + if (index >= ps->exceptions_per_area) + return NULL; + + return ((struct disk_exception *) ps->area) + index; +} + +static int read_exception(struct pstore *ps, + uint32_t index, struct disk_exception *result) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + result->old_chunk = le64_to_cpu(e->old_chunk); + result->new_chunk = le64_to_cpu(e->new_chunk); + + return 0; +} + +static int write_exception(struct pstore *ps, + uint32_t index, struct disk_exception *de) +{ + struct disk_exception *e; + + e = get_exception(ps, index); + if (!e) + return -EINVAL; + + /* copy it */ + e->old_chunk = cpu_to_le64(de->old_chunk); + e->new_chunk = cpu_to_le64(de->new_chunk); + + return 0; +} + +/* + * Registers the exceptions that are present in the current area. + * 'full' is filled in to indicate if the area has been + * filled. + */ +static int insert_exceptions(struct pstore *ps, int *full) +{ + int i, r; + struct disk_exception de; + + /* presume the area is full */ + *full = 1; + + for (i = 0; i < ps->exceptions_per_area; i++) { + r = read_exception(ps, i, &de); + + if (r) + return r; + + /* + * If the new_chunk is pointing at the start of + * the COW device, where the first metadata area + * is we know that we've hit the end of the + * exceptions. Therefore the area is not full. + */ + if (de.new_chunk == 0LL) { + ps->current_committed = i; + *full = 0; + break; + } + + /* + * Keep track of the start of the free chunks. + */ + if (ps->next_free <= de.new_chunk) + ps->next_free = de.new_chunk + 1; + + /* + * Otherwise we add the exception to the snapshot. + */ + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk); + if (r) + return r; + } + + return 0; +} + +static int read_exceptions(struct pstore *ps) +{ + uint32_t area; + int r, full = 1; + + /* + * Keeping reading chunks and inserting exceptions until + * we find a partially full area. 
+ */ + for (area = 0; full; area++) { + r = area_io(ps, area, READ); + if (r) + return r; + + r = insert_exceptions(ps, &full); + if (r) + return r; + + area++; + } + + return 0; +} + +static inline struct pstore *get_info(struct exception_store *store) +{ + return (struct pstore *) store->context; +} + +static int persistent_percentfull(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + return (ps->next_free * store->snap->chunk_size * 100) / + get_dev_size(store->snap->cow->dev); +} + +static void persistent_destroy(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + vfree(ps->callbacks); + free_iobuf(ps); + kfree(ps); +} + +static int persistent_prepare(struct exception_store *store, + struct exception *e) +{ + struct pstore *ps = get_info(store); + uint32_t stride; + sector_t size = get_dev_size(store->snap->cow->dev); + + /* Is there enough room ? */ + if (size < ((ps->next_free + 1) * store->snap->chunk_size)) + return -ENOSPC; + + e->new_chunk = ps->next_free; + + /* + * Move onto the next free pending, making sure to take + * into account the location of the metadata chunks. + */ + stride = (ps->exceptions_per_area + 1); + if ((++ps->next_free % stride) == 1) + ps->next_free++; + + atomic_inc(&ps->pending_count); + return 0; +} + +static void persistent_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + int r, i; + struct pstore *ps = get_info(store); + struct disk_exception de; + struct commit_callback *cb; + + de.old_chunk = e->old_chunk; + de.new_chunk = e->new_chunk; + write_exception(ps, ps->current_committed++, &de); + + /* + * Add the callback to the back of the array. This code + * is the only place where the callback array is + * manipulated, and we know that it will never be called + * multiple times concurrently. + */ + cb = ps->callbacks + ps->callback_count++; + cb->callback = callback; + cb->context = callback_context; + + /* + * If there are no more exceptions in flight, or we have + * filled this metadata area we commit the exceptions to + * disk. + */ + if (atomic_dec_and_test(&ps->pending_count) || + (ps->current_committed == ps->exceptions_per_area)) { + r = area_io(ps, ps->current_area, WRITE); + if (r) + ps->valid = 0; + + for (i = 0; i < ps->callback_count; i++) { + cb = ps->callbacks + i; + cb->callback(cb->context, r == 0 ? 1 : 0); + } + + ps->callback_count = 0; + } + + /* + * Have we completely filled the current area ? + */ + if (ps->current_committed == ps->exceptions_per_area) { + ps->current_committed = 0; + r = zero_area(ps, ps->current_area + 1); + if (r) + ps->valid = 0; + } +} + +static void persistent_drop(struct exception_store *store) +{ + struct pstore *ps = get_info(store); + + ps->valid = 0; + if (write_header(ps)) + DMWARN("write header failed"); +} + +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size) +{ + int r, new_snapshot; + struct pstore *ps; + + /* allocate the pstore */ + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + ps->snap = store->snap; + ps->valid = 1; + ps->version = SNAPSHOT_DISK_VERSION; + ps->chunk_size = chunk_size; + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) / + sizeof(struct disk_exception); + ps->next_free = 2; /* skipping the header and first area */ + ps->current_committed = 0; + + r = allocate_iobuf(ps); + if (r) + goto bad; + + /* + * Allocate space for all the callbacks. 
+ */ + ps->callback_count = 0; + atomic_set(&ps->pending_count, 0); + ps->callbacks = vcalloc(ps->exceptions_per_area, + sizeof(*ps->callbacks)); + + if (!ps->callbacks) + goto bad; + + /* + * Read the snapshot header. + */ + r = read_header(ps, &new_snapshot); + if (r) + goto bad; + + /* + * Do we need to setup a new snapshot ? + */ + if (new_snapshot) { + r = write_header(ps); + if (r) { + DMWARN("write_header failed"); + goto bad; + } + + r = zero_area(ps, 0); + if (r) { + DMWARN("zero_area(0) failed"); + goto bad; + } + + } else { + /* + * Sanity checks. + */ + if (!ps->valid) { + DMWARN("snapshot is marked invalid"); + r = -EINVAL; + goto bad; + } + + if (ps->chunk_size != chunk_size) { + DMWARN("chunk size for existing snapshot different " + "from that requested"); + r = -EINVAL; + goto bad; + } + + if (ps->version != SNAPSHOT_DISK_VERSION) { + DMWARN("unable to handle snapshot disk version %d", + ps->version); + r = -EINVAL; + goto bad; + } + + /* + * Read the metadata. + */ + r = read_exceptions(ps); + if (r) + goto bad; + } + + store->destroy = persistent_destroy; + store->prepare_exception = persistent_prepare; + store->commit_exception = persistent_commit; + store->drop_snapshot = persistent_drop; + store->percent_full = persistent_percentfull; + store->context = ps; + + return r; + + bad: + if (ps) { + if (ps->callbacks) + vfree(ps->callbacks); + + if (ps->iobuf) + free_iobuf(ps); + + kfree(ps); + } + return r; +} + +/*----------------------------------------------------------------- + * Implementation of the store for non-persistent snapshots. + *---------------------------------------------------------------*/ +struct transient_c { + sector_t next_free; +}; + +void transient_destroy(struct exception_store *store) +{ + kfree(store->context); +} + +int transient_prepare(struct exception_store *store, struct exception *e) +{ + struct transient_c *tc = (struct transient_c *) store->context; + sector_t size = get_dev_size(store->snap->cow->dev); + + if (size < (tc->next_free + store->snap->chunk_size)) + return -1; + + e->new_chunk = sector_to_chunk(store->snap, tc->next_free); + tc->next_free += store->snap->chunk_size; + + return 0; +} + +void transient_commit(struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context) +{ + /* Just succeed */ + callback(callback_context, 1); +} + +static int transient_percentfull(struct exception_store *store) +{ + struct transient_c *tc = (struct transient_c *) store->context; + return (tc->next_free * 100) / get_dev_size(store->snap->cow->dev); +} + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize) +{ + struct transient_c *tc; + + memset(store, 0, sizeof(*store)); + store->destroy = transient_destroy; + store->prepare_exception = transient_prepare; + store->commit_exception = transient_commit; + store->percent_full = transient_percentfull; + store->snap = s; + + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); + if (!tc) + return -ENOMEM; + + tc->next_free = 0; + store->context = tc; + + return 0; +} diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-ioctl.c x/drivers/md/dm-ioctl.c --- x-ref/drivers/md/dm-ioctl.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-ioctl.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,1160 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. 
+ */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DM_DRIVER_EMAIL "dm@uk.sistina.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. + *---------------------------------------------------------------*/ +struct hash_cell { + struct list_head name_list; + struct list_head uuid_list; + + char *name; + char *uuid; + struct mapped_device *md; + + /* I hate devfs */ + devfs_handle_t devfs_entry; +}; + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static devfs_handle_t _dev_dir; +void dm_hash_remove_all(void); + +/* + * Guards access to all three tables. + */ +static DECLARE_RWSEM(_hash_lock); + +static void init_buckets(struct list_head *buckets) +{ + unsigned int i; + + for (i = 0; i < NUM_BUCKETS; i++) + INIT_LIST_HEAD(buckets + i); +} + +int dm_hash_init(void) +{ + init_buckets(_name_buckets); + init_buckets(_uuid_buckets); + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL); + return 0; +} + +void dm_hash_exit(void) +{ + dm_hash_remove_all(); + devfs_unregister(_dev_dir); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ + const unsigned int hash_mult = 2654435387U; + unsigned int h = 0; + + while (*str) + h = (h + (unsigned int) *str++) * hash_mult; + + return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each(tmp, _name_buckets + h) { + hc = list_entry(tmp, struct hash_cell, name_list); + if (!strcmp(hc->name, str)) + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each(tmp, _uuid_buckets + h) { + hc = list_entry(tmp, struct hash_cell, uuid_list); + if (!strcmp(hc->uuid, str)) + return hc; + } + + return NULL; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. 
+ *---------------------------------------------------------------*/ +static inline char *kstrdup(const char *str) +{ + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); + if (r) + strcpy(r, str); + return r; +} + +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + INIT_LIST_HEAD(&hc->name_list); + INIT_LIST_HEAD(&hc->uuid_list); + hc->md = md; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * devfs stuff. + */ +static int register_with_devfs(struct hash_cell *hc) +{ + kdev_t dev = dm_kdev(hc->md); + + hc->devfs_entry = + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER, + major(dev), minor(dev), + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, + &dm_blk_dops, NULL); + + return 0; +} + +static int unregister_with_devfs(struct hash_cell *hc) +{ + devfs_unregister(hc->devfs_entry); + return 0; +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into all three hash tables. + */ + down_write(&_hash_lock); + if (__get_name_cell(name)) + goto bad; + + list_add(&cell->name_list, _name_buckets + hash_str(name)); + + if (uuid) { + if (__get_uuid_cell(uuid)) { + list_del(&cell->name_list); + goto bad; + } + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + } + register_with_devfs(cell); + dm_get(md); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +void __hash_remove(struct hash_cell *hc) +{ + /* remove from the dev hash */ + list_del(&hc->uuid_list); + list_del(&hc->name_list); + unregister_with_devfs(hc); + dm_put(hc->md); +} + +void dm_hash_remove_all(void) +{ + int i; + struct hash_cell *hc; + struct list_head *tmp, *n; + + down_write(&_hash_lock); + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_safe(tmp, n, _name_buckets + i) { + hc = list_entry(tmp, struct hash_cell, name_list); + __hash_remove(hc); + } + } + up_write(&_hash_lock); +} + +int dm_hash_rename(const char *old, const char *new) +{ + char *new_name, *old_name; + struct hash_cell *hc; + + /* + * duplicate new. + */ + new_name = kstrdup(new); + if (!new_name) + return -ENOMEM; + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + hc = __get_name_cell(new); + if (hc) { + DMWARN("asked to rename to an already existing name %s -> %s", + old, new); + up_write(&_hash_lock); + return -EBUSY; + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(old); + if (!hc) { + DMWARN("asked to rename a non existent device %s -> %s", + old, new); + up_write(&_hash_lock); + return -ENXIO; + } + + /* + * rename and move the name cell. 
+ */ + list_del(&hc->name_list); + old_name = hc->name; + hc->name = new_name; + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + /* rename the device node in devfs */ + unregister_with_devfs(hc); + register_with_devfs(hc); + + up_write(&_hash_lock); + kfree(old_name); + return 0; +} + + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ + +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, struct dm_ioctl *user); + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int valid_str(char *str, void *begin, void *end) +{ + while (((void *) str >= begin) && ((void *) str < end)) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, + void *begin, void *end, + struct dm_target_spec **spec, char **params) +{ + *spec = (struct dm_target_spec *) + ((unsigned char *) last + next); + *params = (char *) (*spec + 1); + + if (*spec < (last + 1) || ((void *) *spec > end)) + return -EINVAL; + + return valid_str(*params, begin, end); +} + +static int populate_table(struct dm_table *table, struct dm_ioctl *args) +{ + int i = 0, r, first = 1; + struct dm_target_spec *spec; + char *params; + void *begin, *end; + + if (!args->target_count) { + DMWARN("populate_table: no targets specified"); + return -EINVAL; + } + + begin = (void *) args; + end = begin + args->data_size; + + for (i = 0; i < args->target_count; i++) { + + if (first) + r = next_target((struct dm_target_spec *) args, + args->data_start, + begin, end, &spec, ¶ms); + else + r = next_target(spec, spec->next, begin, end, + &spec, ¶ms); + + if (r) { + DMWARN("unable to find target"); + return -EINVAL; + } + + r = dm_table_add_target(table, spec->target_type, + spec->sector_start, spec->length, + params); + if (r) { + DMWARN("error adding target to table"); + return -EINVAL; + } + + first = 0; + } + + return dm_table_complete(table); +} + +/* + * Round up the ptr to the next 'align' boundary. Obviously + * 'align' must be a power of 2. + */ +static inline void *align_ptr(void *ptr, unsigned int align) +{ + align--; + return (void *) (((unsigned long) (ptr + align)) & ~align); +} + +/* + * Copies a dm_ioctl and an optional additional payload to + * userland. + */ +static int results_to_user(struct dm_ioctl *user, struct dm_ioctl *param, + void *data, uint32_t len) +{ + int r; + void *ptr = NULL; + + if (data) { + ptr = align_ptr(user + 1, sizeof(unsigned long)); + param->data_start = ptr - (void *) user; + } + + /* + * The version number has already been filled in, so we + * just copy later fields. + */ + r = copy_to_user(&user->data_size, ¶m->data_size, + sizeof(*param) - sizeof(param->version)); + if (r) + return -EFAULT; + + if (data) { + if (param->data_start + len > param->data_size) + return -ENOSPC; + + if (copy_to_user(ptr, data, len)) + r = -EFAULT; + } + + return r; +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. 
+ */ +static int __info(struct mapped_device *md, struct dm_ioctl *param) +{ + kdev_t dev = dm_kdev(md); + struct dm_table *table; + struct block_device *bdev; + + param->flags = DM_EXISTS_FLAG; + if (dm_suspended(md)) + param->flags |= DM_SUSPEND_FLAG; + + param->dev = kdev_t_to_nr(dev); + bdev = bdget(param->dev); + if (!bdev) + return -ENXIO; + + param->open_count = bdev->bd_openers; + bdput(bdev); + + if (is_read_only(dev)) + param->flags |= DM_READONLY_FLAG; + + table = dm_get_table(md); + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + + return 0; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name. + */ +static inline struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = *param->uuid ? __get_uuid_cell(param->uuid) : + __get_name_cell(param->name); + if (hc) { + md = hc->md; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strncpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1); + else + param->uuid[0] = '\0'; + + dm_get(md); + } + up_read(&_hash_lock); + + return md; +} + +#define ALIGNMENT sizeof(int) +static void *_align(void *ptr, unsigned int a) +{ + register unsigned long align = --a; + + return (void *) (((unsigned long) ptr + align) & ~align); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. + */ +static int info(struct dm_ioctl *param, struct dm_ioctl *user) +{ + struct mapped_device *md; + + param->flags = 0; + + md = find_device(param); + if (!md) + /* + * Device not found - returns cleared exists flag. + */ + goto out; + + __info(md, param); + dm_put(md); + + out: + return results_to_user(user, param, NULL, 0); +} + +static inline int get_mode(struct dm_ioctl *param) +{ + int mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMWARN("invalid device name"); + return -EINVAL; + } + + return 0; +} + +static int create(struct dm_ioctl *param, struct dm_ioctl *user) +{ + int r; + kdev_t dev; + struct dm_table *t; + struct mapped_device *md; + int minor; + + r = check_name(param->name); + if (r) + return r; + + r = dm_table_create(&t, get_mode(param)); + if (r) + return r; + + r = populate_table(t, param); + if (r) { + dm_table_put(t); + return r; + } + + minor = (param->flags & DM_PERSISTENT_DEV_FLAG) ? + minor(to_kdev_t(param->dev)) : -1; + + r = dm_create(minor, t, &md); + if (r) { + dm_table_put(t); + return r; + } + dm_table_put(t); /* md will have grabbed its own reference */ + + dev = dm_kdev(md); + set_device_ro(dev, (param->flags & DM_READONLY_FLAG)); + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + dm_put(md); + + return r ? 
r : info(param, user); +} + +/* + * Build up the status struct for each target + */ +static int __status(struct mapped_device *md, struct dm_ioctl *param, + char *outbuf, int *len) +{ + int i, num_targets; + struct dm_target_spec *spec; + char *outptr; + status_type_t type; + struct dm_table *table = dm_get_table(md); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + outptr = outbuf; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + if (outptr - outbuf + + sizeof(struct dm_target_spec) > param->data_size) { + dm_table_put(table); + return -ENOMEM; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + + /* Get the status/table string from the target driver */ + if (ti->type->status) + ti->type->status(ti, type, outptr, + outbuf + param->data_size - outptr); + else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + _align(outptr, ALIGNMENT); + spec->next = outptr - outbuf; + } + + param->target_count = num_targets; + *len = outptr - outbuf; + dm_table_put(table); + + return 0; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int get_status(struct dm_ioctl *param, struct dm_ioctl *user) +{ + struct mapped_device *md; + int len = 0; + int ret; + char *outbuf = NULL; + + md = find_device(param); + if (!md) + /* + * Device not found - returns cleared exists flag. + */ + goto out; + + /* We haven't a clue how long the resultant data will be so + just allocate as much as userland has allowed us and make sure + we don't overun it */ + outbuf = kmalloc(param->data_size, GFP_KERNEL); + if (!outbuf) + goto out; + /* + * Get the status of all targets + */ + __status(md, param, outbuf, &len); + + /* + * Setup the basic dm_ioctl structure. + */ + __info(md, param); + + out: + if (md) + dm_put(md); + + ret = results_to_user(user, param, outbuf, len); + + if (outbuf) + kfree(outbuf); + + return ret; +} + +/* + * Wait for a device to report an event + */ +static int wait_device_event(struct dm_ioctl *param, struct dm_ioctl *user) +{ + struct mapped_device *md; + struct dm_table *table; + DECLARE_WAITQUEUE(wq, current); + + md = find_device(param); + if (!md) + /* + * Device not found - returns cleared exists flag. + */ + goto out; + + /* + * Setup the basic dm_ioctl structure. + */ + __info(md, param); + + /* + * Wait for a notification event + */ + set_current_state(TASK_INTERRUPTIBLE); + table = dm_get_table(md); + dm_table_add_wait_queue(table, &wq); + dm_table_put(table); + dm_put(md); + + yield(); + set_current_state(TASK_RUNNING); + + out: + return results_to_user(user, param, NULL, 0); +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static int dep(struct dm_ioctl *param, struct dm_ioctl *user) +{ + int count, r; + struct mapped_device *md; + struct list_head *tmp; + size_t len = 0; + struct dm_target_deps *deps = NULL; + struct dm_table *table; + + md = find_device(param); + if (!md) + goto out; + table = dm_get_table(md); + + /* + * Setup the basic dm_ioctl structure. + */ + __info(md, param); + + /* + * Count the devices. 
+ */ + count = 0; + list_for_each(tmp, dm_table_get_devices(table)) + count++; + + /* + * Allocate a kernel space version of the dm_target_status + * struct. + */ + if (array_too_big(sizeof(*deps), sizeof(*deps->dev), count)) { + dm_table_put(table); + dm_put(md); + return -ENOMEM; + } + + len = sizeof(*deps) + (sizeof(*deps->dev) * count); + deps = kmalloc(len, GFP_KERNEL); + if (!deps) { + dm_table_put(table); + dm_put(md); + return -ENOMEM; + } + + /* + * Fill in the devices. + */ + deps->count = count; + count = 0; + list_for_each(tmp, dm_table_get_devices(table)) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + deps->dev[count++] = dd->bdev->bd_dev; + } + dm_table_put(table); + dm_put(md); + + out: + r = results_to_user(user, param, deps, len); + + kfree(deps); + return r; +} + +static int remove(struct dm_ioctl *param, struct dm_ioctl *user) +{ + struct hash_cell *hc; + + down_write(&_hash_lock); + hc = *param->uuid ? __get_uuid_cell(param->uuid) : + __get_name_cell(param->name); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -EINVAL; + } + + /* + * You may ask the interface to drop its reference to an + * in use device. This is no different to unlinking a + * file that someone still has open. The device will not + * actually be destroyed until the last opener closes it. + * The name and uuid of the device (both are interface + * properties) will be available for reuse immediately. + * + * You don't want to drop a _suspended_ device from the + * interface, since that will leave you with no way of + * resuming it. + */ + if (dm_suspended(hc->md)) { + DMWARN("refusing to remove a suspended device."); + up_write(&_hash_lock); + return -EPERM; + } + + __hash_remove(hc); + up_write(&_hash_lock); + return 0; +} + +static int remove_all(struct dm_ioctl *param, struct dm_ioctl *user) +{ + dm_hash_remove_all(); + return 0; +} + +static int suspend(struct dm_ioctl *param, struct dm_ioctl *user) +{ + int r; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (param->flags & DM_SUSPEND_FLAG) + r = dm_suspend(md); + else + r = dm_resume(md); + + dm_put(md); + return r; +} + +static int reload(struct dm_ioctl *param, struct dm_ioctl *user) +{ + int r; + kdev_t dev; + struct mapped_device *md; + struct dm_table *t; + + r = dm_table_create(&t, get_mode(param)); + if (r) + return r; + + r = populate_table(t, param); + if (r) { + dm_table_put(t); + return r; + } + + md = find_device(param); + if (!md) { + dm_table_put(t); + return -ENXIO; + } + + r = dm_swap_table(md, t); + if (r) { + dm_put(md); + dm_table_put(t); + return r; + } + dm_table_put(t); /* md will have taken its own reference */ + + dev = dm_kdev(md); + set_device_ro(dev, (param->flags & DM_READONLY_FLAG)); + dm_put(md); + + r = info(param, user); + return r; +} + +static int rename(struct dm_ioctl *param, struct dm_ioctl *user) +{ + int r; + char *new_name = (char *) param + param->data_start; + + if (valid_str(new_name, (void *) param, + (void *) param + param->data_size)) { + DMWARN("Invalid new logical volume name supplied."); + return -EINVAL; + } + + r = check_name(new_name); + if (r) + return r; + + return dm_hash_rename(param->name, new_name); +} + + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. 
+ *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, remove_all}, + {DM_DEV_CREATE_CMD, create}, + {DM_DEV_REMOVE_CMD, remove}, + {DM_DEV_RELOAD_CMD, reload}, + {DM_DEV_RENAME_CMD, rename}, + {DM_DEV_SUSPEND_CMD, suspend}, + {DM_DEV_DEPS_CMD, dep}, + {DM_DEV_STATUS_CMD, info}, + {DM_TARGET_STATUS_CMD, get_status}, + {DM_TARGET_WAIT_CMD, wait_device_event}, + }; + + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(int cmd, struct dm_ioctl *user) +{ + uint32_t version[3]; + int r = 0; + + if (copy_from_user(version, user->version, sizeof(version))) + return -EFAULT; + + if ((DM_VERSION_MAJOR != version[0]) || + (DM_VERSION_MINOR < version[1])) { + DMWARN("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. + */ + version[0] = DM_VERSION_MAJOR; + version[1] = DM_VERSION_MINOR; + version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, version, sizeof(version))) + return -EFAULT; + + return r; +} + +static void free_params(struct dm_ioctl *param) +{ + vfree(param); +} + +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param) +{ + struct dm_ioctl tmp, *dmi; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (tmp.data_size < sizeof(tmp)) + return -EINVAL; + + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); + if (!dmi) + return -ENOMEM; + + if (copy_from_user(dmi, user, tmp.data_size)) { + vfree(dmi); + return -EFAULT; + } + + *param = dmi; + return 0; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD) + return 0; + + /* Unless creating, either name of uuid but not both */ + if (cmd != DM_DEV_CREATE_CMD) { + if ((!*param->uuid && !*param->name) || + (*param->uuid && *param->name)) { + DMWARN("one of name or uuid must be supplied"); + return -EINVAL; + } + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int r = 0, cmd; + struct dm_ioctl *param; + struct dm_ioctl *user = (struct dm_ioctl *) u; + ioctl_fn fn = NULL; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user); + if (r) + return r; + + /* + * Nothing more to do for the version command. + */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd); + if (!fn) { + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * Copy the parameters into kernel space. 
+ */ + r = copy_params(user, ¶m); + if (r) + return r; + + r = validate_params(cmd, param); + if (r) { + free_params(param); + return r; + } + + r = fn(param, user); + free_params(param); + return r; +} + +static struct file_operations _ctl_fops = { + .ioctl = ctl_ioctl, + .owner = THIS_MODULE, +}; + +static devfs_handle_t _ctl_handle; + +static struct miscdevice _dm_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DM_NAME, + .fops = &_ctl_fops +}; + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ + int r; + char rname[64]; + + r = dm_hash_init(); + if (r) + return r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + dm_hash_exit(); + return r; + } + + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3, + sizeof rname - 3); + if (r == -ENOSYS) + goto done; /* devfs not present */ + + if (r < 0) { + DMERR("devfs_generate_path failed for control device"); + goto failed; + } + + strncpy(rname + r, "../", 3); + r = devfs_mk_symlink(NULL, DM_DIR "/control", + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL); + if (r) { + DMERR("devfs_mk_symlink failed for control device"); + goto failed; + } + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle); + + done: + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; + + failed: + misc_deregister(&_dm_misc); + dm_hash_exit(); + return r; +} + +void dm_interface_exit(void) +{ + if (misc_deregister(&_dm_misc) < 0) + DMERR("misc_deregister failed for control device"); + + dm_hash_exit(); +} diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-linear.c x/drivers/md/dm-linear.c --- x-ref/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-linear.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +/* + * Linear: maps a linear range of a device. 
+ */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: + */ +static int linear_ctr(struct dm_target *ti, int argc, char **argv) +{ + struct linear_c *lc; + + if (argc != 2) { + ti->error = "dm-linear: Not enough arguments"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "dm-linear: Cannot allocate linear context"; + return -ENOMEM; + } + + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { + ti->error = "dm-linear: Invalid device sector"; + goto bad; + } + + if (dm_get_device(ti, argv[0], lc->start, ti->len, + dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "dm-linear: Device lookup failed"; + goto bad; + } + + ti->private = lc; + return 0; + + bad: + kfree(lc); + return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw, + void **map_context) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + bh->b_rdev = lc->dev->dev; + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin); + + return 1; +} + +static int linear_status(struct dm_target *ti, status_type_t type, + char *result, int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s " SECTOR_FORMAT, + kdevname(to_kdev_t(lc->dev->bdev->bd_dev)), lc->start); + break; + } + return 0; +} + +static struct target_type linear_target = { + .name = "linear", + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("linear: register failed %d", r); + + return r; +} + +void dm_linear_exit(void) +{ + int r = dm_unregister_target(&linear_target); + + if (r < 0) + DMERR("linear: unregister failed %d", r); +} diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-snapshot.c x/drivers/md/dm-snapshot.c --- x-ref/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-snapshot.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,1170 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dm-snapshot.h" +#include "kcopyd.h" + +/* + * FIXME: Remove this before release. + */ +#if 0 +#define DMDEBUG(x...) DMWARN( ## x) +#else +#define DMDEBUG(x...) +#endif + +/* + * The percentage increment we will wake up users at + */ +#define WAKE_UP_PERCENT 5 + +/* + * kcopyd priority of snapshot operations + */ +#define SNAPSHOT_COPY_PRIORITY 2 + +struct pending_exception { + struct exception e; + + /* + * Origin buffers waiting for this to complete are held + * in a list (using b_reqnext). + */ + struct buffer_head *origin_bhs; + struct buffer_head *snapshot_bhs; + + /* + * Other pending_exceptions that are processing this + * chunk. When this list is empty, we know we can + * complete the origins. + */ + struct list_head siblings; + + /* Pointer back to snapshot context */ + struct dm_snapshot *snap; + + /* + * 1 indicates the exception has already been sent to + * kcopyd. 
+ */ + int started; +}; + +/* + * Hash table mapping origin volumes to lists of snapshots and + * a lock to protect it + */ +static kmem_cache_t *exception_cache; +static kmem_cache_t *pending_cache; +static mempool_t *pending_pool; + +/* + * One of these per registered origin, held in the snapshot_origins hash + */ +struct origin { + /* The origin device */ + kdev_t dev; + + struct list_head hash_list; + + /* List of snapshots for this origin */ + struct list_head snapshots; +}; + +/* + * Size of the hash table for origin volumes. If we make this + * the size of the minors list then it should be nearly perfect + */ +#define ORIGIN_HASH_SIZE 256 +#define ORIGIN_MASK 0xFF +static struct list_head *_origins; +static struct rw_semaphore _origins_lock; + +static int init_origin_hash(void) +{ + int i; + + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head), + GFP_KERNEL); + if (!_origins) { + DMERR("Device mapper: Snapshot: unable to allocate memory"); + return -ENOMEM; + } + + for (i = 0; i < ORIGIN_HASH_SIZE; i++) + INIT_LIST_HEAD(_origins + i); + init_rwsem(&_origins_lock); + + return 0; +} + +static void exit_origin_hash(void) +{ + kfree(_origins); +} + +static inline unsigned int origin_hash(kdev_t dev) +{ + return MINOR(dev) & ORIGIN_MASK; +} + +static struct origin *__lookup_origin(kdev_t origin) +{ + struct list_head *slist; + struct list_head *ol; + struct origin *o; + + ol = &_origins[origin_hash(origin)]; + list_for_each(slist, ol) { + o = list_entry(slist, struct origin, hash_list); + + if (o->dev == origin) + return o; + } + + return NULL; +} + +static void __insert_origin(struct origin *o) +{ + struct list_head *sl = &_origins[origin_hash(o->dev)]; + list_add_tail(&o->hash_list, sl); +} + +/* + * Make a note of the snapshot and its origin so we can look it + * up when the origin has a write on it. + */ +static int register_snapshot(struct dm_snapshot *snap) +{ + struct origin *o; + kdev_t dev = snap->origin->dev; + + down_write(&_origins_lock); + o = __lookup_origin(dev); + + if (!o) { + /* New origin */ + o = kmalloc(sizeof(*o), GFP_KERNEL); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } + + /* Initialise the struct */ + INIT_LIST_HEAD(&o->snapshots); + o->dev = dev; + + __insert_origin(o); + } + + list_add_tail(&snap->list, &o->snapshots); + + up_write(&_origins_lock); + return 0; +} + +static void unregister_snapshot(struct dm_snapshot *s) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(s->origin->dev); + + list_del(&s->list); + if (list_empty(&o->snapshots)) { + list_del(&o->hash_list); + kfree(o); + } + + up_write(&_origins_lock); +} + +/* + * Implementation of the exception hash tables. + */ +static int init_exception_table(struct exception_table *et, uint32_t size) +{ + int i; + + et->hash_mask = size - 1; + et->table = vcalloc(size, sizeof(struct list_head)); + if (!et->table) + return -ENOMEM; + + for (i = 0; i < size; i++) + INIT_LIST_HEAD(et->table + i); + + return 0; +} + +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem) +{ + struct list_head *slot, *entry, *temp; + struct exception *ex; + int i, size; + + size = et->hash_mask + 1; + for (i = 0; i < size; i++) { + slot = et->table + i; + + list_for_each_safe(entry, temp, slot) { + ex = list_entry(entry, struct exception, hash_list); + kmem_cache_free(mem, ex); + } + } + + vfree(et->table); +} + +/* + * FIXME: check how this hash fn is performing. 
+ */ +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk) +{ + return chunk & et->hash_mask; +} + +static void insert_exception(struct exception_table *eh, struct exception *e) +{ + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)]; + list_add(&e->hash_list, l); +} + +static inline void remove_exception(struct exception *e) +{ + list_del(&e->hash_list); +} + +/* + * Return the exception data for a sector, or NULL if not + * remapped. + */ +static struct exception *lookup_exception(struct exception_table *et, + chunk_t chunk) +{ + struct list_head *slot, *el; + struct exception *e; + + slot = &et->table[exception_hash(et, chunk)]; + list_for_each(el, slot) { + e = list_entry(el, struct exception, hash_list); + if (e->old_chunk == chunk) + return e; + } + + return NULL; +} + +static inline struct exception *alloc_exception(void) +{ + struct exception *e; + + e = kmem_cache_alloc(exception_cache, GFP_NOIO); + if (!e) + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC); + + return e; +} + +static inline void free_exception(struct exception *e) +{ + kmem_cache_free(exception_cache, e); +} + +static inline struct pending_exception *alloc_pending_exception(void) +{ + return mempool_alloc(pending_pool, GFP_NOIO); +} + +static inline void free_pending_exception(struct pending_exception *pe) +{ + mempool_free(pe, pending_pool); +} + +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new) +{ + struct exception *e; + + e = alloc_exception(); + if (!e) + return -ENOMEM; + + e->old_chunk = old; + e->new_chunk = new; + insert_exception(&s->complete, e); + return 0; +} + +/* + * Hard coded magic. + */ +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 50; + mem /= sizeof(struct list_head); + + return mem; +} + +/* + * Rounds a number down to a power of 2. + */ +static inline uint32_t round_down(uint32_t n) +{ + while (n & (n - 1)) + n &= (n - 1); + return n; +} + +/* + * Allocate room for a suitable hash table. + */ +static int init_hash_tables(struct dm_snapshot *s) +{ + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets; + + /* + * Calculate based on the size of the original volume or + * the COW volume... + */ + cow_dev_size = get_dev_size(s->cow->dev); + origin_dev_size = get_dev_size(s->origin->dev); + max_buckets = calc_max_buckets(); + + hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size; + hash_size = min(hash_size, max_buckets); + + /* Round it down to a power of 2 */ + hash_size = round_down(hash_size); + if (init_exception_table(&s->complete, hash_size)) + return -ENOMEM; + + /* + * Allocate hash table for in-flight exceptions + * Make this smaller than the real hash table + */ + hash_size >>= 3; + if (!hash_size) + hash_size = 64; + + if (init_exception_table(&s->pending, hash_size)) { + exit_exception_table(&s->complete, exception_cache); + return -ENOMEM; + } + + return 0; +} + +/* + * Round a number up to the nearest 'size' boundary. size must + * be a power of 2. + */ +static inline ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +/* + * Construct a snapshot mapping:
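+ * <origin_path> <COW_path> <P|N> <chunk_size>
+ * (argument layout assumed from the argv[] parsing below)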

+ */ +static int snapshot_ctr(struct dm_target *ti, int argc, char **argv) +{ + struct dm_snapshot *s; + unsigned long chunk_size; + int r = -EINVAL; + char persistent; + char *origin_path; + char *cow_path; + char *value; + int blocksize; + + if (argc < 4) { + ti->error = "dm-snapshot: requires exactly 4 arguments"; + r = -EINVAL; + goto bad; + } + + origin_path = argv[0]; + cow_path = argv[1]; + persistent = toupper(*argv[2]); + + if (persistent != 'P' && persistent != 'N') { + ti->error = "Persistent flag is not P or N"; + r = -EINVAL; + goto bad; + } + + chunk_size = simple_strtoul(argv[3], &value, 10); + if (chunk_size == 0 || value == NULL) { + ti->error = "Invalid chunk size"; + r = -EINVAL; + goto bad; + } + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) { + ti->error = "Cannot allocate snapshot context private " + "structure"; + r = -ENOMEM; + goto bad; + } + + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin); + if (r) { + ti->error = "Cannot get origin device"; + goto bad_free; + } + + /* FIXME: get cow length */ + r = dm_get_device(ti, cow_path, 0, 0, + FMODE_READ | FMODE_WRITE, &s->cow); + if (r) { + dm_put_device(ti, s->origin); + ti->error = "Cannot get COW device"; + goto bad_free; + } + + /* + * Chunk size must be multiple of page size. Silently + * round up if it's not. + */ + chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE); + + /* Validate the chunk size against the device block size */ + blocksize = get_hardsect_size(s->cow->dev); + if (chunk_size % (blocksize / SECTOR_SIZE)) { + ti->error = "Chunk size is not a multiple of device blocksize"; + r = -EINVAL; + goto bad_putdev; + } + + /* Check the sizes are small enough to fit in one kiovec */ + if (chunk_size > KIO_MAX_SECTORS) { + ti->error = "Chunk size is too big"; + r = -EINVAL; + goto bad_putdev; + } + + /* Check chunk_size is a power of 2 */ + if (chunk_size & (chunk_size - 1)) { + ti->error = "Chunk size is not a power of 2"; + r = -EINVAL; + goto bad_putdev; + } + + s->chunk_size = chunk_size; + s->chunk_mask = chunk_size - 1; + s->type = persistent; + for (s->chunk_shift = 0; chunk_size; + s->chunk_shift++, chunk_size >>= 1) + ; + s->chunk_shift--; + + s->valid = 1; + s->last_percent = 0; + init_rwsem(&s->lock); + s->table = ti->table; + + /* Allocate hash table for COW data */ + if (init_hash_tables(s)) { + ti->error = "Unable to allocate hash table space"; + r = -ENOMEM; + goto bad_putdev; + } + + /* + * Check the persistent flag - done here because we need the iobuf + * to check the LV header + */ + s->store.snap = s; + + if (persistent == 'P') + r = dm_create_persistent(&s->store, s->chunk_size); + else + r = dm_create_transient(&s->store, s, blocksize); + + if (r) { + ti->error = "Couldn't create exception store"; + r = -EINVAL; + goto bad_free1; + } + + /* Flush IO to the origin device */ +#if LVM_VFS_ENHANCEMENT + fsync_dev_lockfs(s->origin->dev); +#else + fsync_dev(s->origin->dev); +#endif + + /* Add snapshot to the list of snapshots for this origin */ + if (register_snapshot(s)) { + r = -EINVAL; + ti->error = "Cannot register snapshot origin"; + goto bad_free2; + } +#if LVM_VFS_ENHANCEMENT + unlockfs(s->origin->dev); +#endif + kcopyd_inc_client_count(); + + ti->private = s; + return 0; + + bad_free2: +#if LVM_VFS_ENHANCEMENT + unlockfs(s->origin->dev); +#endif + s->store.destroy(&s->store); + + bad_free1: + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + bad_putdev: + dm_put_device(ti, s->cow); + 
dm_put_device(ti, s->origin); + + bad_free: + kfree(s); + + bad: + return r; +} + +static void snapshot_dtr(struct dm_target *ti) +{ + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + + dm_table_event(ti->table); + + unregister_snapshot(s); + + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + + /* Deallocate memory used */ + s->store.destroy(&s->store); + + dm_put_device(ti, s->origin); + dm_put_device(ti, s->cow); + kfree(s); + + kcopyd_dec_client_count(); +} + +/* + * We hold lists of buffer_heads, using the b_reqnext field. + */ +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh) +{ + bh->b_reqnext = *queue; + *queue = bh; +} + +/* + * Flush a list of buffers. + */ +static void flush_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + DMDEBUG("begin flush"); + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + DMDEBUG("flushing %p", bh); + generic_make_request(WRITE, bh); + bh = n; + } + + run_task_queue(&tq_disk); +} + +/* + * Error a list of buffers. + */ +static void error_buffers(struct buffer_head *bh) +{ + struct buffer_head *n; + + while (bh) { + n = bh->b_reqnext; + bh->b_reqnext = NULL; + buffer_IO_error(bh); + bh = n; + } +} + +static void pending_complete(struct pending_exception *pe, int success) +{ + struct exception *e; + struct dm_snapshot *s = pe->snap; + + if (success) { + e = alloc_exception(); + if (!e) { + printk("Unable to allocate exception."); + down_write(&s->lock); + s->store.drop_snapshot(&s->store); + s->valid = 0; + up_write(&s->lock); + return; + } + + /* + * Add a proper exception, and remove the + * inflight exception from the list. + */ + down_write(&s->lock); + + memcpy(e, &pe->e, sizeof(*e)); + insert_exception(&s->complete, e); + remove_exception(&pe->e); + + /* Submit any pending write BHs */ + up_write(&s->lock); + + flush_buffers(pe->snapshot_bhs); + DMDEBUG("Exception completed successfully."); + + /* Notify any interested parties */ + if (s->store.percent_full) { + int pc = s->store.percent_full(&s->store); + + if (pc >= s->last_percent + WAKE_UP_PERCENT) { + dm_table_event(s->table); + s->last_percent = pc - pc % WAKE_UP_PERCENT; + } + } + + } else { + /* Read/write error - snapshot is unusable */ + DMERR("Error reading/writing snapshot"); + + down_write(&s->lock); + s->store.drop_snapshot(&s->store); + s->valid = 0; + remove_exception(&pe->e); + up_write(&s->lock); + + error_buffers(pe->snapshot_bhs); + + dm_table_event(s->table); + DMDEBUG("Exception failed."); + } + + if (list_empty(&pe->siblings)) + flush_buffers(pe->origin_bhs); + else + list_del(&pe->siblings); + + free_pending_exception(pe); +} + +static void commit_callback(void *context, int success) +{ + struct pending_exception *pe = (struct pending_exception *) context; + pending_complete(pe, success); +} + +/* + * Called when the copy I/O has finished. kcopyd actually runs + * this code so don't block. + */ +static void copy_callback(int err, void *context) +{ + struct pending_exception *pe = (struct pending_exception *) context; + struct dm_snapshot *s = pe->snap; + + if (err) + pending_complete(pe, 0); + + else + /* Update the metadata if we are persistent */ + s->store.commit_exception(&s->store, &pe->e, commit_callback, + pe); +} + +/* + * Dispatches the copy operation to kcopyd. 
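+ *
+ * The source is the origin chunk and the destination is the newly
+ * prepared COW chunk; pe->started guards against handing the same
+ * pending exception to kcopyd twice.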
+ */ +static inline void start_copy(struct pending_exception *pe) +{ + struct dm_snapshot *s = pe->snap; + struct kcopyd_region src, dest; + + src.dev = s->origin->dev; + src.sector = chunk_to_sector(s, pe->e.old_chunk); + src.count = s->chunk_size; + + dest.dev = s->cow->dev; + dest.sector = chunk_to_sector(s, pe->e.new_chunk); + dest.count = s->chunk_size; + + if (!pe->started) { + /* Hand over to kcopyd */ + kcopyd_copy(&src, &dest, copy_callback, pe); + pe->started = 1; + } +} + +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + */ +static struct pending_exception *find_pending_exception(struct dm_snapshot *s, + struct buffer_head *bh) +{ + struct exception *e; + struct pending_exception *pe; + chunk_t chunk = sector_to_chunk(s, bh->b_rsector); + + /* + * Is there a pending exception for this already ? + */ + e = lookup_exception(&s->pending, chunk); + if (e) { + /* cast the exception to a pending exception */ + pe = list_entry(e, struct pending_exception, e); + + } else { + /* Create a new pending exception */ + pe = alloc_pending_exception(); + if (!pe) { + DMWARN("Couldn't allocate pending exception."); + return NULL; + } + + pe->e.old_chunk = chunk; + pe->origin_bhs = pe->snapshot_bhs = NULL; + INIT_LIST_HEAD(&pe->siblings); + pe->snap = s; + pe->started = 0; + + if (s->store.prepare_exception(&s->store, &pe->e)) { + free_pending_exception(pe); + s->valid = 0; + return NULL; + } + + insert_exception(&s->pending, &pe->e); + } + + return pe; +} + +static inline void remap_exception(struct dm_snapshot *s, struct exception *e, + struct buffer_head *bh) +{ + bh->b_rdev = s->cow->dev; + bh->b_rsector = chunk_to_sector(s, e->new_chunk) + + (bh->b_rsector & s->chunk_mask); +} + +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw, + void **map_context) +{ + struct exception *e; + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + int r = 1; + chunk_t chunk; + struct pending_exception *pe; + + chunk = sector_to_chunk(s, bh->b_rsector); + + /* Full snapshots are not usable */ + if (!s->valid) + return -1; + + /* + * Write to snapshot - higher level takes care of RW/RO + * flags so we should only get this if we are + * writeable. + */ + if (rw == WRITE) { + + down_write(&s->lock); + + /* If the block is already remapped - use that, else remap it */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + + else { + pe = find_pending_exception(s, bh); + + if (!pe) { + s->store.drop_snapshot(&s->store); + s->valid = 0; + } + + queue_buffer(&pe->snapshot_bhs, bh); + start_copy(pe); + r = 0; + } + + up_write(&s->lock); + + } else { + /* + * FIXME: this read path scares me because we + * always use the origin when we have a pending + * exception. However I can't think of a + * situation where this is wrong - ejt. 
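+ *
+ * (At least for exceptions triggered by origin writes this looks
+ * safe: the triggering write is held on pe->origin_bhs until the
+ * copy completes, so the origin still holds the old data while the
+ * exception is only pending.)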
+ */ + + /* Do reads */ + down_read(&s->lock); + + /* See if it it has been remapped */ + e = lookup_exception(&s->complete, chunk); + if (e) + remap_exception(s, e, bh); + else + bh->b_rdev = s->origin->dev; + + up_read(&s->lock); + } + + return r; +} + +static void list_merge(struct list_head *l1, struct list_head *l2) +{ + struct list_head *l1_n, *l2_p; + + l1_n = l1->next; + l2_p = l2->prev; + + l1->next = l2; + l2->prev = l1; + + l2_p->next = l1_n; + l1_n->prev = l2_p; +} + +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh) +{ + int r = 1; + struct list_head *sl; + struct dm_snapshot *snap; + struct exception *e; + struct pending_exception *pe, *last = NULL; + chunk_t chunk; + + /* Do all the snapshots on this origin */ + list_for_each(sl, snapshots) { + snap = list_entry(sl, struct dm_snapshot, list); + + /* Only deal with valid snapshots */ + if (!snap->valid) + continue; + + down_write(&snap->lock); + + /* + * Remember, different snapshots can have + * different chunk sizes. + */ + chunk = sector_to_chunk(snap, bh->b_rsector); + + /* + * Check exception table to see if block + * is already remapped in this snapshot + * and trigger an exception if not. + */ + e = lookup_exception(&snap->complete, chunk); + if (!e) { + pe = find_pending_exception(snap, bh); + if (!pe) { + snap->store.drop_snapshot(&snap->store); + snap->valid = 0; + + } else { + if (last) + list_merge(&pe->siblings, + &last->siblings); + + last = pe; + r = 0; + } + } + + up_write(&snap->lock); + } + + /* + * Now that we have a complete pe list we can start the copying. + */ + if (last) { + pe = last; + do { + down_write(&pe->snap->lock); + queue_buffer(&pe->origin_bhs, bh); + start_copy(pe); + up_write(&pe->snap->lock); + pe = list_entry(pe->siblings.next, + struct pending_exception, siblings); + + } while (pe != last); + } + + return r; +} + +static int snapshot_status(struct dm_target *ti, status_type_t type, + char *result, int maxlen) +{ + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private; + char cow[16]; + char org[16]; + + switch (type) { + case STATUSTYPE_INFO: + if (!snap->valid) + snprintf(result, maxlen, "Invalid"); + else { + if (snap->store.percent_full) + snprintf(result, maxlen, "%d%%", + snap->store.percent_full(&snap-> + store)); + else + snprintf(result, maxlen, "Unknown"); + } + break; + + case STATUSTYPE_TABLE: + /* + * kdevname returns a static pointer so we need + * to make private copies if the output is to + * make sense. + */ + strncpy(cow, kdevname(snap->cow->dev), sizeof(cow)); + strncpy(org, kdevname(snap->origin->dev), sizeof(org)); + snprintf(result, maxlen, "%s %s %c %ld", org, cow, + snap->type, snap->chunk_size); + break; + } + + return 0; +} + +/* + * Called on a write from the origin driver. + */ +int do_origin(struct dm_dev *origin, struct buffer_head *bh) +{ + struct origin *o; + int r; + + down_read(&_origins_lock); + o = __lookup_origin(origin->dev); + if (!o) + BUG(); + + r = __origin_write(&o->snapshots, bh); + up_read(&_origins_lock); + + return r; +} + +/* + * Origin: maps a linear range of a device, with hooks for snapshotting. + */ + +/* + * Construct an origin mapping: + * The context for an origin is merely a 'struct dm_dev *' + * pointing to the real device. 
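+ *
+ * The only argument is the path of the origin device (argv[0]),
+ * which is opened with the table's rw mode.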
+ */ +static int origin_ctr(struct dm_target *ti, int argc, char **argv) +{ + int r; + struct dm_dev *dev; + + if (argc != 1) { + ti->error = "dm-origin: incorrect number of arguments"; + return -EINVAL; + } + + r = dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &dev); + if (r) { + ti->error = "Cannot get target device"; + return r; + } + + ti->private = dev; + + return 0; +} + +static void origin_dtr(struct dm_target *ti) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + dm_put_device(ti, dev); +} + +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw, + void **map_context) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + bh->b_rdev = dev->dev; + + /* Only tell snapshots if this is a write */ + return (rw == WRITE) ? do_origin(dev, bh) : 1; +} + +static int origin_status(struct dm_target *ti, status_type_t type, char *result, + int maxlen) +{ + struct dm_dev *dev = (struct dm_dev *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s", kdevname(dev->dev)); + break; + } + + return 0; +} + +static struct target_type origin_target = { + name: "snapshot-origin", + module: THIS_MODULE, + ctr: origin_ctr, + dtr: origin_dtr, + map: origin_map, + status: origin_status, +}; + +static struct target_type snapshot_target = { + name: "snapshot", + module: THIS_MODULE, + ctr: snapshot_ctr, + dtr: snapshot_dtr, + map: snapshot_map, + status: snapshot_status, +}; + +int __init dm_snapshot_init(void) +{ + int r; + + r = dm_register_target(&snapshot_target); + if (r) { + DMERR("snapshot target register failed %d", r); + return r; + } + + r = dm_register_target(&origin_target); + if (r < 0) { + DMERR("Device mapper: Origin: register failed %d\n", r); + goto bad1; + } + + r = init_origin_hash(); + if (r) { + DMERR("init_origin_hash failed."); + goto bad2; + } + + exception_cache = kmem_cache_create("dm-snapshot-ex", + sizeof(struct exception), + __alignof__(struct exception), + 0, NULL, NULL); + if (!exception_cache) { + DMERR("Couldn't create exception cache."); + r = -ENOMEM; + goto bad3; + } + + pending_cache = + kmem_cache_create("dm-snapshot-in", + sizeof(struct pending_exception), + __alignof__(struct pending_exception), + 0, NULL, NULL); + if (!pending_cache) { + DMERR("Couldn't create pending cache."); + r = -ENOMEM; + goto bad4; + } + + pending_pool = mempool_create(128, mempool_alloc_slab, + mempool_free_slab, pending_cache); + if (!pending_pool) { + DMERR("Couldn't create pending pool."); + r = -ENOMEM; + goto bad5; + } + + return 0; + + bad5: + kmem_cache_destroy(pending_cache); + bad4: + kmem_cache_destroy(exception_cache); + bad3: + exit_origin_hash(); + bad2: + dm_unregister_target(&origin_target); + bad1: + dm_unregister_target(&snapshot_target); + return r; +} + +void dm_snapshot_exit(void) +{ + int r; + + r = dm_unregister_target(&snapshot_target); + if (r) + DMERR("snapshot unregister failed %d", r); + + r = dm_unregister_target(&origin_target); + if (r) + DMERR("origin unregister failed %d", r); + + exit_origin_hash(); + mempool_destroy(pending_pool); + kmem_cache_destroy(pending_cache); + kmem_cache_destroy(exception_cache); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-snapshot.h x/drivers/md/dm-snapshot.h --- x-ref/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-snapshot.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,147 @@ +/* + * dm-snapshot.c + * + * Copyright (C) 2001-2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#ifndef DM_SNAPSHOT_H +#define DM_SNAPSHOT_H + +#include "dm.h" +#include + +struct exception_table { + uint32_t hash_mask; + struct list_head *table; +}; + +/* + * The snapshot code deals with largish chunks of the disk at a + * time. Typically 64k - 256k. + */ +/* FIXME: can we get away with limiting these to a uint32_t ? */ +typedef sector_t chunk_t; + +/* + * An exception is used where an old chunk of data has been + * replaced by a new one. + */ +struct exception { + struct list_head hash_list; + + chunk_t old_chunk; + chunk_t new_chunk; +}; + +/* + * Abstraction to handle the meta/layout of exception stores (the + * COW device). + */ +struct exception_store { + + /* + * Destroys this object when you've finished with it. + */ + void (*destroy) (struct exception_store *store); + + /* + * Find somewhere to store the next exception. + */ + int (*prepare_exception) (struct exception_store *store, + struct exception *e); + + /* + * Update the metadata with this exception. + */ + void (*commit_exception) (struct exception_store *store, + struct exception *e, + void (*callback) (void *, int success), + void *callback_context); + + /* + * The snapshot is invalid, note this in the metadata. + */ + void (*drop_snapshot) (struct exception_store *store); + + /* + * Return the %age full of the snapshot + */ + int (*percent_full) (struct exception_store *store); + + struct dm_snapshot *snap; + void *context; +}; + +struct dm_snapshot { + struct rw_semaphore lock; + struct dm_table *table; + + struct dm_dev *origin; + struct dm_dev *cow; + + /* List of snapshots per Origin */ + struct list_head list; + + /* Size of data blocks saved - must be a power of 2 */ + chunk_t chunk_size; + chunk_t chunk_mask; + chunk_t chunk_shift; + + /* You can't use a snapshot if this is 0 (e.g. if full) */ + int valid; + + /* Used for display of table */ + char type; + + /* The last percentage we notified */ + int last_percent; + + struct exception_table pending; + struct exception_table complete; + + /* The on disk metadata handler */ + struct exception_store store; +}; + +/* + * Used by the exception stores to load exceptions hen + * initialising. + */ +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new); + +/* + * Constructor and destructor for the default persistent + * store. + */ +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size); + +int dm_create_transient(struct exception_store *store, + struct dm_snapshot *s, int blocksize); + +/* + * Return the number of sectors in the device. 
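+ * blk_size[] holds sizes in 1K blocks, hence the shift left by one
+ * to convert to 512-byte sectors; 0 is returned if the size is
+ * unknown.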
+ */ +static inline sector_t get_dev_size(kdev_t dev) +{ + int *sizes; + + sizes = blk_size[MAJOR(dev)]; + if (sizes) + return sizes[MINOR(dev)] << 1; + + return 0; +} + +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector) +{ + return (sector & ~s->chunk_mask) >> s->chunk_shift; +} + +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk) +{ + return chunk << s->chunk_shift; +} + +#endif diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-stripe.c x/drivers/md/dm-stripe.c --- x-ref/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-stripe.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; +}; + +struct stripe_c { + uint32_t stripes; + + /* The size of this target / num. stripes */ + uint32_t stripe_width; + + /* stripe chunk size */ + uint32_t chunk_shift; + sector_t chunk_mask; + + struct stripe stripe[0]; +}; + +static inline struct stripe_c *alloc_context(int stripes) +{ + size_t len; + + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) + return NULL; + + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + + return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + int stripe, char **argv) +{ + sector_t start; + + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) + return -EINVAL; + + if (dm_get_device(ti, argv[0], start, sc->stripe_width, + dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev)) + return -ENXIO; + + sc->stripe[stripe].physical_start = start; + return 0; +} + +/* + * FIXME: Nasty function, only present because we can't link + * against __moddi3 and __divdi3. + * + * returns a == b * n + */ +static int multiple(sector_t a, sector_t b, sector_t *n) +{ + sector_t acc, prev, i; + + *n = 0; + while (a >= b) { + for (acc = b, prev = 0, i = 1; + acc <= a; + prev = acc, acc <<= 1, i <<= 1) + ; + + a -= prev; + *n += i >> 1; + } + + return a == 0; +} + +/* + * Construct a striped mapping. + * [ ]+ + */ +static int stripe_ctr(struct dm_target *ti, int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width; + uint32_t stripes; + uint32_t chunk_size; + char *end; + int r, i; + + if (argc < 2) { + ti->error = "dm-stripe: Not enough arguments"; + return -EINVAL; + } + + stripes = simple_strtoul(argv[0], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid stripe count"; + return -EINVAL; + } + + chunk_size = simple_strtoul(argv[1], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid chunk_size"; + return -EINVAL; + } + + /* + * chunk_size is a power of two + */ + if (!chunk_size || (chunk_size & (chunk_size - 1))) { + ti->error = "dm-stripe: Invalid chunk size"; + return -EINVAL; + } + + if (!multiple(ti->len, stripes, &width)) { + ti->error = "dm-stripe: Target length not divisable by " + "number of stripes"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? 
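+ * Every stripe needs a <dev_path> <offset> pair on top of the two
+ * leading arguments, hence 2 + 2 * stripes arguments in total.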
+ */ + if (argc != (2 + 2 * stripes)) { + ti->error = "dm-stripe: Not enough destinations specified"; + return -EINVAL; + } + + sc = alloc_context(stripes); + if (!sc) { + ti->error = "dm-stripe: Memory allocation for striped context " + "failed"; + return -ENOMEM; + } + + sc->stripes = stripes; + sc->stripe_width = width; + + sc->chunk_mask = ((sector_t) chunk_size) - 1; + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) + chunk_size >>= 1; + sc->chunk_shift--; + + /* + * Get the stripe destinations. + */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "dm-stripe: Couldn't parse stripe " + "destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + } + + ti->private = sc; + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + kfree(sc); +} + +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw, + void **context) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + + sector_t offset = bh->b_rsector - ti->begin; + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ + chunk = chunk / sc->stripes; + + bh->b_rdev = sc->stripe[stripe].dev->dev; + bh->b_rsector = sc->stripe[stripe].physical_start + + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return 1; +} + +static int stripe_status(struct dm_target *ti, + status_type_t type, char *result, int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + int offset; + int i; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT, + sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) { + offset += + snprintf(result + offset, maxlen - offset, + " %s " SECTOR_FORMAT, + kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)), + sc->stripe[i].physical_start); + } + break; + } + return 0; +} + +static struct target_type stripe_target = { + .name = "striped", + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .status = stripe_status, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) + DMWARN("striped target registration failed"); + + return r; +} + +void dm_stripe_exit(void) +{ + if (dm_unregister_target(&stripe_target)) + DMWARN("striped target unregistration failed"); + + return; +} diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-table.c x/drivers/md/dm-table.c --- x-ref/drivers/md/dm-table.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-table.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,666 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. 
+ */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { + atomic_t holders; + + /* btree table */ + int depth; + int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + int num_targets; + int num_allocated; + sector_t *highs; + struct dm_target *targets; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. + */ + int mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* + * A waitqueue for processes waiting for something + * interesting to happen to this table. + */ + wait_queue_head_t eventq; +}; + +/* + * Ceiling(n / size) + */ +static inline unsigned long div_up(unsigned long n, unsigned long size) +{ + return dm_round_up(n, size) / size; +} + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned long n, unsigned long base) +{ + int result = 0; + + while (n > 1) { + n = div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline int get_child(int n, int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, int l, int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, int l, int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(int l, struct dm_table *t) +{ + int n, k; + sector_t *node; + + for (n = 0; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + +/* + * highs, and targets are managed as dynamic arrays during a + * table load. + */ +static int alloc_targets(struct dm_table *t, int num) +{ + sector_t *n_highs; + struct dm_target *n_targets; + int n = t->num_targets; + + /* + * Allocate both the target array and offset array at once. 
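+ * The highs array sits at the start of the allocation and the
+ * dm_target array directly after it; any existing entries are
+ * copied across when the table grows.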
+ */ + n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) + + sizeof(sector_t), num); + if (!n_highs) + return -ENOMEM; + + n_targets = (struct dm_target *) (n_highs + num); + + if (n) { + memcpy(n_highs, t->highs, sizeof(*n_highs) * n); + memcpy(n_targets, t->targets, sizeof(*n_targets) * n); + } + + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n)); + vfree(t->highs); + + t->num_allocated = num; + t->highs = n_highs; + t->targets = n_targets; + + return 0; +} + +int dm_table_create(struct dm_table **result, int mode) +{ + struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO); + + if (!t) + return -ENOMEM; + + memset(t, 0, sizeof(*t)); + INIT_LIST_HEAD(&t->devices); + atomic_set(&t->holders, 1); + + /* allocate a single nodes worth of targets to begin with */ + if (alloc_targets(t, KEYS_PER_NODE)) { + kfree(t); + t = NULL; + return -ENOMEM; + } + + init_waitqueue_head(&t->eventq); + t->mode = mode; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + for (tmp = devices->next; tmp != devices; tmp = next) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + next = tmp->next; + kfree(dd); + } +} + +void table_destroy(struct dm_table *t) +{ + int i; + + /* destroying the table counts as an event */ + dm_table_event(t); + + /* free the indexes (see dm_table_complete) */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + if (t->devices.next != &t->devices) { + DMWARN("devices still present during destroy: " + "dm_table_remove_device calls missing"); + + free_devices(&t->devices); + } + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ + if (atomic_dec_and_test(&t->holders)) + table_destroy(t); +} + +/* + * Checks to see if we need to extend highs or targets. + */ +static inline int check_space(struct dm_table *t) +{ + if (t->num_targets >= t->num_allocated) + return alloc_targets(t, t->num_allocated * 2); + + return 0; +} + +/* + * Convert a device path to a dev_t. + */ +static int lookup_device(const char *path, kdev_t *dev) +{ + int r; + struct nameidata nd; + struct inode *inode; + + if (!path_init(path, LOOKUP_FOLLOW, &nd)) + return 0; + + if ((r = path_walk(path, &nd))) + goto out; + + inode = nd.dentry->d_inode; + if (!inode) { + r = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + r = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + + out: + path_release(&nd); + return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, kdev_t dev) +{ + struct list_head *tmp; + + list_for_each(tmp, l) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + if (kdev_same(dd->dev, dev)) + return dd; + } + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *dd) +{ + if (dd->bdev) + BUG(); + + dd->bdev = bdget(kdev_t_to_nr(dd->dev)); + if (!dd->bdev) + return -ENOMEM; + + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW); +} + +/* + * Close a device that we've been using. 
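+ * blkdev_put() releases the reference taken by open_dev(); calling
+ * this on a device that was never opened is a no-op.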
+ */ +static void close_dev(struct dm_dev *dd) +{ + if (!dd->bdev) + return; + + blkdev_put(dd->bdev, BDEV_RAW); + dd->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. + */ +static int check_device_area(kdev_t dev, sector_t start, sector_t len) +{ + int *sizes; + sector_t dev_size; + + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) + /* we don't know the device details, + * so give the benefit of the doubt */ + return 1; + + /* convert to 512-byte sectors */ + dev_size <<= 1; + + return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev. Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ + int r; + struct dm_dev dd_copy; + + memcpy(&dd_copy, dd, sizeof(dd_copy)); + + dd->mode |= new_mode; + dd->bdev = NULL; + r = open_dev(dd); + if (!r) + close_dev(&dd_copy); + else + memcpy(dd, &dd_copy, sizeof(dd_copy)); + + return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r; + kdev_t dev; + struct dm_dev *dd; + int major, minor; + struct dm_table *t = ti->table; + + if (!t) + BUG(); + + if (sscanf(path, "%x:%x", &major, &minor) == 2) { + /* Extract the major/minor numbers */ + dev = mk_kdev(major, minor); + } else { + /* convert the path to a device */ + if ((r = lookup_device(path, &dev))) + return r; + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->dev = dev; + dd->mode = mode; + dd->bdev = NULL; + + if ((r = open_dev(dd))) { + kfree(dd); + return r; + } + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->mode != (mode | dd->mode)) { + r = upgrade_mode(dd, mode); + if (r) + return r; + } + atomic_inc(&dd->count); + + if (!check_device_area(dd->dev, start, len)) { + DMWARN("device %s too small for target", path); + dm_put_device(ti, dd); + return -EINVAL; + } + + *result = dd; + + return 0; +} + +/* + * Decrement a devices use count and remove it if neccessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd); + list_del(&dd->list); + kfree(dd); + } +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Destructively splits up the argument list to pass to ctr. 
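+ *
+ * Tokens are separated by whitespace and a backslash quotes the
+ * following character; '\0' terminators are written into the input
+ * buffer, which is why the split is destructive.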
+ */ +static int split_args(int max, int *argc, char **argv, char *input) +{ + char *start, *end = input, *out; + *argc = 0; + + while (1) { + start = end; + + /* Skip whitespace */ + while (*start && isspace(*start)) + start++; + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > max) + return -EINVAL; + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char *argv[32]; + struct dm_target *tgt; + + if ((r = check_space(t))) + return r; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + tgt->error = "unknown target type"; + return -EINVAL; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = split_args(ARRAY_SIZE(argv), &argc, argv, params); + if (r) { + tgt->error = "couldn't split parameters"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + return 0; + + bad: + printk(KERN_ERR DM_NAME ": %s\n", tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +static int setup_indexes(struct dm_table *t) +{ + int i, total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2, total = 0; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ + int leaf_nodes, r = 0; + + /* how many indexes will the btree have ? */ + leaf_nodes = div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +void dm_table_event(struct dm_table *t) +{ + wake_up_interruptible(&t->eventq); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, int index) +{ + if (index > t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. 
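+ *
+ * Walks down the index levels, at each node picking the first key
+ * that is >= the sector, and returns the corresponding leaf target.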
+ */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} + +void dm_table_add_wait_queue(struct dm_table *t, wait_queue_t *wq) +{ + add_wait_queue(&t->eventq, wq); +} + +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm-target.c x/drivers/md/dm-target.c --- x-ref/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm-target.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include + +struct tt_internal { + struct target_type tt; + + struct list_head list; + long use; +}; + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct tt_internal *__find_target_type(const char *name) +{ + struct list_head *tih; + struct tt_internal *ti; + + list_for_each(tih, &_targets) { + ti = list_entry(tih, struct tt_internal, list); + + if (!strcmp(name, ti->tt.name)) + return ti; + } + + return NULL; +} + +static struct tt_internal *get_target_type(const char *name) +{ + struct tt_internal *ti; + + down_read(&_lock); + ti = __find_target_type(name); + + if (ti) { + if (ti->use == 0 && ti->tt.module) + __MOD_INC_USE_COUNT(ti->tt.module); + ti->use++; + } + up_read(&_lock); + + return ti; +} + +static void load_module(const char *name) +{ + char module_name[DM_MOD_NAME_SIZE] = "dm-"; + + /* Length check for strcat() below */ + if (strlen(name) > (DM_MOD_NAME_SIZE - 4)) + return; + + strcat(module_name, name); + request_module(module_name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct tt_internal *ti = get_target_type(name); + + if (!ti) { + load_module(name); + ti = get_target_type(name); + } + + return ti ? 
&ti->tt : NULL; +} + +void dm_put_target_type(struct target_type *t) +{ + struct tt_internal *ti = (struct tt_internal *) t; + + down_read(&_lock); + if (--ti->use == 0 && ti->tt.module) + __MOD_DEC_USE_COUNT(ti->tt.module); + + if (ti->use < 0) + BUG(); + up_read(&_lock); + + return; +} + +static struct tt_internal *alloc_target(struct target_type *t) +{ + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (ti) { + memset(ti, 0, sizeof(*ti)); + ti->tt = *t; + } + + return ti; +} + +int dm_register_target(struct target_type *t) +{ + int rv = 0; + struct tt_internal *ti = alloc_target(t); + + if (!ti) + return -ENOMEM; + + down_write(&_lock); + if (__find_target_type(t->name)) + rv = -EEXIST; + else + list_add(&ti->list, &_targets); + + up_write(&_lock); + return rv; +} + +int dm_unregister_target(struct target_type *t) +{ + struct tt_internal *ti; + + down_write(&_lock); + if (!(ti = __find_target_type(t->name))) { + up_write(&_lock); + return -EINVAL; + } + + if (ti->use) { + up_write(&_lock); + return -ETXTBSY; + } + + list_del(&ti->list); + kfree(ti); + + up_write(&_lock); + return 0; +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *ti, int argc, char **args) +{ + return 0; +} + +static void io_err_dtr(struct dm_target *ti) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw, + void **map_context) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, +}; + +int dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + if (dm_unregister_target(&error_target)) + DMWARN("error target unregistration failed"); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm.c x/drivers/md/dm.c --- x-ref/drivers/md/dm.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,878 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char *_name = DM_NAME; +#define MAX_DEVICES (1 << MINORBITS) +#define DEFAULT_READ_AHEAD 64 + +static int major = 0; +static int _major = 0; + +struct dm_io { + struct mapped_device *md; + + struct dm_target *ti; + int rw; + void *map_context; + void (*end_io) (struct buffer_head * bh, int uptodate); + void *context; +}; + +struct deferred_io { + int rw; + struct buffer_head *bh; + struct deferred_io *next; +}; + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO 0 +#define DMF_SUSPENDED 1 + +struct mapped_device { + struct rw_semaphore lock; + atomic_t holders; + + kdev_t dev; + unsigned long flags; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending; + wait_queue_head_t wait; + struct deferred_io *deferred; + + /* + * The current mapping. + */ + struct dm_table *map; + + /* + * io objects are allocated from here. 
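+ * The pool keeps a reserve of MIN_IOS entries (backed by the
+ * _io_cache slab) so that io structures can still be allocated
+ * while the system is low on memory and writing out dirty data.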
+ */ + mempool_t *io_pool; +}; + +#define MIN_IOS 256 +static kmem_cache_t *_io_cache; + +/* block device arrays */ +static int _block_size[MAX_DEVICES]; +static int _blksize_size[MAX_DEVICES]; +static int _hardsect_size[MAX_DEVICES]; + +static struct mapped_device *get_kdev(kdev_t dev); +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh); +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb); + +static __init int local_init(void) +{ + int r; + + /* allocate a slab for the dm_ios */ + _io_cache = kmem_cache_create("dm io", + sizeof(struct dm_io), 0, 0, NULL, NULL); + + if (!_io_cache) + return -ENOMEM; + + _major = major; + r = register_blkdev(_major, _name, &dm_blk_dops); + if (r < 0) { + DMERR("register_blkdev failed"); + kmem_cache_destroy(_io_cache); + return r; + } + + if (!_major) + _major = r; + + /* set up the arrays */ + read_ahead[_major] = DEFAULT_READ_AHEAD; + blk_size[_major] = _block_size; + blksize_size[_major] = _blksize_size; + hardsect_size[_major] = _hardsect_size; + + blk_queue_make_request(BLK_DEFAULT_QUEUE(_major), dm_request); + + return 0; +} + +static void local_exit(void) +{ + kmem_cache_destroy(_io_cache); + + if (unregister_blkdev(_major, _name) < 0) + DMERR("devfs_unregister_blkdev failed"); + + read_ahead[_major] = 0; + blk_size[_major] = NULL; + blksize_size[_major] = NULL; + hardsect_size[_major] = NULL; + _major = 0; + + DMINFO("cleaned up"); +} + +/* + * We have a lot of init/exit functions, so it seems easier to + * store them in an array. The disposable macro 'xx' + * expands a prefix into a pair of function names. + */ +static struct { + int (*init) (void); + void (*exit) (void); + +} _inits[] = { +#define xx(n) {n ## _init, n ## _exit}, + xx(local) + xx(dm_target) + xx(dm_linear) + xx(dm_stripe) + xx(dm_snapshot) + xx(dm_interface) +#undef xx +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i].init(); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _inits[i].exit(); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_inits); + + while (i--) + _inits[i].exit(); +} + +/* + * Block device functions + */ +static int dm_blk_open(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + return 0; +} + +static int dm_blk_close(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + dm_put(md); /* put the reference gained by dm_blk_open */ + dm_put(md); + return 0; +} + +static inline struct dm_io *alloc_io(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static inline void free_io(struct mapped_device *md, struct dm_io *io) +{ + mempool_free(io, md->io_pool); +} + +static inline struct deferred_io *alloc_deferred(void) +{ + return kmalloc(sizeof(struct deferred_io), GFP_NOIO); +} + +static inline void free_deferred(struct deferred_io *di) +{ + kfree(di); +} + +/* In 512-byte units */ +#define VOLUME_SIZE(minor) (_block_size[(minor)] << 1) + +/* FIXME: check this */ +static int dm_blk_ioctl(struct inode *inode, struct file *file, + uint command, unsigned long a) +{ + int minor = MINOR(inode->i_rdev); + long size; + + if (minor >= MAX_DEVICES) + return -ENXIO; + + switch (command) { + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKFLSBUF: + case BLKSSZGET: + //case BLKRRPART: /* Re-read partition tables 
*/ + //case BLKPG: + case BLKELVGET: + case BLKELVSET: + case BLKBSZGET: + case BLKBSZSET: + return blk_ioctl(inode->i_rdev, command, a); + break; + + case BLKGETSIZE: + size = VOLUME_SIZE(minor); + if (copy_to_user((void *) a, &size, sizeof(long))) + return -EFAULT; + break; + + case BLKGETSIZE64: + size = VOLUME_SIZE(minor); + if (put_user((u64) ((u64) size) << 9, (u64 *) a)) + return -EFAULT; + break; + + case BLKRRPART: + return -ENOTTY; + + case LV_BMAP: + return dm_user_bmap(inode, (struct lv_bmap *) a); + + default: + DMWARN("unknown block ioctl 0x%x", command); + return -ENOTTY; + } + + return 0; +} + +/* + * Add the buffer to the list of deferred io. + */ +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw) +{ + struct deferred_io *di; + + di = alloc_deferred(); + if (!di) + return -ENOMEM; + + down_write(&md->lock); + + if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + free_deferred(di); + return 1; + } + + di->bh = bh; + di->rw = rw; + di->next = md->deferred; + md->deferred = di; + + up_write(&md->lock); + return 0; /* deferred successfully */ +} + +/* + * bh->b_end_io routine that decrements the pending count + * and then calls the original bh->b_end_io fn. + */ +static void dec_pending(struct buffer_head *bh, int uptodate) +{ + int r; + struct dm_io *io = bh->b_private; + dm_endio_fn endio = io->ti->type->end_io; + + if (endio) { + r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO, + io->map_context); + if (r < 0) + uptodate = 0; + + else if (r > 0) + /* the target wants another shot at the io */ + return; + } + + if (atomic_dec_and_test(&io->md->pending)) + /* nudge anyone waiting on suspend queue */ + wake_up(&io->md->wait); + + bh->b_end_io = io->end_io; + bh->b_private = io->context; + free_io(io->md, io); + + bh->b_end_io(bh, uptodate); +} + +/* + * Do the bh mapping for a given leaf + */ +static inline int __map_buffer(struct mapped_device *md, int rw, + struct buffer_head *bh, struct dm_io *io) +{ + struct dm_target *ti; + + ti = dm_table_find_target(md->map, bh->b_rsector); + if (!ti || !ti->type) + return -EINVAL; + + /* hook the end io request fn */ + atomic_inc(&md->pending); + io->md = md; + io->ti = ti; + io->rw = rw; + io->end_io = bh->b_end_io; + io->context = bh->b_private; + bh->b_end_io = dec_pending; + bh->b_private = io; + + return ti->type->map(ti, bh, rw, &io->map_context); +} + +/* + * Checks to see if we should be deferring io, if so it queues it + * and returns 1. + */ +static inline int __deferring(struct mapped_device *md, int rw, + struct buffer_head *bh) +{ + int r; + + /* + * If we're suspended we have to queue this io for later. + */ + while (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + + /* + * There's no point deferring a read ahead + * request, just drop it. 
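+ * Failing it with -EIO simply errors the buffer in dm_request();
+ * the data will be fetched again by a normal read if it is really
+ * needed.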
+ */ + if (rw == READA) { + down_read(&md->lock); + return -EIO; + } + + r = queue_io(md, bh, rw); + down_read(&md->lock); + + if (r < 0) + return r; + + if (r == 0) + return 1; /* deferred successfully */ + + } + + return 0; +} + +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh) +{ + int r; + struct dm_io *io; + struct mapped_device *md; + + md = get_kdev(bh->b_rdev); + if (!md) { + buffer_IO_error(bh); + return 0; + } + + io = alloc_io(md); + down_read(&md->lock); + + r = __deferring(md, rw, bh); + if (r < 0) + goto bad; + + else if (!r) { + /* not deferring */ + r = __map_buffer(md, rw, bh, io); + if (r < 0) + goto bad; + } else + r = 0; + + up_read(&md->lock); + dm_put(md); + return r; + + bad: + buffer_IO_error(bh); + up_read(&md->lock); + dm_put(md); + return 0; +} + +static int check_dev_size(kdev_t dev, unsigned long block) +{ + /* FIXME: check this */ + int minor = MINOR(dev); + unsigned long max_sector = (_block_size[minor] << 1) + 1; + unsigned long sector = (block + 1) * (_blksize_size[minor] >> 9); + + return (sector > max_sector) ? 0 : 1; +} + +/* + * Creates a dummy buffer head and maps it (for lilo). + */ +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block, + kdev_t *r_dev, unsigned long *r_block) +{ + struct buffer_head bh; + struct dm_target *ti; + void *map_context; + int r; + + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + return -EPERM; + } + + if (!check_dev_size(dev, block)) { + return -EINVAL; + } + + /* setup dummy bh */ + memset(&bh, 0, sizeof(bh)); + bh.b_blocknr = block; + bh.b_dev = bh.b_rdev = dev; + bh.b_size = _blksize_size[MINOR(dev)]; + bh.b_rsector = block * (bh.b_size >> 9); + + /* find target */ + ti = dm_table_find_target(md->map, bh.b_rsector); + + /* do the mapping */ + r = ti->type->map(ti, &bh, READ, &map_context); + ti->type->end_io(ti, &bh, READ, 0, map_context); + + if (!r) { + *r_dev = bh.b_rdev; + *r_block = bh.b_rsector / (bh.b_size >> 9); + } + + return r; +} + +/* + * Marshals arguments and results between user and kernel space. + */ +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb) +{ + struct mapped_device *md; + unsigned long block, r_block; + kdev_t r_dev; + int r; + + if (get_user(block, &lvb->lv_block)) + return -EFAULT; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + down_read(&md->lock); + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block); + up_read(&md->lock); + dm_put(md); + + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) || + put_user(r_block, &lvb->lv_block))) + r = -EFAULT; + + return r; +} + +/*----------------------------------------------------------------- + * A bitset is used to keep track of allocated minor numbers. + *---------------------------------------------------------------*/ +static spinlock_t _minor_lock = SPIN_LOCK_UNLOCKED; +static struct mapped_device *_mds[MAX_DEVICES]; + +static void free_minor(int minor) +{ + spin_lock(&_minor_lock); + _mds[minor] = NULL; + spin_unlock(&_minor_lock); +} + +/* + * See if the device with a specific minor # is free. 
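+ * Returns the minor number if it was free, -EBUSY if it is already
+ * in use, or -EINVAL if it is out of range.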
+ */ +static int specific_minor(int minor, struct mapped_device *md) +{ + int r = -EBUSY; + + if (minor >= MAX_DEVICES) { + DMWARN("request for a mapped_device beyond MAX_DEVICES (%d)", + MAX_DEVICES); + return -EINVAL; + } + + spin_lock(&_minor_lock); + if (!_mds[minor]) { + _mds[minor] = md; + r = minor; + } + spin_unlock(&_minor_lock); + + return r; +} + +static int next_free_minor(struct mapped_device *md) +{ + int i; + + spin_lock(&_minor_lock); + for (i = 0; i < MAX_DEVICES; i++) { + if (!_mds[i]) { + _mds[i] = md; + break; + } + } + spin_unlock(&_minor_lock); + + return (i < MAX_DEVICES) ? i : -EBUSY; +} + +static struct mapped_device *get_kdev(kdev_t dev) +{ + struct mapped_device *md; + + if (major(dev) != _major) + return NULL; + + spin_lock(&_minor_lock); + md = _mds[minor(dev)]; + if (md) + dm_get(md); + spin_unlock(&_minor_lock); + + return md; +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_dev(int minor) +{ + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + + if (!md) { + DMWARN("unable to allocate device, out of memory."); + return NULL; + } + + /* get a minor number for the dev */ + minor = (minor < 0) ? next_free_minor(md) : specific_minor(minor, md); + if (minor < 0) { + kfree(md); + return NULL; + } + + memset(md, 0, sizeof(*md)); + + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _io_cache); + if (!md->io_pool) { + free_minor(minor); + kfree(md); + return NULL; + } + + md->dev = mk_kdev(_major, minor); + init_rwsem(&md->lock); + atomic_set(&md->holders, 1); + atomic_set(&md->pending, 0); + init_waitqueue_head(&md->wait); + + return md; +} + +static void free_dev(struct mapped_device *md) +{ + free_minor(minor(md->dev)); + mempool_destroy(md->io_pool); + kfree(md); +} + +/* + * The hardsect size for a mapped device is the largest hardsect size + * from the devices it maps onto. + */ +static int __find_hardsect_size(struct list_head *devices) +{ + int result = 512, size; + struct list_head *tmp; + + list_for_each(tmp, devices) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + size = get_hardsect_size(dd->dev); + if (size > result) + result = size; + } + + return result; +} + +/* + * Bind a table to the device. + */ +static int __bind(struct mapped_device *md, struct dm_table *t) +{ + int minor = minor(md->dev); + md->map = t; + + /* in k */ + _block_size[minor] = dm_table_get_size(t) >> 1; + _blksize_size[minor] = BLOCK_SIZE; + _hardsect_size[minor] = __find_hardsect_size(dm_table_get_devices(t)); + register_disk(NULL, md->dev, 1, &dm_blk_dops, _block_size[minor]); + + dm_table_get(t); + return 0; +} + +static void __unbind(struct mapped_device *md) +{ + int minor = minor(md->dev); + + dm_table_put(md->map); + md->map = NULL; + + _block_size[minor] = 0; + _blksize_size[minor] = 0; + _hardsect_size[minor] = 0; +} + +/* + * Constructor for a new device. + */ +int dm_create(int minor, struct dm_table *table, struct mapped_device **result) +{ + int r; + struct mapped_device *md; + + md = alloc_dev(minor); + if (!md) + return -ENXIO; + + r = __bind(md, table); + if (r) { + free_dev(md); + return r; + } + + *result = md; + return 0; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); +} + +void dm_put(struct mapped_device *md) +{ + if (atomic_dec_and_test(&md->holders)) { + __unbind(md); + free_dev(md); + } +} + +/* + * Requeue the deferred io by calling generic_make_request. 
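+ * Each deferred_io node is freed once its buffer has been
+ * resubmitted.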
+ */ +static void flush_deferred_io(struct deferred_io *c) +{ + struct deferred_io *n; + + while (c) { + n = c->next; + generic_make_request(c->rw, c->bh); + free_deferred(c); + c = n; + } +} + +/* + * Swap in a new table (destroying old one). + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + int r; + + down_write(&md->lock); + + /* device must be suspended */ + if (!test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EPERM; + } + + __unbind(md); + r = __bind(md, table); + if (r) + return r; + + up_write(&md->lock); + return 0; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight io and ensure that any further io gets deferred. + */ +int dm_suspend(struct mapped_device *md) +{ + DECLARE_WAITQUEUE(wait, current); + + down_write(&md->lock); + + /* + * First we set the BLOCK_IO flag so no more ios will be + * mapped. + */ + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); + up_write(&md->lock); + + /* + * Then we wait for the already mapped ios to + * complete. + */ + run_task_queue(&tq_disk); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!atomic_read(&md->pending)) + break; + + schedule(); + } + + current->state = TASK_RUNNING; + + down_write(&md->lock); + remove_wait_queue(&md->wait, &wait); + set_bit(DMF_SUSPENDED, &md->flags); + up_write(&md->lock); + + return 0; +} + +int dm_resume(struct mapped_device *md) +{ + struct deferred_io *def; + + down_write(&md->lock); + if (!test_bit(DMF_SUSPENDED, &md->flags) || + !dm_table_get_size(md->map)) { + up_write(&md->lock); + return -EINVAL; + } + + clear_bit(DMF_SUSPENDED, &md->flags); + clear_bit(DMF_BLOCK_IO, &md->flags); + def = md->deferred; + md->deferred = NULL; + up_write(&md->lock); + + flush_deferred_io(def); + run_task_queue(&tq_disk); + + return 0; +} + +struct dm_table *dm_get_table(struct mapped_device *md) +{ + struct dm_table *t; + + down_read(&md->lock); + t = md->map; + dm_table_get(t); + up_read(&md->lock); + + return t; +} + +kdev_t dm_kdev(struct mapped_device *md) +{ + kdev_t dev; + + down_read(&md->lock); + dev = md->dev; + up_read(&md->lock); + + return dev; +} + +int dm_suspended(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +struct block_device_operations dm_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .owner = THIS_MODULE +}; + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +MODULE_PARM(major, "i"); +MODULE_PARM_DESC(major, "The major number of the device mapper"); +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/dm.h x/drivers/md/dm.h --- x-ref/drivers/md/dm.h 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/dm.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,154 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include +#include +#include +#include + +#define DM_NAME "device-mapper" +#define DMWARN(f, x...) 
printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +/* + * FIXME: I think this should be with the definition of sector_t + * in types.h. + */ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lu" +#else +#define SECTOR_FORMAT "%lu" +#endif + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +extern struct block_device_operations dm_blk_dops; + +/* + * List of devices that a metadevice uses and should open/close. + */ +struct dm_dev { + struct list_head list; + + atomic_t count; + int mode; + kdev_t dev; + struct block_device *bdev; +}; + +struct dm_table; +struct mapped_device; + +/*----------------------------------------------------------------- + * Functions for manipulating a struct mapped_device. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ +int dm_create(int minor, struct dm_table *table, struct mapped_device **md); + +/* + * Reference counting for md. + */ +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md); +int dm_resume(struct mapped_device *md); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Drop a reference on the table when you've finished with the + * result. + */ +struct dm_table *dm_get_table(struct mapped_device *md); + +/* + * Info functions. + */ +kdev_t dm_kdev(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/*----------------------------------------------------------------- + * Functions for manipulating a table. Tables are also reference + * counted. + *---------------------------------------------------------------*/ +int dm_table_create(struct dm_table **result, int mode); + +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params); +int dm_table_complete(struct dm_table *t); +void dm_table_event(struct dm_table *t); +sector_t dm_table_get_size(struct dm_table *t); +struct dm_target *dm_table_get_target(struct dm_table *t, int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +unsigned int dm_table_get_num_targets(struct dm_table *t); +struct list_head *dm_table_get_devices(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +void dm_table_add_wait_queue(struct dm_table *t, wait_queue_t *wq); + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *t); + + +/*----------------------------------------------------------------- + * Useful inlines. 
+ *---------------------------------------------------------------*/ +static inline int array_too_big(unsigned long fixed, unsigned long obj, + unsigned long num) +{ + return (num > (ULONG_MAX - fixed) / obj); +} + +/* + * ceiling(n / size) * size + */ +static inline unsigned long dm_round_up(unsigned long n, unsigned long size) +{ + unsigned long r = n % size; + return n + (r ? (size - r) : 0); +} + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. + */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +int dm_snapshot_init(void); +void dm_snapshot_exit(void); + +#endif diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/kcopyd.c x/drivers/md/kcopyd.c --- x-ref/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/kcopyd.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,839 @@ +/* + * Copyright (C) 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kcopyd.h" + +/* FIXME: this is only needed for the DMERR macros */ +#include "dm.h" + +static void wake_kcopyd(void); + +/*----------------------------------------------------------------- + * We reserve our own pool of preallocated pages that are + * only used for kcopyd io. + *---------------------------------------------------------------*/ + +/* + * FIXME: This should be configurable. + */ +#define NUM_PAGES 512 + +static DECLARE_MUTEX(_pages_lock); +static int _num_free_pages; +static struct page *_pages_array[NUM_PAGES]; +static DECLARE_MUTEX(start_lock); + +static int init_pages(void) +{ + int i; + struct page *p; + + for (i = 0; i < NUM_PAGES; i++) { + p = alloc_page(GFP_KERNEL); + if (!p) + goto bad; + + LockPage(p); + _pages_array[i] = p; + } + + _num_free_pages = NUM_PAGES; + return 0; + + bad: + while (i--) { + UnlockPage(_pages_array[i]); + __free_page(_pages_array[i]); + } + return -ENOMEM; +} + +static void exit_pages(void) +{ + int i; + struct page *p; + + for (i = 0; i < NUM_PAGES; i++) { + p = _pages_array[i]; + UnlockPage(p); + __free_page(p); + } + + _num_free_pages = 0; +} + +static int kcopyd_get_pages(int num, struct page **result) +{ + int i; + + down(&_pages_lock); + if (_num_free_pages < num) { + up(&_pages_lock); + return -ENOMEM; + } + + for (i = 0; i < num; i++) { + _num_free_pages--; + result[i] = _pages_array[_num_free_pages]; + } + up(&_pages_lock); + + return 0; +} + +static void kcopyd_free_pages(int num, struct page **result) +{ + int i; + + down(&_pages_lock); + for (i = 0; i < num; i++) + _pages_array[_num_free_pages++] = result[i]; + up(&_pages_lock); +} + +/*----------------------------------------------------------------- + * We keep our own private pool of buffer_heads. These are just + * held in a list on the b_reqnext field. + *---------------------------------------------------------------*/ + +/* + * Make sure we have enough buffers to always keep the pages + * occupied. So we assume the worst case scenario where blocks + * are the size of a single sector. 
+ */ +#define NUM_BUFFERS NUM_PAGES * (PAGE_SIZE / SECTOR_SIZE) + +static spinlock_t _buffer_lock = SPIN_LOCK_UNLOCKED; +static struct buffer_head *_all_buffers; +static struct buffer_head *_free_buffers; + +static int init_buffers(void) +{ + int i; + struct buffer_head *buffers; + + buffers = vcalloc(NUM_BUFFERS, sizeof(struct buffer_head)); + if (!buffers) { + DMWARN("Couldn't allocate buffer heads."); + return -ENOMEM; + } + + for (i = 0; i < NUM_BUFFERS; i++) { + if (i < NUM_BUFFERS - 1) + buffers[i].b_reqnext = &buffers[i + 1]; + init_waitqueue_head(&buffers[i].b_wait); + INIT_LIST_HEAD(&buffers[i].b_inode_buffers); + } + + _all_buffers = _free_buffers = buffers; + return 0; +} + +static void exit_buffers(void) +{ + vfree(_all_buffers); +} + +static struct buffer_head *alloc_buffer(void) +{ + struct buffer_head *r; + int flags; + + spin_lock_irqsave(&_buffer_lock, flags); + + if (!_free_buffers) + r = NULL; + else { + r = _free_buffers; + _free_buffers = _free_buffers->b_reqnext; + r->b_reqnext = NULL; + } + + spin_unlock_irqrestore(&_buffer_lock, flags); + + return r; +} + +/* + * Only called from interrupt context. + */ +static void free_buffer(struct buffer_head *bh) +{ + int flags, was_empty; + + spin_lock_irqsave(&_buffer_lock, flags); + was_empty = (_free_buffers == NULL) ? 1 : 0; + bh->b_reqnext = _free_buffers; + _free_buffers = bh; + spin_unlock_irqrestore(&_buffer_lock, flags); + + /* + * If the buffer list was empty then kcopyd probably went + * to sleep because it ran out of buffer heads, so let's + * wake it up. + */ + if (was_empty) + wake_kcopyd(); +} + +/*----------------------------------------------------------------- + * kcopyd_jobs need to be allocated by the *clients* of kcopyd, + * for this reason we use a mempool to prevent the client from + * ever having to do io (which could cause a + * deadlock). + *---------------------------------------------------------------*/ +#define MIN_JOBS NUM_PAGES + +static kmem_cache_t *_job_cache = NULL; +static mempool_t *_job_pool = NULL; + +/* + * We maintain three lists of jobs: + * + * i) jobs waiting for pages + * ii) jobs that have pages, and are waiting for the io to be issued. + * iii) jobs that have completed. + * + * All three of these are protected by job_lock. + */ + +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(_complete_jobs); +static LIST_HEAD(_io_jobs); +static LIST_HEAD(_pages_jobs); + +static int init_jobs(void) +{ + INIT_LIST_HEAD(&_complete_jobs); + INIT_LIST_HEAD(&_io_jobs); + INIT_LIST_HEAD(&_pages_jobs); + + _job_cache = kmem_cache_create("kcopyd-jobs", sizeof(struct kcopyd_job), + __alignof__(struct kcopyd_job), + 0, NULL, NULL); + if (!_job_cache) + return -ENOMEM; + + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab, + mempool_free_slab, _job_cache); + if (!_job_pool) { + kmem_cache_destroy(_job_cache); + return -ENOMEM; + } + + return 0; +} + +static void exit_jobs(void) +{ + mempool_destroy(_job_pool); + kmem_cache_destroy(_job_cache); +} + +struct kcopyd_job *kcopyd_alloc_job(void) +{ + struct kcopyd_job *job; + + job = mempool_alloc(_job_pool, GFP_NOIO); + if (!job) + return NULL; + + memset(job, 0, sizeof(*job)); + return job; +} + +void kcopyd_free_job(struct kcopyd_job *job) +{ + mempool_free(job, _job_pool); +} + +/* + * Functions to push and pop a job onto the head of a given job + * list. 
+ */ +static inline struct kcopyd_job *pop(struct list_head *jobs) +{ + struct kcopyd_job *job = NULL; + int flags; + + spin_lock_irqsave(&_job_lock, flags); + + if (!list_empty(jobs)) { + job = list_entry(jobs->next, struct kcopyd_job, list); + list_del(&job->list); + } + spin_unlock_irqrestore(&_job_lock, flags); + + return job; +} + +static inline void push(struct list_head *jobs, struct kcopyd_job *job) +{ + int flags; + + spin_lock_irqsave(&_job_lock, flags); + list_add(&job->list, jobs); + spin_unlock_irqrestore(&_job_lock, flags); +} + +/* + * Completion function for one of our buffers. + */ +static void end_bh(struct buffer_head *bh, int uptodate) +{ + struct kcopyd_job *job = bh->b_private; + + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + + if (!uptodate) + job->err = -EIO; + + /* are we the last ? */ + if (atomic_dec_and_test(&job->nr_incomplete)) { + push(&_complete_jobs, job); + wake_kcopyd(); + } + + free_buffer(bh); +} + +static void dispatch_bh(struct kcopyd_job *job, + struct buffer_head *bh, int block) +{ + int p; + + /* + * Add in the job offset + */ + bh->b_blocknr = (job->disk.sector >> job->block_shift) + block; + + p = block >> job->bpp_shift; + block &= job->bpp_mask; + + bh->b_size = job->block_size; + set_bh_page(bh, job->pages[p], ((block << job->block_shift) + + job->offset) << SECTOR_SHIFT); + bh->b_this_page = bh; + + init_buffer(bh, end_bh, job); + + bh->b_dev = job->disk.dev; + atomic_set(&bh->b_count, 1); + + bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) | + (1 << BH_Lock) | (1 << BH_Req)); + + if (job->rw == WRITE) + clear_bit(BH_Dirty, &bh->b_state); + + submit_bh(job->rw, bh); +} + +/* + * These three functions process 1 item from the corresponding + * job list. + * + * They return: + * < 0: error + * 0: success + * > 0: can't process yet. + */ +static int run_complete_job(struct kcopyd_job *job) +{ + job->callback(job); + return 0; +} + +/* + * Request io on as many buffer heads as we can currently get for + * a particular job. + */ +static int run_io_job(struct kcopyd_job *job) +{ + unsigned int block; + struct buffer_head *bh; + + for (block = atomic_read(&job->nr_requested); + block < job->nr_blocks; block++) { + bh = alloc_buffer(); + if (!bh) + break; + + atomic_inc(&job->nr_requested); + dispatch_bh(job, bh, block); + } + + return (block == job->nr_blocks) ? 0 : 1; +} + +static int run_pages_job(struct kcopyd_job *job) +{ + int r; + + job->nr_pages = (job->disk.count + job->offset) / + (PAGE_SIZE / SECTOR_SIZE); + r = kcopyd_get_pages(job->nr_pages, job->pages); + + if (!r) { + /* this job is ready for io */ + push(&_io_jobs, job); + return 0; + } + + if (r == -ENOMEM) + /* can't complete now */ + return 1; + + return r; +} + +/* + * Run through a list for as long as possible. Returns the count + * of successful jobs. + */ +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *)) +{ + struct kcopyd_job *job; + int r, count = 0; + + while ((job = pop(jobs))) { + + r = fn(job); + + if (r < 0) { + /* error this rogue job */ + job->err = r; + push(&_complete_jobs, job); + break; + } + + if (r > 0) { + /* + * We couldn't service this job ATM, so + * push this job back onto the list. + */ + push(jobs, job); + break; + } + + count++; + } + + return count; +} + +/* + * kcopyd does this every time it's woken up. + */ +static void do_work(void) +{ + int count; + + /* + * We loop round until there is no more work to do. 
+ */ + do { + count = process_jobs(&_complete_jobs, run_complete_job); + count += process_jobs(&_io_jobs, run_io_job); + count += process_jobs(&_pages_jobs, run_pages_job); + + } while (count); + + run_task_queue(&tq_disk); +} + +/*----------------------------------------------------------------- + * The daemon + *---------------------------------------------------------------*/ +static atomic_t _kcopyd_must_die; +static DECLARE_MUTEX(_run_lock); +static DECLARE_WAIT_QUEUE_HEAD(_job_queue); + +static int kcopyd(void *arg) +{ + DECLARE_WAITQUEUE(wq, current); + + daemonize(); + strcpy(current->comm, "kcopyd"); + atomic_set(&_kcopyd_must_die, 0); + + add_wait_queue(&_job_queue, &wq); + + down(&_run_lock); + up(&start_lock); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (atomic_read(&_kcopyd_must_die)) + break; + + do_work(); + schedule(); + } + + set_current_state(TASK_RUNNING); + remove_wait_queue(&_job_queue, &wq); + + up(&_run_lock); + + return 0; +} + +static int start_daemon(void) +{ + static pid_t pid = 0; + + down(&start_lock); + + pid = kernel_thread(kcopyd, NULL, 0); + if (pid <= 0) { + DMERR("Failed to start kcopyd thread"); + return -EAGAIN; + } + + /* + * wait for the daemon to up this mutex. + */ + down(&start_lock); + up(&start_lock); + + return 0; +} + +static int stop_daemon(void) +{ + atomic_set(&_kcopyd_must_die, 1); + wake_kcopyd(); + down(&_run_lock); + up(&_run_lock); + + return 0; +} + +static void wake_kcopyd(void) +{ + wake_up_interruptible(&_job_queue); +} + +static int calc_shift(unsigned int n) +{ + int s; + + for (s = 0; n; s++, n >>= 1) + ; + + return --s; +} + +static void calc_block_sizes(struct kcopyd_job *job) +{ + job->block_size = get_hardsect_size(job->disk.dev); + job->block_shift = calc_shift(job->block_size / SECTOR_SIZE); + job->bpp_shift = PAGE_SHIFT - job->block_shift - SECTOR_SHIFT; + job->bpp_mask = (1 << job->bpp_shift) - 1; + job->nr_blocks = job->disk.count >> job->block_shift; + atomic_set(&job->nr_requested, 0); + atomic_set(&job->nr_incomplete, job->nr_blocks); +} + +int kcopyd_io(struct kcopyd_job *job) +{ + calc_block_sizes(job); + push(job->pages[0] ? &_io_jobs : &_pages_jobs, job); + wake_kcopyd(); + return 0; +} + +/*----------------------------------------------------------------- + * The copier is implemented on top of the simpler async io + * daemon above. 
+ *---------------------------------------------------------------*/ +struct copy_info { + kcopyd_notify_fn notify; + void *notify_context; + + struct kcopyd_region to; +}; + +#define MIN_INFOS 128 +static kmem_cache_t *_copy_cache = NULL; +static mempool_t *_copy_pool = NULL; + +static int init_copier(void) +{ + _copy_cache = kmem_cache_create("kcopyd-info", + sizeof(struct copy_info), + __alignof__(struct copy_info), + 0, NULL, NULL); + if (!_copy_cache) + return -ENOMEM; + + _copy_pool = mempool_create(MIN_INFOS, mempool_alloc_slab, + mempool_free_slab, _copy_cache); + if (!_copy_pool) { + kmem_cache_destroy(_copy_cache); + return -ENOMEM; + } + + return 0; +} + +static void exit_copier(void) +{ + if (_copy_pool) + mempool_destroy(_copy_pool); + + if (_copy_cache) + kmem_cache_destroy(_copy_cache); +} + +static inline struct copy_info *alloc_copy_info(void) +{ + return mempool_alloc(_copy_pool, GFP_NOIO); +} + +static inline void free_copy_info(struct copy_info *info) +{ + mempool_free(info, _copy_pool); +} + +void copy_complete(struct kcopyd_job *job) +{ + struct copy_info *info = (struct copy_info *) job->context; + + if (info->notify) + info->notify(job->err, info->notify_context); + + free_copy_info(info); + + kcopyd_free_pages(job->nr_pages, job->pages); + + kcopyd_free_job(job); +} + +static void page_write_complete(struct kcopyd_job *job) +{ + struct copy_info *info = (struct copy_info *) job->context; + int i; + + if (info->notify) + info->notify(job->err, info->notify_context); + + free_copy_info(info); + for (i = 0; i < job->nr_pages; i++) + put_page(job->pages[i]); + + kcopyd_free_job(job); +} + +/* + * These callback functions implement the state machine that copies regions. + */ +void copy_write(struct kcopyd_job *job) +{ + struct copy_info *info = (struct copy_info *) job->context; + + if (job->err) { + if (info->notify) + info->notify(job->err, job->context); + + kcopyd_free_job(job); + free_copy_info(info); + return; + } + + job->rw = WRITE; + memcpy(&job->disk, &info->to, sizeof(job->disk)); + job->callback = copy_complete; + + /* + * Queue the write. + */ + kcopyd_io(job); +} + +int kcopyd_write_pages(struct kcopyd_region *to, int nr_pages, + struct page **pages, int offset, kcopyd_notify_fn fn, + void *context) +{ + struct copy_info *info; + struct kcopyd_job *job; + int i; + + /* + * Allocate a new copy_info. + */ + info = alloc_copy_info(); + if (!info) + return -ENOMEM; + + job = kcopyd_alloc_job(); + if (!job) { + free_copy_info(info); + return -ENOMEM; + } + + /* + * set up for the write. + */ + info->notify = fn; + info->notify_context = context; + memcpy(&info->to, to, sizeof(*to)); + + /* Get the pages */ + job->nr_pages = nr_pages; + for (i = 0; i < nr_pages; i++) { + get_page(pages[i]); + job->pages[i] = pages[i]; + } + + job->rw = WRITE; + + memcpy(&job->disk, &info->to, sizeof(job->disk)); + job->offset = offset; + job->callback = page_write_complete; + job->context = info; + + /* + * Trigger job. + */ + kcopyd_io(job); + return 0; +} + +int kcopyd_copy(struct kcopyd_region *from, struct kcopyd_region *to, + kcopyd_notify_fn fn, void *context) +{ + struct copy_info *info; + struct kcopyd_job *job; + + /* + * Allocate a new copy_info. + */ + info = alloc_copy_info(); + if (!info) + return -ENOMEM; + + job = kcopyd_alloc_job(); + if (!job) { + free_copy_info(info); + return -ENOMEM; + } + + /* + * set up for the read. 
+ */ + info->notify = fn; + info->notify_context = context; + memcpy(&info->to, to, sizeof(*to)); + + job->rw = READ; + memcpy(&job->disk, from, sizeof(*from)); + + job->offset = 0; + job->callback = copy_write; + job->context = info; + + /* + * Trigger job. + */ + kcopyd_io(job); + return 0; +} + +/*----------------------------------------------------------------- + * Unit setup + *---------------------------------------------------------------*/ +static struct { + int (*init) (void); + void (*exit) (void); + +} _inits[] = { +#define xx(n) { init_ ## n, exit_ ## n} + xx(pages), + xx(buffers), + xx(jobs), + xx(copier) +#undef xx +}; + +static int _client_count = 0; +static DECLARE_MUTEX(_client_count_sem); + +static int kcopyd_init(void) +{ + const int count = sizeof(_inits) / sizeof(*_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i].init(); + if (r) + goto bad; + } + + start_daemon(); + return 0; + + bad: + while (i--) + _inits[i].exit(); + + return r; +} + +static void kcopyd_exit(void) +{ + int i = sizeof(_inits) / sizeof(*_inits); + + if (stop_daemon()) + DMWARN("Couldn't stop kcopyd."); + + while (i--) + _inits[i].exit(); +} + +void kcopyd_inc_client_count(void) +{ + /* + * What I need here is an atomic_test_and_inc that returns + * the previous value of the atomic... In its absence I lock + * an int with a semaphore. :-( + */ + down(&_client_count_sem); + if (_client_count == 0) + kcopyd_init(); + _client_count++; + + up(&_client_count_sem); +} + +void kcopyd_dec_client_count(void) +{ + down(&_client_count_sem); + if (--_client_count == 0) + kcopyd_exit(); + + up(&_client_count_sem); +} diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/md/kcopyd.h x/drivers/md/kcopyd.h --- x-ref/drivers/md/kcopyd.h 1970-01-01 01:00:00.000000000 +0100 +++ x/drivers/md/kcopyd.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2001 Sistina Software + * + * This file is released under the GPL. + */ + +#ifndef DM_KCOPYD_H +#define DM_KCOPYD_H + +/* + * Needed for the definition of offset_t. + */ +#include +#include + +struct kcopyd_region { + kdev_t dev; + sector_t sector; + sector_t count; +}; + +#define MAX_KCOPYD_PAGES 128 + +struct kcopyd_job { + struct list_head list; + + /* + * Error state of the job. + */ + int err; + + /* + * Either READ or WRITE + */ + int rw; + + /* + * The source or destination for the transfer. + */ + struct kcopyd_region disk; + + int nr_pages; + struct page *pages[MAX_KCOPYD_PAGES]; + + /* + * Shifts and masks that will be useful when dispatching + * each buffer_head. + */ + sector_t offset; + sector_t block_size; + sector_t block_shift; + sector_t bpp_shift; /* blocks per page */ + sector_t bpp_mask; + + /* + * nr_blocks is how many buffer heads will have to be + * displatched to service this job, nr_requested is how + * many have been dispatched and nr_complete is how many + * have come back. + */ + unsigned int nr_blocks; + atomic_t nr_requested; + atomic_t nr_incomplete; + + /* + * Set this to ensure you are notified when the job has + * completed. 'context' is for callback to use. + */ + void (*callback) (struct kcopyd_job * job); + void *context; +}; + +/* + * Low level async io routines. + */ +struct kcopyd_job *kcopyd_alloc_job(void); +void kcopyd_free_job(struct kcopyd_job *job); + +int kcopyd_queue_job(struct kcopyd_job *job); + +/* + * Submit a copy job to kcopyd. This is built on top of the + * previous three fns. 
+ */ +typedef void (*kcopyd_notify_fn) (int err, void *context); + +int kcopyd_copy(struct kcopyd_region *from, struct kcopyd_region *to, + kcopyd_notify_fn fn, void *context); + +int kcopyd_write_pages(struct kcopyd_region *to, int nr_pages, + struct page **pages, int offset, kcopyd_notify_fn fn, + void *context); + +/* + * We only want kcopyd to reserve resources if someone is + * actually using it. + */ +void kcopyd_inc_client_count(void); +void kcopyd_dec_client_count(void); + +#endif diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/buffer.c x/fs/buffer.c --- x-ref/fs/buffer.c 2003-07-17 05:44:46.000000000 +0200 +++ x/fs/buffer.c 2003-07-17 05:44:48.000000000 +0200 @@ -789,6 +789,7 @@ void init_buffer(struct buffer_head *bh, bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_private = private; + bh->b_journal_head = NULL; } void end_buffer_io_async(struct buffer_head * bh, int uptodate) diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/jbd/journal.c x/fs/jbd/journal.c --- x-ref/fs/jbd/journal.c 2003-07-17 05:44:45.000000000 +0200 +++ x/fs/jbd/journal.c 2003-07-17 05:44:48.000000000 +0200 @@ -1803,9 +1803,9 @@ struct journal_head *journal_add_journal if (buffer_jbd(bh)) { /* Someone did it for us! */ - J_ASSERT_BH(bh, bh->b_private != NULL); + J_ASSERT_BH(bh, bh->b_journal_head != NULL); journal_free_journal_head(jh); - jh = bh->b_private; + jh = bh->b_journal_head; } else { /* * We actually don't need jh_splice_lock when @@ -1813,7 +1813,7 @@ struct journal_head *journal_add_journal */ spin_lock(&jh_splice_lock); set_bit(BH_JBD, &bh->b_state); - bh->b_private = jh; + bh->b_journal_head = jh; jh->b_bh = bh; atomic_inc(&bh->b_count); spin_unlock(&jh_splice_lock); @@ -1822,7 +1822,7 @@ struct journal_head *journal_add_journal } jh->b_jcount++; spin_unlock(&journal_datalist_lock); - return bh->b_private; + return bh->b_journal_head; } /* @@ -1855,7 +1855,7 @@ void __journal_remove_journal_head(struc J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); spin_lock(&jh_splice_lock); - bh->b_private = NULL; + bh->b_journal_head = NULL; jh->b_bh = NULL; /* debug, really */ clear_bit(BH_JBD, &bh->b_state); __brelse(bh); diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/device-mapper.h x/include/linux/device-mapper.h --- x-ref/include/linux/device-mapper.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/device-mapper.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DEVICE_MAPPER_H +#define _LINUX_DEVICE_MAPPER_H + +typedef unsigned long sector_t; + +struct dm_target; +struct dm_table; +struct dm_dev; + +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; + +/* + * In the constructor the target parameter will already have the + * table, type, begin and len fields filled in. + */ +typedef int (*dm_ctr_fn) (struct dm_target * target, int argc, char **argv); + +/* + * The destructor doesn't need to free the dm_target, just + * anything hidden ti->private. 
+ */ +typedef void (*dm_dtr_fn) (struct dm_target * ti); + +/* + * The map function must return: + * < 0: error + * = 0: The target will handle the io by resubmitting it later + * > 0: simple remap complete + */ +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh, + int rw, void **map_context); + +/* + * Returns: + * < 0 : error (currently ignored) + * 0 : ended successfully + * 1 : for some reason the io has still not completed (eg, + * multipath target might want to requeue a failed io). + */ +typedef int (*dm_endio_fn) (struct dm_target * ti, + struct buffer_head * bh, int rw, int error, + void *map_context); +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type, + char *result, int maxlen); + +void dm_error(const char *message); + +/* + * Constructors should call these functions to ensure destination devices + * are opened/closed correctly. + * FIXME: too many arguments. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result); +void dm_put_device(struct dm_target *ti, struct dm_dev *d); + +/* + * Information about a target type + */ +struct target_type { + const char *name; + struct module *module; + dm_ctr_fn ctr; + dm_dtr_fn dtr; + dm_map_fn map; + dm_endio_fn end_io; + dm_status_fn status; +}; + +struct dm_target { + struct dm_table *table; + struct target_type *type; + + /* target limits */ + sector_t begin; + sector_t len; + + /* target specific data */ + void *private; + + /* Used to provide an error string from the ctr */ + char *error; +}; + +int dm_register_target(struct target_type *t); +int dm_unregister_target(struct target_type *t); + +#endif /* _LINUX_DEVICE_MAPPER_H */ diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/dm-ioctl.h x/include/linux/dm-ioctl.h --- x-ref/include/linux/dm-ioctl.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/dm-ioctl.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DM_IOCTL_H +#define _LINUX_DM_IOCTL_H + +#include + +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + +/* + * Implements a traditional ioctl interface to the device mapper. + */ + +/* + * All ioctl arguments consist of a single chunk of memory, with + * this structure at the start. If a uuid is specified any + * lookup (eg. for a DM_INFO) will be done on that, *not* the + * name. + */ +struct dm_ioctl { + /* + * The version number is made up of three parts: + * major - no backward or forward compatibility, + * minor - only backwards compatible, + * patch - both backwards and forwards compatible. + * + * All clients of the ioctl interface should fill in the + * version number of the interface that they were + * compiled with. + * + * All recognised ioctl commands (ie. those that don't + * return -ENOTTY) fill out this field, even if the + * command failed. 
+ */ + uint32_t version[3]; /* in/out */ + uint32_t data_size; /* total size of data passed in + * including this struct */ + + uint32_t data_start; /* offset to start of data + * relative to start of this struct */ + + uint32_t target_count; /* in/out */ + uint32_t open_count; /* out */ + uint32_t flags; /* in/out */ + + __kernel_dev_t dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for + * the block device */ +}; + +/* + * Used to specify tables. These structures appear after the + * dm_ioctl. + */ +struct dm_target_spec { + int32_t status; /* used when reading from kernel only */ + uint64_t sector_start; + uint32_t length; + + /* + * Offset in bytes (from the start of this struct) to + * next target_spec. + */ + uint32_t next; + + char target_type[DM_MAX_TYPE_NAME]; + + /* + * Parameter string starts immediately after this object. + * Be careful to add padding after string to ensure correct + * alignment of subsequent dm_target_spec. + */ +}; + +/* + * Used to retrieve the target dependencies. + */ +struct dm_target_deps { + uint32_t count; + + __kernel_dev_t dev[0]; /* out */ +}; + +/* + * If you change this make sure you make the corresponding change + * to dm-ioctl.c:lookup_ioctl() + */ +enum { + /* Top level cmds */ + DM_VERSION_CMD = 0, + DM_REMOVE_ALL_CMD, + + /* device level cmds */ + DM_DEV_CREATE_CMD, + DM_DEV_REMOVE_CMD, + DM_DEV_RELOAD_CMD, + DM_DEV_RENAME_CMD, + DM_DEV_SUSPEND_CMD, + DM_DEV_DEPS_CMD, + DM_DEV_STATUS_CMD, + + /* target level cmds */ + DM_TARGET_STATUS_CMD, + DM_TARGET_WAIT_CMD +}; + +#define DM_IOCTL 0xfd + +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) + +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) +#define DM_DEV_RELOAD _IOWR(DM_IOCTL, DM_DEV_RELOAD_CMD, struct dm_ioctl) +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) +#define DM_DEV_DEPS _IOWR(DM_IOCTL, DM_DEV_DEPS_CMD, struct dm_ioctl) +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) + +#define DM_TARGET_STATUS _IOWR(DM_IOCTL, DM_TARGET_STATUS_CMD, struct dm_ioctl) +#define DM_TARGET_WAIT _IOWR(DM_IOCTL, DM_TARGET_WAIT_CMD, struct dm_ioctl) + +#define DM_VERSION_MAJOR 1 +#define DM_VERSION_MINOR 0 +#define DM_VERSION_PATCHLEVEL 10 +#define DM_VERSION_EXTRA "-ioctl (2003-03-27)" + +/* Status bits */ +#define DM_READONLY_FLAG 0x00000001 +#define DM_SUSPEND_FLAG 0x00000002 +#define DM_EXISTS_FLAG 0x00000004 +#define DM_PERSISTENT_DEV_FLAG 0x00000008 + +/* + * Flag passed into ioctl STATUS command to get table information + * rather than current status. 
+ */ +#define DM_STATUS_TABLE_FLAG 0x00000010 + +#endif /* _LINUX_DM_IOCTL_H */ diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/fs.h x/include/linux/fs.h --- x-ref/include/linux/fs.h 2003-07-17 05:44:46.000000000 +0200 +++ x/include/linux/fs.h 2003-07-17 05:44:48.000000000 +0200 @@ -269,6 +269,7 @@ struct buffer_head { struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_private; /* reserved for b_end_io */ + void *b_journal_head; /* ext3 journal_heads */ unsigned long b_rsector; /* Real buffer location on disk */ int b_elv_sequence; /* for atomic blocks */ diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/jbd.h x/include/linux/jbd.h --- x-ref/include/linux/jbd.h 2003-06-13 22:07:42.000000000 +0200 +++ x/include/linux/jbd.h 2003-07-17 05:44:48.000000000 +0200 @@ -311,7 +311,7 @@ static inline struct buffer_head *jh2bh( static inline struct journal_head *bh2jh(struct buffer_head *bh) { - return bh->b_private; + return bh->b_journal_head; } #define HAVE_JOURNAL_CALLBACK_STATUS diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/mempool.h x/include/linux/mempool.h --- x-ref/include/linux/mempool.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/mempool.h 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,31 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +struct mempool_s; +typedef struct mempool_s mempool_t; + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); + +/* + * A mempool_alloc_t and mempool_free_t that get the memory from + * a slab that is passed in through pool_data. 
+ */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data); +void mempool_free_slab(void *element, void *pool_data); + + +#endif /* _LINUX_MEMPOOL_H */ diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/vmalloc.h x/include/linux/vmalloc.h --- x-ref/include/linux/vmalloc.h 2003-07-17 05:44:43.000000000 +0200 +++ x/include/linux/vmalloc.h 2003-07-17 05:44:48.000000000 +0200 @@ -28,6 +28,7 @@ extern long vread(char *buf, char *addr, extern void vmfree_area_pages(unsigned long address, unsigned long size); extern int vmalloc_area_pages(unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot); +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size); /* * Allocate any pages diff -urNp --exclude CVS --exclude BitKeeper x-ref/kernel/ksyms.c x/kernel/ksyms.c --- x-ref/kernel/ksyms.c 2003-07-17 05:44:46.000000000 +0200 +++ x/kernel/ksyms.c 2003-07-17 05:44:48.000000000 +0200 @@ -121,6 +121,7 @@ EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(__vmalloc); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vcalloc); #ifndef CONFIG_DISCONTIGMEM EXPORT_SYMBOL(contig_page_data); EXPORT_SYMBOL(mem_map); diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/Makefile x/mm/Makefile --- x-ref/mm/Makefile 2003-07-17 05:44:45.000000000 +0200 +++ x/mm/Makefile 2003-07-17 05:44:48.000000000 +0200 @@ -9,12 +9,12 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o memory.o page_alloc.o +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o bigpages.o + shmem.o bigpages.o mempool.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PROC_MM) += proc_mm.o diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/mempool.c x/mm/mempool.c --- x-ref/mm/mempool.c 1970-01-01 01:00:00.000000000 +0100 +++ x/mm/mempool.c 2003-07-17 05:44:48.000000000 +0200 @@ -0,0 +1,299 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include +#include +#include +#include + +struct mempool_s { + spinlock_t lock; + int min_nr; /* nr of elements at *elements */ + int curr_nr; /* Current nr of elements at *elements */ + void **elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +}; + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. 
Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr < pool->min_nr) { + while (pool->curr_nr > new_min_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) + add_element(pool, element); + else + kfree(element); /* Raced */ + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). 
+ */ +void mempool_destroy(mempool_t *pool) +{ + if (pool->curr_nr != pool->min_nr) + BUG(); /* There were outstanding elements */ + free_pool(pool); +} + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, int gfp_mask) +{ + void *element; + unsigned long flags; + int curr_nr; + DECLARE_WAITQUEUE(wait, current); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +repeat_alloc: + element = pool->alloc(gfp_nowait, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full then try harder + * to allocate an element: + */ + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. + */ + wakeup_bdflush(); + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (gfp_mask == gfp_nowait) + return NULL; + + run_task_queue(&tq_disk); + + add_wait_queue_exclusive(&pool->wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&pool->lock, flags); + curr_nr = pool->curr_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + if (!curr_nr) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(&pool->wait, &wait); + + goto repeat_alloc; +} + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} + +/* + * A commonly used alloc and free fn. 
+ */
+void *mempool_alloc_slab(int gfp_mask, void *pool_data)
+{
+	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	return kmem_cache_alloc(mem, gfp_mask);
+}
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	kmem_cache_free(mem, element);
+}
+
+
+EXPORT_SYMBOL(mempool_create);
+EXPORT_SYMBOL(mempool_resize);
+EXPORT_SYMBOL(mempool_destroy);
+EXPORT_SYMBOL(mempool_alloc);
+EXPORT_SYMBOL(mempool_free);
+EXPORT_SYMBOL(mempool_alloc_slab);
+EXPORT_SYMBOL(mempool_free_slab);
diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/vmalloc.c x/mm/vmalloc.c
--- x-ref/mm/vmalloc.c	2003-07-17 05:44:43.000000000 +0200
+++ x/mm/vmalloc.c	2003-07-17 05:44:48.000000000 +0200
@@ -377,3 +377,22 @@ finished:
 	read_unlock(&vmlist_lock);
 	return buf - buf_start;
 }
+
+void *vcalloc(unsigned long nmemb, unsigned long elem_size)
+{
+	unsigned long size;
+	void *addr;
+
+	/*
+	 * Check that we're not going to overflow.
+	 */
+	if (nmemb > (ULONG_MAX / elem_size))
+		return NULL;
+
+	size = nmemb * elem_size;
+	addr = vmalloc(size);
+	if (addr)
+		memset(addr, 0, size);
+
+	return addr;
+}
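
The dm_suspend/dm_swap_table/dm_resume calls in drivers/md/dm.c above are only ever seen separately in the patch, so here is a minimal kernel-side sketch of how they are meant to fit together. It is not part of the patch; the wrapper name my_replace_table and its error handling are illustrative, and it uses only the interfaces declared in drivers/md/dm.h.

#include "dm.h"

/*
 * Swap in a new table on a live device.  dm_suspend() blocks new io,
 * defers it, and waits for pending io; dm_swap_table() requires the
 * device to be suspended; dm_resume() requeues the deferred io.
 */
static int my_replace_table(struct mapped_device *md, struct dm_table *t)
{
	int r;

	r = dm_suspend(md);
	if (r)
		return r;

	r = dm_swap_table(md, t);
	if (r)
		/* device is left suspended; the caller decides what next */
		return r;

	return dm_resume(md);
}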
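
drivers/md/kcopyd.h above only declares the copy entry points, so the following sketch shows the expected calling convention: bump the client count so kcopyd reserves its pages, buffers and daemon, describe source and destination as kcopyd_regions, and learn about completion through the notify callback. It is not from the patch; my_copy_extent, my_copy_done and the synchronous wait are illustrative (in-kernel users would normally stay asynchronous and record err from the callback).

#include <linux/completion.h>
#include "kcopyd.h"

/* notify callback: runs from kcopyd's context when the copy finishes */
static void my_copy_done(int err, void *context)
{
	complete((struct completion *) context);
}

/*
 * Copy nr_sectors from one device to another and wait for the result.
 * The region must be small enough to fit kcopyd's page reserve
 * (at most MAX_KCOPYD_PAGES worth of data per job).
 */
static int my_copy_extent(kdev_t from_dev, sector_t from_sector,
			  kdev_t to_dev, sector_t to_sector,
			  sector_t nr_sectors)
{
	struct kcopyd_region from, to;
	struct completion done;
	int r;

	kcopyd_inc_client_count();	/* make kcopyd allocate its resources */

	from.dev = from_dev;
	from.sector = from_sector;
	from.count = nr_sectors;

	to.dev = to_dev;
	to.sector = to_sector;
	to.count = nr_sectors;

	init_completion(&done);
	r = kcopyd_copy(&from, &to, my_copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_dec_client_count();
	return r;
}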
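
include/linux/device-mapper.h above is the entire target-writer interface, so a worked example is useful. The module below is illustrative only and is not shipped with the patch; "failall" is a made-up target name. Its map() fails every io, which is the smallest possible exercise of the ctr/dtr/map/end_io hooks, of ti->error, and of dm_register_target().

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>			/* struct buffer_head */
#include <linux/device-mapper.h>

static int failall_ctr(struct dm_target *ti, int argc, char **argv)
{
	if (argc) {
		ti->error = "failall: no arguments expected";
		return -EINVAL;
	}
	return 0;
}

static void failall_dtr(struct dm_target *ti)
{
	/* nothing was allocated in the ctr */
}

static int failall_map(struct dm_target *ti, struct buffer_head *bh,
		       int rw, void **map_context)
{
	return -EIO;	/* < 0: error, the core fails the buffer */
}

static int failall_end_io(struct dm_target *ti, struct buffer_head *bh,
			  int rw, int error, void *map_context)
{
	return 0;	/* ended successfully, nothing to clean up */
}

static struct target_type failall_target = {
	.name   = "failall",
	.module = THIS_MODULE,
	.ctr    = failall_ctr,
	.dtr    = failall_dtr,
	.map    = failall_map,
	.end_io = failall_end_io,
};

static int __init failall_init(void)
{
	return dm_register_target(&failall_target);
}

static void __exit failall_exit(void)
{
	if (dm_unregister_target(&failall_target))
		printk(KERN_ERR "failall: unregister failed\n");
}

module_init(failall_init);
module_exit(failall_exit);
MODULE_LICENSE("GPL");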
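
mm/mempool.c is new infrastructure in this patch, used above by dm.c (io_pool) and kcopyd.c (job and copy_info pools). The sketch below, with placeholder names (my_io, my-io cache), shows the intended lifecycle using the slab helpers exported at the end of mempool.c.

#include <linux/slab.h>
#include <linux/mempool.h>

struct my_io {				/* placeholder element type */
	int rw;
	void *context;
};

static kmem_cache_t *my_io_cache;
static mempool_t *my_io_pool;

static int my_pool_init(void)
{
	my_io_cache = kmem_cache_create("my-io", sizeof(struct my_io),
					0, 0, NULL, NULL);
	if (!my_io_cache)
		return -ENOMEM;

	/* 16 preallocated elements guarantee progress under memory pressure */
	my_io_pool = mempool_create(16, mempool_alloc_slab,
				    mempool_free_slab, my_io_cache);
	if (!my_io_pool) {
		kmem_cache_destroy(my_io_cache);
		return -ENOMEM;
	}
	return 0;
}

static void my_pool_exit(void)
{
	/* every element must have been mempool_free()d by now */
	mempool_destroy(my_io_pool);
	kmem_cache_destroy(my_io_cache);
}

/*
 * In the io path: with GFP_NOIO this never returns NULL from process
 * context; it sleeps on the pool's waitqueue until an element returns.
 */
static struct my_io *my_alloc_io(void)
{
	return mempool_alloc(my_io_pool, GFP_NOIO);
}

static void my_free_io(struct my_io *io)
{
	mempool_free(io, my_io_pool);
}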